## Setup Modello

In [None]:
%%capture
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Tokenizer and model weights are read from the same local checkpoint directory.
tok_path = "./models/Llama2-13B-nous-hermes"
model_path = "./models/Llama2-13B-nous-hermes"
tokenizer = AutoTokenizer.from_pretrained(tok_path)
# Load the causal LM with 8-bit loading enabled and automatic device placement;
# float16 is also requested as the torch dtype.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_8bit = True,
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                            )
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Text-generation pipeline built from the model and tokenizer loaded above.
# NOTE(review): `do_sample` is not set here; in transformers, `temperature` and
# `top_p` presumably only take effect when sampling is enabled -- confirm that
# greedy decoding is what is intended.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.0,
    top_p=0.95,
    repetition_penalty=1.15,
)

# LangChain wrapper around the pipeline; this is the LLM object used by every later cell.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Embedding model used later to build the Chroma vector store; it is placed on
# device "cuda:1".
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda:1"})

load INSTRUCTOR_Transformer
max_seq_length  512


## Splitto un documento di MarkDown in capitoli

In [None]:
from langchain.document_loaders import TextLoader

# Load the source text file; the splitting cell below reads the text from
# gpu_text[0].page_content.
loader = TextLoader("./Source_documents/GPUs.txt")
gpu_text = loader.load()
#gpu_text = "".join([d.page_content for d in docs])

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import TokenTextSplitter

# MarkdownHeaderTextSplitter lets us declare the headers (chapters, sub-chapters, ...)
# and creates one document per header, but only if the headers are clearly marked
# with the specific symbols.

# Split on "#" headers and store the header text under the "GPU_name" metadata key.
headers= [("#", "GPU_name")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers)

markdown_splits = markdown_splitter.split_text(gpu_text[0].page_content)
# The documents obtained above may be too large for the model, so they are split again.
# Applying further splitters keeps the metadata of the original documents, so even if a
# header's document is divided into many pieces each one still carries that header
# in its own metadata.

token_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)
documents = token_splitter.split_documents(markdown_splits)

In [None]:
# Funzione per Stampare i documenti
def stampa(docs):
    """Print each document's metadata and page content, followed by a separator line."""
    for doc in docs:
        print(
            doc.metadata,
            doc.page_content,
            "------------------------------------",
            sep="\n",
        )

In [None]:
# Print every chunk produced by the splitters together with its header metadata.
stampa(documents)

{'GPU_name': 'GeForce RTX 4090'}
The GeForce RTX 4090 is an enthusiast-class graphics card by NVIDIA, launched on September 20th, 2022. Built on the 5 nm process, and based on the AD102 graphics processor, in its AD102-300-A1 variant, the card supports DirectX 12 Ultimate. This ensures that all modern games will run on GeForce RTX 4090. Additionally, the DirectX 12 Ultimate capability guarantees support for hardware-raytracing, variable-rate shading and more, in upcoming video games. The AD102 graphics processor is a large chip with a die area of 609 mm² and 76,300 million transistors. Unlike the fully unlocked RTX TITAN Ada, which uses the same GPU but has all 18432 shaders enabled, NVIDIA has disabled some shading units on the GeForce RTX 4090 to reach the product's target shader count. It features 16384 shading units, 512 texture mapping units, and 176 ROPs. Also included are 512 tensor cores which help improve the speed of machine learning applications
-----------------------------

## Uso Pydantic per estrarre info formattate da documenti

In [None]:
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from pydantic import BaseModel, Field, validator
from langchain.prompts import PromptTemplate, ChatPromptTemplate

class GPU(BaseModel):
    # Structured record of the GPU facts the LLM is asked to extract from a text chunk.
    # Every field declares a default, so a value the LLM cannot find still yields a
    # valid object. (Kept as comments rather than a class docstring: a docstring would
    # be copied into the generated JSON schema and therefore into the prompt.)
    name: str = Field(description="name of the GPU (graphics card) model", default="")
    memory: int = Field(description="number of Gigabytes (GB) of memory of the gpu", default=0,
                       enum=[8,10,12,24])
    price: float = Field(description="price of the gpu, just the number", default=0)
    rtx_support: bool = Field(description="Whether the gpu supports ray-tracing (RTX) or not, 0 for False and 1 for True",
                            enum=[0,1], default=0)

    @staticmethod
    def field_names():
        """Return the field names as a list-like string, ready to be pasted into a prompt.

        Declared as a static method so that it works both as ``GPU.field_names()``
        and when called on an instance.
        """
        return "['name','memory','price','rtx_support']"

    # Validator functions could be added to check the field values returned by the LLM;
    # type checking the values is pointless, however, since the parser already does it.

In [None]:
# The formatting instructions can be customised; normally the parser defaults from
# "parser.get_format_instructions()" are used.
# NOTE(review): the prompt built in the next cell passes parser.get_format_instructions(),
# not this string -- confirm which one is intended.
# Fixes: the first example schema had an unbalanced trailing "}" and the ``` fence
# around the output schema was never closed.
format_instructions = """
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema, The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
Another example for the schema {"properties":{"baa": {"title": "Baa", "description": "yes or no", "enum": ["True", "False"], "type": "boolean"}}, "required": ["baa"]}
the object {"baa": "True"} is a well-formatted instance of the schema, the object {"baa": True} is not.

Here is the output schema:
```
{"properties": {"name": {"title": "Name", "description": "name of the GPU (graphics card) model", "type": "string"}, "memory": {"title": "Memory", "description": "number of Gigabytes (GB) of memory of the gpu", "enum": [8, 10, 12, 24], "type": "integer"}, "price": {"title": "Price", "description": "price of the gpu, just the number", "type": "number"}, "rtx_support": {"title": "Rtx Support", "description": "Whether the gpu supports ray-tracing (RTX) or not, assume not if not mentioned", "enum": ["True", "False"], "type": "boolean"}}, "required": ["name", "memory", "price", "rtx_support"]}
```
"""

In [None]:
# Template used to extract the information the first time a GPU is seen.
gpu_template = """
The following is a text about a GPU:

text: {text}

From the text extract the following information: {info}

{format_instructions}
If you can't find a certain field information just use the specified default value for that field

Formatted json object:
"""
# Parser that turns the LLM's JSON answer into a GPU object.
parser = PydanticOutputParser(pydantic_object=GPU)
# Only {text} is left to fill at call time: the format instructions (the parser's
# defaults) and the field-name list are bound here as partial variables.
prompt = PromptTemplate(
    template= gpu_template,
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions(), "info": GPU.field_names()}
)

In [None]:
# Template used to refine the information already extracted for a GPU using new text.
gpu_refine = """
The following is a text about the {gpu_name} GPU:

text: {text}

The following is a json object that contains information about the GPU:

{previous_answer}

Use the text provided ONLY IF NECESSARY to correct or complete the given json object.

Correct json object:
"""
# All three placeholders are supplied at call time; no format instructions are included.
prompt_refine = PromptTemplate(
    template= gpu_refine,
    input_variables=["text", "gpu_name", "previous_answer"],
)

In [None]:
# Funzione che parsa un doc per estrarre le info, se il parsing
# è sbagliato fa un tentativo di fixing tramite OutputFixingParser

from langchain.output_parsers import OutputFixingParser

def process_gpu(llm, doc, i, parser, fixer, input):
    """Run the LLM on one prompt and parse its answer into a structured object.

    The raw LLM output is parsed with ``parser``; if that fails, one repair
    attempt is made with ``fixer``.

    Args:
        llm: Callable that takes the prompt text and returns the raw LLM output.
        doc: Document whose ``metadata["GPU_name"]`` is used for logging.
        i: Index of the document, used for logging only.
        parser: Object with a ``parse(text)`` method returning the parsed object.
        fixer: Fallback object with the same ``parse(text)`` interface.
        input: Prompt value exposing the prompt as ``.text`` (the name shadows the
            builtin but is kept for backward compatibility with keyword callers).

    Returns:
        ``(parsed_object, json_string)`` on success, ``(None, None)`` when both
        the parser and the fixer fail.
    """
    nome_gpu = doc.metadata["GPU_name"]
    # Use the LLM passed in by the caller rather than a notebook-level global.
    llm_output_iniziale = llm(input.text)

    # Try the plain parser first, then the fixing parser.
    for candidate in (parser, fixer):
        try:
            JSON = candidate.parse(llm_output_iniziale)
            output = JSON.json()
        except Exception:
            # Parsing failed with this candidate; fall through to the next one.
            continue
        print(f"{i}) {nome_gpu}: \n{output} \n-------------------------")
        return JSON, output

    # Report the raw LLM output: no parsed output exists when both attempts fail.
    print(f"ERRORE: l'oggetto {i} ha parsing errato: \n {llm_output_iniziale}")
    return None, None

# Funzione che iterativamente scansiona vari documenti, se una GPU non l'ha mai vista ne estrae
# il parsing tramite il 1°template, se l'ha già vista recupera l'ultimo parsing fatto di essa
# da "llm_output_history" e lo raffina con il nuovo documento tramite 2°template

def scan_gpus(llm, docs, GPU, prompts):
    """Extract one structured record per GPU from a sequence of documents.

    The first document seen for a GPU is processed with the extraction prompt
    (``prompts[0]``). Every later document about the same GPU is processed with
    the refine prompt (``prompts[1]``), which receives the latest JSON obtained
    for that GPU, and the refined record replaces the previous one.

    Args:
        llm: LLM used both for generation and for the output-fixing parser.
        docs: Documents carrying the GPU name in ``metadata["GPU_name"]``.
        GPU: Pydantic model class describing the record to extract.
        prompts: ``[extraction_prompt, refine_prompt]``.

    Returns:
        List with the latest parsed record of each GPU, in order of first appearance.
    """
    res = []

    llm_output_history = dict()  # latest JSON string obtained for each GPU name
    res_index = dict()           # position in `res` of each GPU's record

    # Plain parser, built from the model class passed in by the caller.
    extractor = PydanticOutputParser(pydantic_object=GPU)
    # Parser that tries to repair malformed output; it wraps the same extractor
    # instead of relying on a notebook-level `parser` global.
    fixer = OutputFixingParser.from_llm(parser=extractor, llm=llm)

    for i, d in enumerate(docs):

        current_gpu = d.metadata["GPU_name"]

        if current_gpu not in llm_output_history:
            # Extraction prompt: get the information from scratch.
            prompt_value = prompts[0].format_prompt(text=d.page_content)
        else:
            # Refine prompt: improve the previous answer stored in the history.
            print(f"...Raffinando info su {current_gpu}...")
            prompt_value = prompts[1].format_prompt(text=d.page_content,
                                                    gpu_name=current_gpu,
                                                    previous_answer=llm_output_history[current_gpu]
                                                    )

        JSON, llm_output = process_gpu(llm, d, i, extractor, fixer, prompt_value)
        if JSON is None:
            # Parsing failed: keep whatever was known about this GPU so far.
            continue

        if current_gpu in res_index:
            # Replace this GPU's own record, even if documents about other GPUs
            # were processed in between.
            res[res_index[current_gpu]] = JSON
        else:
            res_index[current_gpu] = len(res)
            res.append(JSON)
        llm_output_history[current_gpu] = llm_output
    return res

In [None]:
# Order matters: scan_gpus uses prompts[0] for first extraction and prompts[1] for refinement.
prompts = [prompt, prompt_refine]
results = scan_gpus(local_llm, documents, GPU, prompts)

0) GeForce RTX 4090: 
{"name": "GeForce RTX 4090", "memory": 24, "price": 1199.0, "rtx_support": true} 
-------------------------
...Raffinando info su GeForce RTX 4090...
output da raffinare: 
{"name": "GeForce RTX 4090", "memory": 24, "price": 1199.0, "rtx_support": true} 

1) GeForce RTX 4090: 
{"name": "GeForce RTX 4090", "memory": 24, "price": 1599.0, "rtx_support": true} 
-------------------------
2) GeForce RTX 3080: 
{"name": "GeForce RTX 3080", "memory": 10, "price": 699.99, "rtx_support": true} 
-------------------------
...Raffinando info su GeForce RTX 3080...
output da raffinare: 
{"name": "GeForce RTX 3080", "memory": 10, "price": 699.99, "rtx_support": true} 

3) GeForce RTX 3080: 
{"name": "GeForce RTX 3080", "memory": 10, "price": 699.99, "rtx_support": true} 
-------------------------
4) GeForce GTX 1070 Ti: 
{"name": "GeForce GTX 1070 Ti", "memory": 8, "price": 0.0, "rtx_support": false} 
-------------------------
...Raffinando info su GeForce GTX 1070 Ti...
output d

In [None]:
# Display the list of records returned by scan_gpus.
results

[GPU(name='GeForce RTX 4090', memory=24, price=1599.0, rtx_support=True),
 GPU(name='GeForce RTX 3080', memory=10, price=699.99, rtx_support=True),
 GPU(name='GeForce GTX 1070 Ti', memory=8, price=399.0, rtx_support=False)]

## Test Singoli dei parser

In [None]:
# Parser test: parse a hand-written JSON string with the Pydantic parser.
out = """
{
    "name": "GeForce RTX 3080",
    "memory": 10,
    "price": 100,
    "rtx_support": 1
}
"""
parser.parse(out)

GPU(name='GeForce RTX 3080', memory=10, price=100.0, rtx_support=True)

In [None]:
# Single test: run the extraction prompt on one chunk and show the raw LLM output.
# Note: `input` shadows the Python builtin of the same name and is reused by a later cell.
input = prompt.format_prompt(text=documents[4].page_content)
out = local_llm(input.text)
print(out)



{
    "name": "GeForce GTX 1070 Ti",
    "memory": 8,
    "price": 0,
    "rtx_support": 0
}


In [None]:
# Refine test: feed the previous raw answer (`out`) together with another chunk
# to the refine prompt and show the raw LLM output.
input2 = prompt_refine.format_prompt(text=documents[5].page_content,
                                     previous_answer = out,
                                     gpu_name = documents[5].metadata["GPU_name"]
                                     )
out2 = local_llm(input2.text)
print(out2)


{
    "name": "GeForce GTX 1070 Ti",
    "memory": 8,
    "price": 399,
    "rtx_support": 0
}


In [None]:
from langchain.output_parsers import OutputFixingParser

# Wrap the Pydantic parser in an OutputFixingParser backed by the local LLM and
# parse the raw output `out` from the single-test cell.
new_parser = OutputFixingParser.from_llm(parser=parser, llm=local_llm)
new_parser.parse(out)

In [None]:
from langchain.output_parsers import RetryWithErrorOutputParser

# Same check with RetryWithErrorOutputParser, which also receives the prompt.
# `input` here is the prompt value created in the single-test cell, not the builtin.
fix_parser = RetryWithErrorOutputParser.from_llm(parser=parser, llm=local_llm)
fix_parser.parse_with_prompt(out, input)

## Arricchisco i metadata dei documenti con i dati estratti

In [None]:
import json as js

docs = documents

# Index the extracted records by GPU name once, instead of re-serialising every
# result for every document. The first record wins if a name appears twice.
meta_by_name = {}
for r in results:
    record = js.loads(r.json())
    meta_by_name.setdefault(record["name"], record)

# Build exactly one metadata entry per document, in document order, so that the
# list stays aligned with `docs` when the two are zipped together afterwards.
# A document with no matching record (or one that no longer has a "GPU_name"
# key) keeps its current metadata.
# NOTE(review): matching relies on the extracted "name" being identical to the
# header text stored under "GPU_name" -- confirm this holds for the source file.
new_meta = [
    meta_by_name.get(d.metadata.get("GPU_name"), d.metadata)
    for d in docs
]

In [None]:
# Replace each document's metadata with the extracted record; documents and records
# are paired by position, so new_meta must be in the same order as docs.
for d,meta in zip(docs, new_meta):
    d.metadata = meta

In [None]:
stampa(docs) # Now every document carries the richer metadata

{'name': 'GeForce RTX 4090', 'memory': 24, 'price': 1599.0, 'rtx_support': True}
The GeForce RTX 4090 is an enthusiast-class graphics card by NVIDIA, launched on September 20th, 2022. Built on the 5 nm process, and based on the AD102 graphics processor, in its AD102-300-A1 variant, the card supports DirectX 12 Ultimate. This ensures that all modern games will run on GeForce RTX 4090. Additionally, the DirectX 12 Ultimate capability guarantees support for hardware-raytracing, variable-rate shading and more, in upcoming video games. The AD102 graphics processor is a large chip with a die area of 609 mm² and 76,300 million transistors. Unlike the fully unlocked RTX TITAN Ada, which uses the same GPU but has all 18432 shaders enabled, NVIDIA has disabled some shading units on the GeForce RTX 4090 to reach the product's target shader count. It features 16384 shading units, 512 texture mapping units, and 176 ROPs. Also included are 512 tensor cores which help improve the speed of machine lea

## Self-Querying permette ad un LLM di creare filtri per Retrieval dei documenti usando i metadata

In [None]:
# First build the Chroma DB from the documents, embedded with the Instructor model.
from langchain.vectorstores import Chroma
db = Chroma.from_documents(docs, embedding=instructor_embeddings)

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# This is the metadata description the LLM will see and use to build the filters.
# The attribute names match the keys of the records stored in each document's metadata.

metadata_field_info = [
    AttributeInfo(
        name="name",
        description="The name of the GPU",
        type="string",
    ),
    AttributeInfo(
        name="memory",
        description="number of gigabytes (GB) of memory",
        type="int",
    ),
    AttributeInfo(
        name="price",
        description="price of the GPU",
        type="float",
    ),
    AttributeInfo(
        name="rtx_support",
        description="Whether the GPU supports ray-tracing (RTX) or not",
        type="boolean",
    ),
]
# Self-query retriever over the Chroma DB; the third argument is the description
# of the document contents given to the LLM.
SQ_retriever = SelfQueryRetriever.from_llm(local_llm, db, "Documents about different Graphics Cards (GPUs)",
                                        metadata_field_info, verbose=True,
                                        #search_kwargs = {'k':3},
                                       )

L'LLM crea una nuova query che viene usata
dal retriever normale per fare cosine similarity. Questo ritorna una serie di documenti
che vengono poi filtrati dal filtro creato dall'LLM.
Il numero di documenti che voglio mi restituisca è specificato da 'k' in search_kwargs.

In [None]:
# SelfQueryRetriever reports the rewritten query and the metadata filter the LLM produced.

res = SQ_retriever.get_relevant_documents("what are the GPUs that cost less than 1000 dollars and support ray tracing?")
for r in res: print(r.metadata)



query='gpu rtx' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.LT: 'lt'>, attribute='price', value=1000), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='rtx_support', value=True)]) limit=None


In [None]:
# For comparison: the plain retriever, which only does cosine similarity, would
# return all of these results when given the LLM-generated query "gpu rtx".
x = db.as_retriever(search_kwargs = {'k':10}).get_relevant_documents("gpu rtx")
for xx in x:   print(xx.metadata)

Number of requested results 10 is greater than number of elements in index 6, updating n_results = 6


{'name': 'GeForce RTX 3080', 'memory': 10, 'price': 699.99, 'rtx_support': False}
{'name': 'GeForce RTX 4090', 'memory': 24, 'price': 1599.0, 'rtx_support': True}
{'name': 'GeForce RTX 4090', 'memory': 24, 'price': 1599.0, 'rtx_support': True}
{'name': 'GeForce RTX 3080', 'memory': 10, 'price': 699.99, 'rtx_support': False}
{'name': 'GeForce GTX 1070 Ti', 'memory': 8, 'price': 399.0, 'rtx_support': False}
{'name': 'GeForce GTX 1070 Ti', 'memory': 8, 'price': 399.0, 'rtx_support': False}


## Test con questo SelfQuery retriever all'interno di una QA Chain

Servirebbe un Fixer per il Parser che genera i filtri, come quello che
già c'è per correggere Pydantic Parser.

In [None]:
from langchain.chains import RetrievalQA

# Question that needs both a metadata filter (price, ray tracing) and document content.
query = """what are the GPUs that cost less than 1000 dollars and support ray tracing
and how many transistors do they have?"""

# "stuff" QA chain that retrieves its context through the self-query retriever.
qa_chain = RetrievalQA.from_chain_type(llm=local_llm,
                                    chain_type="stuff",
                                    retriever= SQ_retriever,
                                    verbose = True
                                    )

In [None]:
# Run the QA chain on the query defined above.
qa_chain(query)



[1m> Entering new  chain...[0m
query='cheap rtx' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.LT: 'lt'>, attribute='price', value=1000), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='rtx_support', value=True)]) limit=1

[1m> Finished chain.[0m


{'query': 'what are the GPUs that cost less than 1000 dollars and support ray tracing \nand how many transistors do they have?',
 'result': ' There are currently no GPUs that cost less than $1000 and support ray tracing.'}

In [None]:
# Inspect which documents the self-query retriever returns for the QA query.
# The original referenced an undefined `retriever` and printed `x.metadata` (the
# list) instead of each document's metadata.
# NOTE(review): SQ_retriever is assumed to be the retriever meant here, since it
# is the one used by the QA chain -- confirm.
x = SQ_retriever.get_relevant_documents(query)
for xx in x: print(xx.metadata)