Testo vari modelli di Embeddings sulle frasi date dal cliente

In [None]:
import pandas as pd
from langchain.docstore.document import Document

# Load the customer-provided sentences used to compare the embedding models.
data = pd.read_csv("./Source_documents/sample_semantic_similarity.csv")


def _row_to_document(index, row):
    """Wrap one CSV row in a Document, keeping its row number (index + 1) and topic."""
    row_metadata = {"row": index + 1, "topic": row["topic"]}
    return Document(page_content=row["sentence"], metadata=row_metadata)


# Turn the rows into documents so that vector databases can be built from them
# and retrieval can be used to test the embeddings of the rows.
docs = [_row_to_document(index, row) for index, row in data.iterrows()]

## Instructor Embeddings

In [None]:
%%capture
from langchain.embeddings import HuggingFaceInstructEmbeddings

# LangChain wrapper around the Instructor-XL model, placed on the second GPU.
instructor_model_kwargs = {"device": "cuda:1"}
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs=instructor_model_kwargs,
)

In [None]:
from InstructorEmbedding import INSTRUCTOR

# Load the same Instructor-XL checkpoint directly (outside LangChain) so that
# sentences can be encoded by hand in the similarity check below.
instructor_checkpoint = "hkunlp/instructor-xl"
model = INSTRUCTOR(instructor_checkpoint)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
from scipy import spatial

# The rows are read in groups of three starting at rows 0, 3, 6 and 9:
# two sentences (S1, S2) followed by a third one (T). For every group print the
# pairwise cosine similarity, as a rounded percentage, between the three encodings.
for start in (0, 3, 6, 9):
    # Each sentence is encoded on its own, in S1, S2, T order.
    s1, s2, t = (
        model.encode([data.iloc[start + offset]["sentence"]])
        for offset in range(3)
    )
    labelled_pairs = (
        ("S1-S2", s1, s2),
        ("S1-T", s1, t),
        ("S2-T", s2, t),
    )
    for label, left, right in labelled_pairs:
        # scipy returns the cosine *distance*; 1 - distance is the similarity.
        sim = 1 - spatial.distance.cosine(left[0], right[0])
        print(f"{label}: {round(sim*100)}%")
    print("------------------------------------")

S1-S2: 88%
S1-T: 69%
S2-T: 70%
------------------------------------
S1-S2: 99%
S1-T: 83%
S2-T: 83%
------------------------------------
S1-S2: 75%
S1-T: 71%
S2-T: 78%
------------------------------------
S1-S2: 80%
S1-T: 80%
S2-T: 81%
------------------------------------


In [None]:
# Build a Chroma vector store from the sentence documents (a new store is
# created here; nothing is loaded from disk).
from langchain.vectorstores import Chroma

# `from_documents` receives the embedding model through `embedding`, as in every
# other section of this notebook. The previous `embedding_function=` keyword is
# the name used by the Chroma constructor, not by this factory method.
# NOTE(review): depending on the LangChain version the misnamed keyword is either
# rejected or silently dropped (leaving the store on Chroma's default embedding) - confirm.
# A dedicated collection name keeps these vectors apart from the stores built
# for the other embedding models later in the notebook.
db = Chroma.from_documents(
    docs,
    embedding=instructor_embeddings,
    collection_name="sentences_instructor",
)

# The base retriever returns 12 contexts; they are filtered afterwards (see below).
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# A ContextualCompressionRetriever accepts different kinds of "compressors" that
# filter the contexts returned by the base retriever. One of them is
# EmbeddingsFilter, which applies a similarity threshold: contexts that are not
# similar enough to the query are discarded.

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.80
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=instructor_embeddings,
)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter,
)

In [None]:
# Use every sentence as a query and print which rows pass the similarity filter.
for index, row in data.iterrows():
    # Read the sentence from the row yielded by iterrows instead of going back
    # through `data.iloc[index]`: `index` is an index label, which matches the
    # positional index only while the frame keeps its default 0..n-1 index.
    query = row["sentence"]
    result = compression_retriever.get_relevant_documents(query)
    # Each returned document carries the row number it was built from.
    result_rows = sorted(doc.metadata["row"] for doc in result)
    print(f"Query Riga: {index+1}")
    print(f"Righe ritornate: {result_rows} \n")

Query Riga: 1
Righe ritornate: [1, 2] 

Query Riga: 2
Righe ritornate: [1, 2] 

Query Riga: 3
Righe ritornate: [3, 9] 

Query Riga: 4
Righe ritornate: [4, 5, 6] 

Query Riga: 5
Righe ritornate: [4, 5, 6] 

Query Riga: 6
Righe ritornate: [4, 5, 6] 

Query Riga: 7
Righe ritornate: [7] 

Query Riga: 8
Righe ritornate: [8] 

Query Riga: 9
Righe ritornate: [3, 9] 

Query Riga: 10
Righe ritornate: [10] 

Query Riga: 11
Righe ritornate: [11, 12] 

Query Riga: 12
Righe ritornate: [11, 12] 



## OpenAI Embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): the API key is blank in this notebook; presumably a real key has
# to be supplied (or picked up from the environment) before running - confirm.
openai_key = ""

# Separate embedding objects for the document-side and query-side search
# models, plus one that relies on the library's default model.
openai_doc = OpenAIEmbeddings(
    model="text-search-davinci-doc-001",
    openai_api_key=openai_key,
)
openai_query = OpenAIEmbeddings(
    model="text-search-davinci-query-001",
    openai_api_key=openai_key,
)
openaiEmb = OpenAIEmbeddings(openai_api_key=openai_key)

In [None]:
# Build a Chroma vector store for the OpenAI document embeddings (a new store is
# created here; nothing is loaded from disk).
from langchain.vectorstores import Chroma

# Every section of this notebook builds its own store. Giving each one a
# dedicated collection name keeps vectors produced by different models apart.
# NOTE(review): with the default collection name, in-memory Chroma stores created
# in the same kernel may end up sharing one collection - confirm for the installed version.
db = Chroma.from_documents(
    docs,
    embedding=openai_doc,
    collection_name="sentences_openai",
)
# Base retriever: returns 12 contexts, to be filtered by the similarity threshold.
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# Similarity-threshold filter on top of the OpenAI-backed base retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.99
embeddings_filter_OpenAI = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=openai_doc,
)
compression_retriever_OpenAI = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter_OpenAI,
)

In [None]:
# Only the first sentence is queried here; the loop over all rows used in the
# other sections is left out.
# NOTE(review): presumably because of the API rate limit visible in the recorded output - confirm.
first_sentence = data.iloc[0]["sentence"]
query = first_sentence
result = compression_retriever_OpenAI.get_relevant_documents(query)
# Each returned document carries the row number it was built from.
result_rows = sorted(doc.metadata["row"] for doc in result)
print(f"Righe ritornate: {result_rows} \n")

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-ZvZrqo2NGOFgLzOcyhl12uiy on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-ZvZrqo2NGOFgLzOcyhl12uiy on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

Righe ritornate: [] 



In [None]:
# Embed sentence 12 with the query-side model and look up the closest stored documents.
vector = openai_query.embed_query(data.iloc[11]["sentence"])

# The earlier call passed `kwargs={'score_threshold': SOGLIA}` as a literal keyword
# argument. That never applied a threshold: the recorded run printed four rows
# scoring about 0.52-0.54 while SOGLIA was 0.99. The no-op argument is removed.
# NOTE(review): the LangChain Chroma documentation describes the value returned by
# this method as a distance (lower means more similar), so a similarity threshold
# should not be compared against it directly - confirm before adding a cut-off.
res = db.similarity_search_by_vector_with_relevance_scores(vector)
for doc, score in res:
    print(f"Riga: {doc.metadata['row']} con similarity {score}")

Riga: 7 con similarity 0.5228797197341919
Riga: 11 con similarity 0.5262295007705688
Riga: 1 con similarity 0.5331012606620789
Riga: 12 con similarity 0.541365385055542


## E5-Large-V2

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# E5-Large-V2 through the generic HuggingFace wrapper. This model is placed on
# cuda:0, and embeddings are requested without normalization.
model_name = "intfloat/e5-large-v2"
model_kwargs = {"device": "cuda:0"}
encode_kwargs = {"normalize_embeddings": False}
e5 = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)b9212/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 12.1MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 201/201 [00:00<00:00, 2.05MB/s]
Downloading (…)0777bb9212/README.md: 100%|██████████| 67.5k/67.5k [00:00<00:00, 16.6MB/s]
Downloading (…)77bb9212/config.json: 100%|██████████| 616/616 [00:00<00:00, 6.21MB/s]
Downloading (…)777bb9212/handler.py: 100%|██████████| 1.12k/1.12k [00:00<00:00, 11.3MB/s]
Downloading model.safetensors: 100%|██████████| 1.34G/1.34G [02:02<00:00, 10.9MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [02:01<00:00, 11.0MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 57.0/57.0 [00:00<00:00, 541kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 1.26MB/s]
Downloading (…)b9212/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.74MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 314/314 [00:00<00:00, 3.1

In [None]:
# Build a Chroma vector store for the E5 embeddings (a new store is created
# here; nothing is loaded from disk).
from langchain.vectorstores import Chroma

# Every section of this notebook builds its own store. Giving each one a
# dedicated collection name keeps vectors produced by different models apart.
# NOTE(review): with the default collection name, in-memory Chroma stores created
# in the same kernel may end up sharing one collection - confirm for the installed version.
db = Chroma.from_documents(
    docs,
    embedding=e5,
    collection_name="sentences_e5",
)
# Base retriever: returns 12 contexts, to be filtered by the similarity threshold.
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# Similarity-threshold filter on top of the E5-backed base retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.832
embeddings_filter_e5 = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=e5,
)
compression_retriever_e5 = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter_e5,
)

In [None]:
# Use every sentence as a query and print which rows pass the similarity filter.
for index, row in data.iterrows():
    # Read the sentence from the row yielded by iterrows instead of going back
    # through `data.iloc[index]`: `index` is an index label, which matches the
    # positional index only while the frame keeps its default 0..n-1 index.
    query = row["sentence"]
    result = compression_retriever_e5.get_relevant_documents(query)
    # Each returned document carries the row number it was built from.
    result_rows = sorted(doc.metadata["row"] for doc in result)
    print(f"Query Riga: {index+1}")
    print(f"Righe ritornate: {result_rows} \n")

Query Riga: 1
Righe ritornate: [1, 2, 4, 5] 

Query Riga: 2
Righe ritornate: [1, 2, 4, 5, 6] 

Query Riga: 3
Righe ritornate: [3, 9] 

Query Riga: 4
Righe ritornate: [1, 2, 4, 5, 6] 

Query Riga: 5
Righe ritornate: [1, 2, 4, 5, 6, 7] 

Query Riga: 6
Righe ritornate: [2, 4, 5, 6] 

Query Riga: 7
Righe ritornate: [5, 7] 

Query Riga: 8
Righe ritornate: [8] 

Query Riga: 9
Righe ritornate: [3, 9, 12] 

Query Riga: 10
Righe ritornate: [10, 11, 12] 

Query Riga: 11
Righe ritornate: [10, 11, 12] 

Query Riga: 12
Righe ritornate: [9, 10, 11, 12] 



In [None]:
# Inspect the raw relevance scores of the first sentence against all 12 stored rows.
query_position = 0
query = data.iloc[query_position]["sentence"]

# The search returns (document, score) pairs. `result` is kept in that form
# because the next cell prints the score of each pair.
result = db.similarity_search_with_relevance_scores(query, k=12)

# Unpack the pairs: the metadata lives on the document, not on the tuple.
result_rows = sorted(doc.metadata["row"] for doc, _score in result)

# Report the row that was actually queried, rather than whatever value `index`
# was left with by an earlier loop.
print(f"Query Riga: {query_position+1}")
print(f"Righe ritornate: {result_rows} \n")

In [None]:
# `result` holds (document, score) pairs: print the source row and score of each.
for doc, score in result:
    print(f"Riga: {doc.metadata['row']}")
    print(f"score: {score} \n -------")

Riga: 1
score: 0.9999999999991269 
 -------
Riga: 2
score: 0.8566821809863276 
 -------
Riga: 5
score: 0.7689114848662242 
 -------
Riga: 4
score: 0.7651595513383602 
 -------
Riga: 6
score: 0.7543052627071071 
 -------
Riga: 7
score: 0.7441451537442527 
 -------
Riga: 10
score: 0.721535097538947 
 -------
Riga: 9
score: 0.7144739620534015 
 -------
Riga: 12
score: 0.7132661387422006 
 -------
Riga: 8
score: 0.7023009406790911 
 -------
Riga: 3
score: 0.6953162806501687 
 -------
Riga: 11
score: 0.6941541234493294 
 -------


## all-MPNET-base-v2

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# all-mpnet-base-v2 through the generic HuggingFace wrapper, on cuda:1,
# with embeddings requested without normalization.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda:1"}
encode_kwargs = {"normalize_embeddings": False}
mptNet = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
# Build a Chroma vector store for the all-mpnet-base-v2 embeddings (a new store
# is created here; nothing is loaded from disk).
from langchain.vectorstores import Chroma

# Every section of this notebook builds its own store. Giving each one a
# dedicated collection name keeps vectors produced by different models apart.
# NOTE(review): with the default collection name, in-memory Chroma stores created
# in the same kernel may end up sharing one collection - confirm for the installed version.
db = Chroma.from_documents(
    docs,
    embedding=mptNet,
    collection_name="sentences_mpnet",
)
# Base retriever: returns 12 contexts, to be filtered by the similarity threshold.
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# Similarity-threshold filter on top of the mpnet-backed base retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.57
embeddings_filter_mptNet = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=mptNet,
)
compression_retriever_mptNet = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter_mptNet,
)

In [None]:
# Use every sentence as a query and print which rows pass the similarity filter.
for index, row in data.iterrows():
    # Read the sentence from the row yielded by iterrows instead of going back
    # through `data.iloc[index]`: `index` is an index label, which matches the
    # positional index only while the frame keeps its default 0..n-1 index.
    query = row["sentence"]
    result = compression_retriever_mptNet.get_relevant_documents(query)
    # Each returned document carries the row number it was built from.
    result_rows = sorted(doc.metadata["row"] for doc in result)
    print(f"Query Riga: {index+1}")
    print(f"Righe ritornate: {result_rows} \n")

Query Riga: 1
Righe ritornate: [1, 2] 

Query Riga: 2
Righe ritornate: [1, 2] 

Query Riga: 3
Righe ritornate: [3, 6, 9] 

Query Riga: 4
Righe ritornate: [4, 5, 6] 

Query Riga: 5
Righe ritornate: [4, 5, 6] 

Query Riga: 6
Righe ritornate: [3, 4, 5, 6, 9] 

Query Riga: 7
Righe ritornate: [7] 

Query Riga: 8
Righe ritornate: [8] 

Query Riga: 9
Righe ritornate: [3, 6, 9] 

Query Riga: 10
Righe ritornate: [10, 11] 

Query Riga: 11
Righe ritornate: [10, 11] 

Query Riga: 12
Righe ritornate: [12] 



## all-MiniLM-L12-v2

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# all-MiniLM-L12-v2 through the generic HuggingFace wrapper, on cuda:1,
# with embeddings requested without normalization.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
model_kwargs = {"device": "cuda:1"}
encode_kwargs = {"normalize_embeddings": False}
miniLM = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

Downloading (…)5dded/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 13.0MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 2.50MB/s]
Downloading (…)4d81d5dded/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 84.9MB/s]
Downloading (…)81d5dded/config.json: 100%|██████████| 573/573 [00:00<00:00, 7.44MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 1.56MB/s]
Downloading (…)ded/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 394kB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:11<00:00, 11.2MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 553kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 1.25MB/s]
Downloading (…)5dded/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.48MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 3.41MB/s]
Downloading (…)dded/train_script.py: 100%|█

In [None]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store for the all-MiniLM-L12-v2 embeddings.
# Every section of this notebook builds its own store. Giving each one a
# dedicated collection name keeps vectors produced by different models apart.
# NOTE(review): with the default collection name, in-memory Chroma stores created
# in the same kernel may end up sharing one collection - confirm for the installed version.
db = Chroma.from_documents(
    docs,
    embedding=miniLM,
    collection_name="sentences_minilm",
)
# Base retriever: returns 12 contexts, to be filtered by the similarity threshold.
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# Similarity-threshold filter on top of the MiniLM-backed base retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.50
embeddings_filter_miniLM = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=miniLM,
)
compression_retriever_miniLM = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter_miniLM,
)

In [None]:
# Use every sentence as a query and print which rows pass the similarity filter.
for index, row in data.iterrows():
    # Read the sentence from the row yielded by iterrows instead of going back
    # through `data.iloc[index]`: `index` is an index label, which matches the
    # positional index only while the frame keeps its default 0..n-1 index.
    query = row["sentence"]
    result = compression_retriever_miniLM.get_relevant_documents(query)
    # Each returned document carries the row number it was built from.
    result_rows = sorted(doc.metadata["row"] for doc in result)
    print(f"Query Riga: {index+1}")
    print(f"Righe ritornate: {result_rows} \n")

Query Riga: 1
Righe ritornate: [1, 2] 

Query Riga: 2
Righe ritornate: [1, 2] 

Query Riga: 3
Righe ritornate: [3, 6, 9] 

Query Riga: 4
Righe ritornate: [4, 5, 6] 

Query Riga: 5
Righe ritornate: [4, 5, 6] 

Query Riga: 6
Righe ritornate: [3, 4, 5, 6] 

Query Riga: 7
Righe ritornate: [7] 

Query Riga: 8
Righe ritornate: [8] 

Query Riga: 9
Righe ritornate: [3, 9] 

Query Riga: 10
Righe ritornate: [10, 11, 12] 

Query Riga: 11
Righe ritornate: [10, 11, 12] 

Query Riga: 12
Righe ritornate: [10, 11, 12] 



## GTE-Large

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# GTE-Large through the generic HuggingFace wrapper, on cuda:1,
# with embeddings requested without normalization.
model_name = "thenlper/gte-large"
model_kwargs = {"device": "cuda:1"}
encode_kwargs = {"normalize_embeddings": False}
gte = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/thenlper_gte-large. Creating a new one with MEAN pooling.


In [None]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store for the GTE-Large embeddings.
# Every section of this notebook builds its own store. Giving each one a
# dedicated collection name keeps vectors produced by different models apart.
# NOTE(review): with the default collection name, in-memory Chroma stores created
# in the same kernel may end up sharing one collection - confirm for the installed version.
db = Chroma.from_documents(
    docs,
    embedding=gte,
    collection_name="sentences_gte",
)
# Base retriever: returns 12 contexts, to be filtered by the similarity threshold.
retriever = db.as_retriever(search_kwargs={"k": 12}, search_type="similarity")

In [None]:
# Similarity-threshold filter on top of the GTE-backed base retriever.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

# SOGLIA ("threshold") is the value handed to the filter as similarity_threshold.
SOGLIA = 0.87
embeddings_filter_gte = EmbeddingsFilter(
    similarity_threshold=SOGLIA,
    embeddings=gte,
)
compression_retriever_gte = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter_gte,
)

In [None]:
# Use every sentence as a query and print which rows pass the similarity filter.
for index, row in data.iterrows():
    # Read the sentence from the row yielded by iterrows instead of going back
    # through `data.iloc[index]`: `index` is an index label, which matches the
    # positional index only while the frame keeps its default 0..n-1 index.
    query = row["sentence"]
    result = compression_retriever_gte.get_relevant_documents(query)
    # Each returned document carries the row number it was built from.
    result_rows = sorted(doc.metadata["row"] for doc in result)
    print(f"Query Riga: {index+1}")
    print(f"Righe ritornate: {result_rows} \n")

Query Riga: 1
Righe ritornate: [1, 2] 

Query Riga: 2
Righe ritornate: [1, 2, 8] 

Query Riga: 3
Righe ritornate: [3, 6, 9] 

Query Riga: 4
Righe ritornate: [4, 5, 6] 

Query Riga: 5
Righe ritornate: [4, 5, 6] 

Query Riga: 6
Righe ritornate: [3, 4, 5, 6, 9] 

Query Riga: 7
Righe ritornate: [7] 

Query Riga: 8
Righe ritornate: [2, 8, 9] 

Query Riga: 9
Righe ritornate: [3, 6, 8, 9] 

Query Riga: 10
Righe ritornate: [10, 11, 12] 

Query Riga: 11
Righe ritornate: [10, 11, 12] 

Query Riga: 12
Righe ritornate: [10, 11, 12] 



In [None]:
from sentence_transformers import util

# Cosine similarity between the GTE embeddings of the second and third sentences.
# `HuggingFaceEmbeddings.embed_documents` has no `convert_to_tensor` option (the
# recorded run failed with a TypeError on it) and expects a list of texts, so
# each sentence is wrapped in a one-element list.
A = gte.embed_documents([data.iloc[1]["sentence"]])
B = gte.embed_documents([data.iloc[2]["sentence"]])

# NOTE(review): relies on pytorch_cos_sim converting plain Python lists to
# tensors itself - confirm for the installed sentence-transformers version.
util.pytorch_cos_sim(A, B)

TypeError: HuggingFaceEmbeddings.embed_documents() got an unexpected keyword argument 'convert_to_tensor'

In [None]:
from langchain.evaluation import EmbeddingDistanceEvalChain

# Evaluation chain that scores the embedding distance between two strings using GTE.
chain = EmbeddingDistanceEvalChain(embeddings=gte)

# Rows are read in groups of three (S1, S2, TARGET) starting at rows 0, 3, 6, 9.
# The chain reports a distance under 'score'; 1 - distance is printed for each pair.
for start in (0, 3, 6, 9):
    s1, s2, t = (data.iloc[start + offset]["sentence"] for offset in range(3))
    for label, candidate in (("S1-TARGET", s1), ("S2-TARGET", s2)):
        distance = chain.evaluate_strings(prediction=candidate, reference=t)
        print(f"{label}: {1-distance['score']}")
    distance = chain.evaluate_strings(prediction=s1, reference=s2)
    print(f"S1-S2: {1-distance['score']} \n -----------------------------")

S1-TARGET: 0.8230722291311259
S2-TARGET: 0.8461697329265342
S1-S2: 0.9155346744881976 
 -----------------------------
S1-TARGET: 0.9155068380743657
S2-TARGET: 0.9105818033904317
S1-S2: 0.9968708839028425 
 -----------------------------
S1-TARGET: 0.8223792497763691
S2-TARGET: 0.8752487659117552
S1-S2: 0.8448056204368667 
 -----------------------------
S1-TARGET: 0.8758212720703982
S2-TARGET: 0.8784227767909063
S1-S2: 0.9021518479642557 
 -----------------------------


## **LLM Compressor**

Un altro modo per filtrare i contesti ritornati dal base retriever è usare un LLM che li legge insieme alla query e scarta quelli che non considera rilevanti

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter

# Base retriever limited to the 5 most similar contexts; the LLM filter then
# decides which of them to keep.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# `local_llm` is not defined in this part of the notebook; it has to exist
# before this cell runs. A custom prompt can be supplied through `prompt=`
# (for example `myPrompt`); it is left disabled here, so the default is used.
compressor = LLMChainFilter.from_llm(local_llm)

compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [None]:
from langchain.prompts import PromptTemplate

# A custom prompt can be defined for the LLM that acts as the filter, which is
# useful for giving it extra documentation about the context it works in.
# NOTE(review): presumably the filter needs the prompt to carry an output parser
# for the YES/NO answer before it can replace the default prompt - confirm.
myTemplate = """
Given the following question and context, return YES if the context is STRICTLY relevant to the question and NO if it isn't.

> Question: {question}
> Context: {context}
> Is the context relevant? answer:
"""

myPrompt = PromptTemplate(
    input_variables=["context", "question"],
    template=myTemplate,
)

In [None]:
# It can be used directly as a plain retriever ("query" is a placeholder string)
compression_retriever.get_relevant_documents("query")

# Or it can be plugged into a QA chain backed by the vector db, in place of the usual base retriever