In [9]:
from nomic import embed
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
from tqdm import tqdm, trange
import re

In [19]:
def extract_concepts(concepts):
    """Collect the text wrapped in ``<example>...</example>`` tags.

    Args:
        concepts: Iterable of strings; each may hold zero or more
            ``<example>`` elements.

    Returns:
        A flat list of every captured tag body, in order of appearance
        across the inputs. The pattern's ``.`` does not match newlines, so
        an example whose body spans several lines is not captured.
    """
    example_tag = re.compile(r"<example>(.*?)</example>")
    return [body for entry in concepts for body in example_tag.findall(entry)]

In [21]:
# Read the dataset as UTF-8 first; fall back to Latin-1 only when the bytes
# are not valid UTF-8.
try:
    df = pd.read_csv('dp.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('dp.csv', encoding='latin1')

# Force both working columns to str so missing cells become plain strings
# instead of float NaN before they reach the regex / embedding steps.
for column in ('texts', 'concepts'):
    df[column] = df[column].astype(str)

# T: one raw text per row. C: every <example> body found in the concepts column.
T = df['texts'].tolist()
C = extract_concepts(df['concepts'].tolist())

nomic-embed-text-v1

In [27]:
# Both corpora are embedded with the same model and the same settings.
_nomic_kwargs = {
    'model': 'nomic-embed-text-v1',
    'task_type': 'clustering',
    'inference_mode': 'local',
    'device': 'gpu',
}
texts_output = embed.text(texts=T, **_nomic_kwargs)
concepts_output = embed.text(texts=C, **_nomic_kwargs)

In [28]:
# Pull the embedding vectors out of the two embed.text() responses.
concepts = concepts_output['embeddings']
texts = texts_output['embeddings']

In [35]:
# Zero out similarities that do not exceed the threshold (0.5 by default)
def cosine_similarity_05(concepts, texts, threshold=0.5):
    """Cosine similarity of every text against every concept, thresholded.

    Args:
        concepts: Sequence of concept embedding vectors.
        texts: Sequence of text embedding vectors.
        threshold: Similarities less than or equal to this value are
            replaced by 0. Defaults to 0.5.

    Returns:
        Dict mapping ``'text_<i>'`` to a list with one entry per concept (in
        the order of ``concepts``): the cosine similarity when it is above
        ``threshold``, otherwise 0.
    """
    Similarity_results = {}
    for i, text in enumerate(texts):
        # The text norm does not depend on the concept, so compute it once.
        text_norm = norm(text)
        row = []
        # Iterate the concept vectors directly: unpacking each vector into
        # ``j, concept`` (as if enumerate() had been applied) fails for any
        # vector whose length is not exactly 2.
        for concept in concepts:
            similarity = np.dot(text, concept) / (text_norm * norm(concept))
            row.append(0 if similarity <= threshold else similarity)
        Similarity_results[f'text_{i}'] = row

    return Similarity_results

In [37]:
# Apply the 0.5-threshold filter to every text/concept pair and print the
# whole result dict (one list per text, one entry per concept).
Similarity_results = cosine_similarity_05(concepts, texts)
print(Similarity_results)


{'text_0': [0.5996681682493854, 0.5282199175499961, 0.5338829048831386, 0.6448488690824624, 0.5848336239250377, 0.5379601127553603, 0.5425707676555278, 0.5355060533203366, 0.583590149424887, 0.5608638796180688, 0.544777864627073, 0.5993819816453823, 0.5935803667730819, 0.6009109976083896, 0.5436365880364012, 0.524851832236069, 0.5719641100832754, 0.5553188098238013, 0.607258395074126, 0.5370032672874123, 0.5419039918805215, 0.5267723518559189, 0.560862352426763, 0.5112695413021853, 0.5138644932773803, 0.5293370514245297, 0.5094700128244218, 0.5082485578559017, 0.5256897759415491, 0.530172321903927, 0.5260336132891552, 0.5055127102400155, 0, 0, 0.5002839272603072, 0.535238164895063, 0, 0.5364247738885162, 0.5559961363646121, 0.5006203758769172, 0, 0.5272015329505249, 0, 0.5563067359012899, 0.5621914011208828, 0.6177972699885242, 0.6091184091160821, 0.5870357363852784, 0.574914775009062, 0.5841518877697858, 0.5446160395620119, 0.5433041181350224, 0.6279958316872976, 0.5789181966606881, 0

In [45]:
# Retain only similarities that are not below the per-text mean
def cosine_similarity_mean(concepts, texts):
    """Cosine similarity of every text against every concept, mean-filtered.

    For each text the mean of its similarities over all concepts is computed;
    similarities strictly below that mean are replaced by 0, while values
    equal to or above it are kept.

    Args:
        concepts: Sequence of concept embedding vectors.
        texts: Sequence of text embedding vectors.

    Returns:
        Dict mapping ``'text_<i>'`` to a list with one entry per concept (in
        the order of ``concepts``). When ``concepts`` is empty each list is
        empty.
    """
    Similarity_results = {}
    for i, text in enumerate(texts):
        # The text norm does not depend on the concept, so compute it once.
        text_norm = norm(text)
        similarities = [
            np.dot(text, concept) / (text_norm * norm(concept))
            for concept in concepts
        ]
        # Guard the mean: with no concepts there is nothing to average and
        # dividing by the count would raise ZeroDivisionError.
        if similarities:
            average = sum(similarities) / len(similarities)
            similarities = [0 if sim < average else sim for sim in similarities]
        Similarity_results[f'text_{i}'] = similarities
    return Similarity_results

In [47]:
# For each text keep only the similarities that are not below that text's
# mean similarity (the rest become 0), then print the whole result dict.
Similarity_results_mean = cosine_similarity_mean(concepts, texts)
print(Similarity_results_mean)

{'text_0': [0.5996681682493854, 0, 0, 0.6448488690824624, 0.5848336239250377, 0, 0, 0, 0.583590149424887, 0.5608638796180688, 0, 0.5993819816453823, 0.5935803667730819, 0.6009109976083896, 0, 0, 0.5719641100832754, 0, 0.607258395074126, 0, 0, 0, 0.560862352426763, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5621914011208828, 0.6177972699885242, 0.6091184091160821, 0.5870357363852784, 0.574914775009062, 0.5841518877697858, 0, 0, 0.6279958316872976, 0.5789181966606881, 0, 0.5621337479212506, 0.5711964064612669, 0.5871419433950428, 0, 0.5663976787909932, 0, 0, 0.6042716209397702, 0.55873942325173, 0, 0.5628978005039055, 0, 0, 0.5979474975188511, 0.5714686986806854, 0.5821761565524733, 0.5753918364808446, 0.5824260061575316, 0.6057690520444483, 0.6084788225301212, 0.5740141606638826, 0, 0.6158791051120667, 0, 0.5951850325564005, 0.5964941011127561, 0.5706098320386903, 0, 0, 0.581196722045183, 0.5600073082633933, 0.5893688162635694, 0.5782553529349189, 0.581618165992913

In [61]:
# Retain only the top_n highest similarities per text
def cosine_similarity_ranking(concepts, texts, top_n=10):
    """Cosine similarity of every text against every concept, top-N filtered.

    Args:
        concepts: Sequence of concept embedding vectors.
        texts: Sequence of text embedding vectors.
        top_n: Number of highest similarities to keep per text. Defaults to
            10. When there are fewer than ``top_n`` concepts all of them are
            kept; a value of 0 or less keeps none.

    Returns:
        Dict mapping ``'text_<i>'`` to a list with one entry per concept (in
        the order of ``concepts``): the cosine similarity for the ``top_n``
        best-ranked concepts and 0 for every other position.
    """
    Similarity_results = {}
    for i, text in enumerate(texts):
        # The text norm does not depend on the concept, so compute it once.
        text_norm = norm(text)
        similarities = [
            np.dot(text, concept) / (text_norm * norm(concept))
            for concept in concepts
        ]
        if top_n > 0:
            # argsort is ascending, so the last top_n indices are the winners.
            # A set gives constant-time membership tests below.
            top_indices = set(np.argsort(similarities)[-top_n:].tolist())
        else:
            # A plain [-0:] slice would select everything, not nothing.
            top_indices = set()
        Similarity_results[f'text_{i}'] = [
            sim if k in top_indices else 0
            for k, sim in enumerate(similarities)
        ]

    return Similarity_results

In [63]:
# For each text keep only its 10 highest similarities (all other positions
# become 0), then print the whole result dict.
Similarity_results_ranking = cosine_similarity_ranking(concepts, texts)
print(Similarity_results_ranking)

{'text_0': [0, 0, 0, 0.6448488690824624, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.607258395074126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6177972699885242, 0.6091184091160821, 0, 0, 0, 0, 0, 0.6279958316872976, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6084788225301212, 0, 0, 0.6158791051120667, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.617443220730784, 0.6076486619311432, 0, 0, 0.6072489988699115, 0, 0, 0, 0, 0], 'text_1': [0, 0, 0, 0.6574574054634, 0, 0, 0, 0, 0.6296174960197024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6236706159428739, 0, 0, 0.6203260211500566, 0.6457911707470764, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6224575878079945, 0, 0, 0, 0, 0, 0.6269228283646546, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.613706441184669, 0, 0, 0.6265033326620506, 0.6332540141931048, 0], 'text_2': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6197754

test model

In [42]:
# Two sample sentences for a quick check of the v1.5 model.
# The names are lowercase so they match the names passed to embed.text()
# below; the capitalised definitions left the call referencing undefined
# variables on a fresh kernel.
text1 = "A man is eating a piece of bread."
text2 = "A girl is riding a horse."

output = embed.text(
    texts=[text1, text2],
    model='nomic-embed-text-v1.5',
    task_type='classification',
    inference_mode='local',
    device='gpu',
)

In [44]:
# Cosine similarity between the two sample-sentence embeddings; the bare
# name on the last line displays the result as the cell output.
similarity= cos_sim(output['embeddings'][0],output['embeddings'][1])
similarity

tensor([[0.4971]])

mxbai-embed-large-v1

In [109]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sentence_transformers.quantization import quantize_embeddings


# The query is docs[0]; the remaining sentences are the candidates it is
# compared against.
query = ' A man is eating a piece of bread'
docs = [
    query,
    "A man is eating food.",
    "A man is eating pasta.",
    "The girl is eating a piece of bread.",
    "A girl is riding a horse.",
]

# `dimensions` is forwarded to the model as truncate_dim.
dimensions = 512
model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    truncate_dim=dimensions,
)

embeddings = model.encode(docs)

# Quantized copy of the embeddings; nothing later in this cell reads it.
binary_embeddings = quantize_embeddings(embeddings, precision="ubinary")

# Similarity of the query (row 0) against every other sentence.
similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities)



similarities: tensor([[0.7928, 0.6144, 0.4431, 0.0625]])


all-mpnet-base-v2

In [73]:
# Sentence pair to embed with all-mpnet-base-v2.
sentences = ["A man is eating a piece of bread.", "A girl is riding a horse."]

# NOTE: `model` and `embeddings` rebind the names used by the mxbai cell,
# so that cell's objects are no longer reachable after this one runs.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences)
print(embeddings)

[[ 0.0178611  -0.03334754 -0.00598936 ...  0.01937947 -0.00063177
   0.01153586]
 [ 0.03499187  0.00259032  0.01316311 ...  0.02277771  0.00654575
   0.00372357]]


In [74]:
# Compare the first sentence embedding against the remaining row(s).
similarities = cos_sim(embeddings[0], embeddings[1:])

In [75]:
# Display the similarity tensor as the cell output.
similarities

tensor([[0.0236]])