# Comparison of multilingual embedding models

In [1]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> block that makes <pre> output wrap instead of overflowing.

  This function is registered below as a ``pre_run_cell`` callback, so it is
  invoked before each cell runs.

  Args:
    info: Ignored. IPython documents ``pre_run_cell`` callbacks as being
      called with an execution-info object; the parameter is optional so the
      function can also still be called with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [4]:
pip install -q llama-index

In [5]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms import OpenAI
# Load the French and Italian query / corpus / relevance datasets from JSON.
fr_dataset = EmbeddingQAFinetuneDataset.from_json("/content/fr_dataset.json")
it_dataset = EmbeddingQAFinetuneDataset.from_json("/content/it_dataset.json")

# Drop every corpus entry whose text is the empty string (Italian first, then
# French). Only `corpus` is modified; `queries` and `relevant_docs` are left
# as loaded.
for dataset in (it_dataset, fr_dataset):
    keys_to_delete = [doc_id for doc_id, text in dataset.corpus.items() if text == '']
    for key in keys_to_delete:
        del dataset.corpus[key]

## Italian dataset

In [None]:
# Attributes stored on the dataset object.
print(vars(it_dataset).keys())

# One hand-picked sample: the query text, the ids listed as relevant for it,
# and the text of one corpus document.
it_sample_query_id = "26c7b899-bb79-4dfe-bcf6-0a69649932aa"
it_sample_doc_id = "1dc6e9c6-90f3-419c-9d03-01ea745f0f18"
print(it_dataset.queries[it_sample_query_id])
print(it_dataset.relevant_docs[it_sample_query_id])
print(it_dataset.corpus[it_sample_doc_id])

dict_keys(['queries', 'corpus', 'relevant_docs', 'mode'])
Qual è il titolo del documento di lavoro dei servizi della Commissione che accompagna la Raccomandazione del Consiglio sul programma nazionale di riforma 2023 dell'Italia?
['1dc6e9c6-90f3-419c-9d03-01ea745f0f18']
IT   IT 
 
 
 COMMISSIONE  
EUROPEA   
Bruxelles, 24.5.2023  
SWD(2023) 612 final  
 
DOCUMENTO DI LAVORO DEI SERVIZI DELLA COMMISSIONE  
Relazione per paese 2023 - Italia  
che accompagna il documento  
Raccomandazione di RACCOMANDAZIONE DEL CONSIGLIO  
sul programma nazionale di riforma 2023 dell'Italia e che formula un parere del 
Consiglio sul programma di stabilità 2023 dell'Italia  
{COM(2023)  612 final}


## French dataset

In [None]:
# Attributes stored on the dataset object.
print(vars(fr_dataset).keys())

# One hand-picked sample: the query text, the ids listed as relevant for it,
# and the text of one corpus document.
fr_sample_query_id = "132b18e9-cb9c-4c24-9a81-2b5b1dec3478"
fr_sample_doc_id = "eacf897e-c2b7-4666-b905-d7bb2a05a2e4"
print(fr_dataset.queries[fr_sample_query_id])
print(fr_dataset.relevant_docs[fr_sample_query_id])
print(fr_dataset.corpus[fr_sample_doc_id])

dict_keys(['queries', 'corpus', 'relevant_docs', 'mode'])
Quelle est la date du document de travail des services de la Commission pour la France ?
['eacf897e-c2b7-4666-b905-d7bb2a05a2e4']
FR   FR 
 
 
 COMMISSION  
EUROPÉENNE   
Bruxelles, le 24.5.2023  
SWD(2023) 610 final  
 
DOCUMENT DE TRAVAIL DES SERVICES DE LA COMMISSION  
Rapport 2023 pour la France  
accompagnant le document  
Recommandation de recommandation du Conseil  
 
concernant le programme national de réforme de la France pour 2023 et portant avis du 
Conseil sur le programme de stabilité de la France pour 2023  
{COM(2023)  610 final}


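## Evaluation metrics
- **Hit Rate**: fraction of queries whose expected document appears among the top-k retrieved documents.
- **MRR** (Mean Reciprocal Rank): mean over all queries of 1 / rank of the expected document in the retrieved list (0 when it is not retrieved).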

In [6]:
from tqdm.notebook import tqdm
import pandas as pd
from llama_index.evaluation import RetrieverEvaluator
import numpy as np
from llama_index.schema import TextNode
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings import OpenAIEmbedding
import os
# Do not hard-code the key in the notebook: keep a key that is already present
# in the environment, and only prompt (without echoing) when none is set.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Score an embedding model on a retrieval dataset.

    Every document in ``dataset.corpus`` becomes a ``TextNode`` in a
    ``VectorStoreIndex`` built with ``embed_model``. Each query in
    ``dataset.queries`` is then run through a retriever limited to ``top_k``
    results, and the retrieved node ids are compared with the first id listed
    for that query in ``dataset.relevant_docs``.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> text) and ``relevant_docs`` (query id -> list of ids).
        embed_model: Embedding model handed to ``ServiceContext.from_defaults``.
        top_k: Number of nodes requested from the retriever per query.
        verbose: Accepted but not used by this implementation.

    Returns:
        A list with one dict per query, holding the keys ``"is_hit"``,
        ``"mrr"``, ``"retrieved"``, ``"expected"`` and ``"query"``.
    """
    documents = dataset.corpus
    query_texts = dataset.queries
    ground_truth = dataset.relevant_docs

    # Index the whole corpus with the embedding model under test.
    context = ServiceContext.from_defaults(embed_model=embed_model)
    corpus_nodes = [
        TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in documents.items()
    ]
    vector_index = VectorStoreIndex(
        corpus_nodes, service_context=context, show_progress=True
    )
    top_k_retriever = vector_index.as_retriever(similarity_top_k=top_k)

    records = []
    for query_id, query_text in tqdm(query_texts.items()):
        hits = top_k_retriever.retrieve(query_text)
        retrieved_ids = [hit.node.node_id for hit in hits]

        # Only the first relevant document is considered for each query.
        expected_id = ground_truth[query_id][0]
        is_hit = expected_id in retrieved_ids

        # Reciprocal of the 1-based position of the expected id; 0 on a miss.
        mrr = 1 / (retrieved_ids.index(expected_id) + 1) if is_hit else 0

        records.append(
            {
                "is_hit": is_hit,
                "mrr": mrr,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
            }
        )
    return records

## text-embedding-3-large

In [None]:
from llama_index.embeddings import OpenAIEmbedding

# Model under test: OpenAI "text-embedding-3-large", no `dimensions` argument.
text_embedding_3_large = OpenAIEmbedding(model="text-embedding-3-large")

# Italian: one evaluate() record per query, averaged into hit rate and MRR.
text_embedding_3_large_results_it = evaluate(it_dataset, text_embedding_3_large)
df_text_embedding_3_large_results_it = pd.DataFrame(text_embedding_3_large_results_it)
hit_rate_text_embedding_3_large_it, mrr_text_embedding_3_large_it = (
    df_text_embedding_3_large_results_it["is_hit"].mean(),
    df_text_embedding_3_large_results_it["mrr"].mean(),
)

# French: same procedure on the French dataset.
text_embedding_3_large_results_fr = evaluate(fr_dataset, text_embedding_3_large)
df_text_embedding_3_large_results_fr = pd.DataFrame(text_embedding_3_large_results_fr)
hit_rate_text_embedding_3_large_fr, mrr_text_embedding_3_large_fr = (
    df_text_embedding_3_large_results_fr["is_hit"].mean(),
    df_text_embedding_3_large_results_fr["mrr"].mean(),
)

# Summary table: one row per language, one column per metric.
results = {
    "Hit Rate": [
        hit_rate_text_embedding_3_large_it,
        hit_rate_text_embedding_3_large_fr,
    ],
    "MRR": [
        mrr_text_embedding_3_large_it,
        mrr_text_embedding_3_large_fr,
    ],
}
df_results_text_embedding_3_large = pd.DataFrame(data=results, index=["Italian", "French"])
df_results_text_embedding_3_large

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Unnamed: 0,Hit Rate,MRR
Italian,0.844007,0.646962
French,0.808495,0.608371


## text-embedding-3-large reduced to 256 dimensions

In [None]:
from llama_index.embeddings import OpenAIEmbedding

# Model under test: OpenAI "text-embedding-3-large" requested with dimensions=256.
text_embedding_3_large_256 = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=256,
)

# Italian: one evaluate() record per query, averaged into hit rate and MRR.
text_embedding_3_large_256_results_it = evaluate(it_dataset, text_embedding_3_large_256)
df_text_embedding_3_large_256_results_it = pd.DataFrame(text_embedding_3_large_256_results_it)
hit_rate_text_embedding_3_large_256_it, mrr_text_embedding_3_large_256_it = (
    df_text_embedding_3_large_256_results_it["is_hit"].mean(),
    df_text_embedding_3_large_256_results_it["mrr"].mean(),
)

# French: same procedure on the French dataset.
text_embedding_3_large_256_results_fr = evaluate(fr_dataset, text_embedding_3_large_256)
df_text_embedding_3_large_256_results_fr = pd.DataFrame(text_embedding_3_large_256_results_fr)
hit_rate_text_embedding_3_large_256_fr, mrr_text_embedding_3_large_256_fr = (
    df_text_embedding_3_large_256_results_fr["is_hit"].mean(),
    df_text_embedding_3_large_256_results_fr["mrr"].mean(),
)

# Summary table: one row per language, one column per metric.
results = {
    "Hit Rate": [
        hit_rate_text_embedding_3_large_256_it,
        hit_rate_text_embedding_3_large_256_fr,
    ],
    "MRR": [
        mrr_text_embedding_3_large_256_it,
        mrr_text_embedding_3_large_256_fr,
    ],
}
df_results_text_embedding_3_large_256 = pd.DataFrame(data=results, index=["Italian", "French"])
df_results_text_embedding_3_large_256

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Unnamed: 0,Hit Rate,MRR
Italian,0.789819,0.582649
French,0.749627,0.554347


## Ada-002 embeddings

In [None]:
# No model name is passed, so whatever OpenAIEmbedding uses by default is
# evaluated here. NOTE(review): the section title implies that default is
# ada-002 - confirm for the installed library version.
ada = OpenAIEmbedding()

# Italian: one evaluate() record per query, averaged into hit rate and MRR.
ada_results_it = evaluate(it_dataset, ada)
df_ada_results_it = pd.DataFrame(ada_results_it)
hit_rate_ada_it, mrr_ada_it = (
    df_ada_results_it["is_hit"].mean(),
    df_ada_results_it["mrr"].mean(),
)

# French: same procedure on the French dataset.
ada_results_fr = evaluate(fr_dataset, ada)
df_ada_results_fr = pd.DataFrame(ada_results_fr)
hit_rate_ada_fr, mrr_ada_fr = (
    df_ada_results_fr["is_hit"].mean(),
    df_ada_results_fr["mrr"].mean(),
)

# Summary table: one row per language, one column per metric.
results = {
    "Hit Rate": [hit_rate_ada_it, hit_rate_ada_fr],
    "MRR": [mrr_ada_it, mrr_ada_fr],
}
df_results_ada = pd.DataFrame(data=results, index=["Italian", "French"])
df_results_ada

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Unnamed: 0,Hit Rate,MRR
Italian,0.758621,0.609524
French,0.772727,0.600025


## Cohere/Cohere-embed-multilingual-v3.0


In [None]:
pip install cohere

In [None]:
import numpy as np
from llama_index.embeddings.cohereai import CohereEmbedding

# Read the key from the environment when available; otherwise prompt for it
# without echoing, so the secret is never stored in the notebook.
from getpass import getpass

cohere_api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")

# Model under test: Cohere "embed-multilingual-v3.0".
# NOTE(review): evaluate() receives this single model for both the corpus and
# the queries, and it is configured with input_type="search_document" -
# confirm whether queries should be embedded with a query-specific input type.
embed_model_co = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-multilingual-v3.0",
    input_type="search_document",
)

# Italian: one evaluate() record per query, averaged into hit rate and MRR.
cohere_results_it = evaluate(it_dataset, embed_model_co)
df_cohere_results_it = pd.DataFrame(cohere_results_it)
hit_rate_co_it, mrr_co_it = (
    df_cohere_results_it["is_hit"].mean(),
    df_cohere_results_it["mrr"].mean(),
)

# French: same procedure on the French dataset.
cohere_results_fr = evaluate(fr_dataset, embed_model_co)
df_cohere_results_fr = pd.DataFrame(cohere_results_fr)
hit_rate_co_fr, mrr_co_fr = (
    df_cohere_results_fr["is_hit"].mean(),
    df_cohere_results_fr["mrr"].mean(),
)

# Summary table: one row per language, one column per metric.
results_cohere = {
    "Hit Rate": [hit_rate_co_it, hit_rate_co_fr],
    "MRR": [mrr_co_it, mrr_co_fr],
}
df_results_cohere = pd.DataFrame(data=results_cohere, index=["Italian", "French"])
df_results_cohere

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Unnamed: 0,Hit Rate,MRR
Italian,0.83087,0.647345
French,0.782414,0.583358


## intfloat/multilingual-e5-large

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Model under test: intfloat/multilingual-e5-large, mean pooling, inputs
# capped at 512 tokens.
# NOTE(review): no query/passage instruction prefixes are configured here; the
# multilingual-e5 model card describes "query: " / "passage: " prefixes -
# confirm whether they should be supplied.
embed_model_e5 = HuggingFaceEmbedding(
    model_name="intfloat/multilingual-e5-large",
    max_length=512,
    pooling='mean',
)

# Italian: evaluate, average the per-query records, and report both metrics.
e5_large_results_it = evaluate(it_dataset, embed_model_e5)
df_e5_large_results_it = pd.DataFrame(e5_large_results_it)
hit_rate_e5_large_it, mrr_e5_large_it = (
    df_e5_large_results_it["is_hit"].mean(),
    df_e5_large_results_it["mrr"].mean(),
)
print(f"Hit rate e5 large italian: {hit_rate_e5_large_it}")
print(f"MRR e5 large italian: {mrr_e5_large_it}")

# French: same procedure on the French dataset.
e5_large_results_fr = evaluate(fr_dataset, embed_model_e5)
df_e5_large_results_fr = pd.DataFrame(e5_large_results_fr)
hit_rate_e5_large_fr, mrr_e5_large_fr = (
    df_e5_large_results_fr["is_hit"].mean(),
    df_e5_large_results_fr["mrr"].mean(),
)
print(f"Hit rate e5 large french: {hit_rate_e5_large_fr}")
print(f"MRR e5 large french: {mrr_e5_large_fr}")

# Summary table: one row per language, one column per metric.
results_e5_large = {
    "Hit Rate": [hit_rate_e5_large_it, hit_rate_e5_large_fr],
    "MRR": [mrr_e5_large_it, mrr_e5_large_fr],
}
df_results_e5_large = pd.DataFrame(data=results_e5_large, index=["Italian", "French"])
df_results_e5_large

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Hit rate e5 large italian: 0.8390804597701149
MRR e5 large italian: 0.6702244116037219


Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Hit rate e5 large french: 0.7742175856929955
MRR e5 large french: 0.59826130153999


Unnamed: 0,Hit Rate,MRR
Italian,0.83908,0.670224
French,0.774218,0.598261


## Cohere/Cohere-embed-multilingual-light-v3.0


In [None]:
# Model under test: Cohere "embed-multilingual-light-v3.0", configured with
# the same key and input_type as the full-size Cohere model.
embed_model_co_light = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-multilingual-light-v3.0",
    input_type="search_document",
)

# Italian: evaluate, average the per-query records, and report both metrics.
cohere_light_results_it = evaluate(it_dataset, embed_model_co_light)
df_cohere_light_results_it = pd.DataFrame(cohere_light_results_it)
hit_rate_co_light_it, mrr_co_light_it = (
    df_cohere_light_results_it["is_hit"].mean(),
    df_cohere_light_results_it["mrr"].mean(),
)
print(f"Hit rate cohere light italian: {hit_rate_co_light_it}")
print(f"MRR cohere light italian: {mrr_co_light_it}")

# French: same procedure on the French dataset.
cohere_light_results_fr = evaluate(fr_dataset, embed_model_co_light)
df_cohere_light_results_fr = pd.DataFrame(cohere_light_results_fr)
hit_rate_co_light_fr, mrr_co_light_fr = (
    df_cohere_light_results_fr["is_hit"].mean(),
    df_cohere_light_results_fr["mrr"].mean(),
)
print(f"Hit rate cohere light french: {hit_rate_co_light_fr}")
print(f"MRR cohere light french: {mrr_co_light_fr}")

# Summary table: one row per language, one column per metric.
results_cohere_light = {
    "Hit Rate": [hit_rate_co_light_it, hit_rate_co_light_fr],
    "MRR": [mrr_co_light_it, mrr_co_light_fr],
}
df_results_cohere_light = pd.DataFrame(data=results_cohere_light, index=["Italian", "French"])
df_results_cohere_light

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Hit rate cohere light italian: 0.7980295566502463
MRR cohere light italian: 0.6211001642036125


Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Hit rate cohere light french: 0.7690014903129657
MRR cohere light french: 0.5830601092896175


Unnamed: 0,Hit Rate,MRR
Italian,0.79803,0.6211
French,0.769001,0.58306


## sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

In [8]:
# Model under test: paraphrase-multilingual-MiniLM-L12-v2, mean pooling,
# inputs capped at 128 tokens.
embed_model_paraphrase_l12 = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    pooling='mean',
    max_length=128,
)

# Italian: evaluate, average the per-query records, and report both metrics.
paraphrase_l12_results_it = evaluate(it_dataset, embed_model_paraphrase_l12)
df_paraphrase_l12_results_it = pd.DataFrame(paraphrase_l12_results_it)
hit_paraphrase_l12_results_it, mrr_paraphrase_l12_results_it = (
    df_paraphrase_l12_results_it["is_hit"].mean(),
    df_paraphrase_l12_results_it["mrr"].mean(),
)
print(f"Hit rate paraphrase l12 italian: {hit_paraphrase_l12_results_it}")
print(f"MRR paraphrase l12 italian: {mrr_paraphrase_l12_results_it}")

# French: same procedure on the French dataset.
paraphrase_l12_results_fr = evaluate(fr_dataset, embed_model_paraphrase_l12)
df_paraphrase_l12_results_fr = pd.DataFrame(paraphrase_l12_results_fr)
hit_paraphrase_l12_results_fr, mrr_paraphrase_l12_results_fr = (
    df_paraphrase_l12_results_fr["is_hit"].mean(),
    df_paraphrase_l12_results_fr["mrr"].mean(),
)
print(f"Hit rate paraphrase l12 french: {hit_paraphrase_l12_results_fr}")
print(f"MRR paraphrase l12 french: {mrr_paraphrase_l12_results_fr}")

# Summary table: one row per language, one column per metric.
results_paraphrase_l12 = {
    "Hit Rate": [hit_paraphrase_l12_results_it, hit_paraphrase_l12_results_fr],
    "MRR": [mrr_paraphrase_l12_results_it, mrr_paraphrase_l12_results_fr],
}
df_results_paraphrase_l12 = pd.DataFrame(data=results_paraphrase_l12, index=["Italian", "French"])
df_results_paraphrase_l12

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Hit rate paraphrase l12 italian: 0.5402298850574713
MRR paraphrase l12 italian: 0.3936781609195402


Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Hit rate paraphrase l12 french: 0.48360655737704916
MRR paraphrase l12 french: 0.3544088425235966


Unnamed: 0,Hit Rate,MRR
Italian,0.54023,0.393678
French,0.483607,0.354409


## izhx/udever-bloom-1b1

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Model under test: izhx/udever-bloom-1b1, inputs capped at 512 tokens.
# NOTE(review): unlike the other HuggingFaceEmbedding cells, no `pooling`
# argument is passed, and the scores recorded for this model are close to
# zero - confirm that the library's default pooling suits this model.
embed_model_bloom_1b = HuggingFaceEmbedding(
    model_name="izhx/udever-bloom-1b1",
    max_length=512,
)

# Italian: evaluate, average the per-query records, and report both metrics.
bloom_results_it = evaluate(it_dataset, embed_model_bloom_1b)
df_bloom_results_it = pd.DataFrame(bloom_results_it)
hit_bloom_results_it, mrr_bloom_results_it = (
    df_bloom_results_it["is_hit"].mean(),
    df_bloom_results_it["mrr"].mean(),
)
print(f"Hit rate Bloom italian: {hit_bloom_results_it}")
print(f"MRR Bloom italian: {mrr_bloom_results_it}")

# French: same procedure on the French dataset.
bloom_results_fr = evaluate(fr_dataset, embed_model_bloom_1b)
df_bloom_results_fr = pd.DataFrame(bloom_results_fr)
hit_bloom_results_fr, mrr_bloom_results_fr = (
    df_bloom_results_fr["is_hit"].mean(),
    df_bloom_results_fr["mrr"].mean(),
)
print(f"Hit rate Bloom french: {hit_bloom_results_fr}")
print(f"MRR Bloom french: {mrr_bloom_results_fr}")

# Summary table: one row per language, one column per metric.
results_bloom = {
    "Hit Rate": [hit_bloom_results_it, hit_bloom_results_fr],
    "MRR": [mrr_bloom_results_it, mrr_bloom_results_fr],
}
df_results_bloom = pd.DataFrame(data=results_bloom, index=["Italian", "French"])
df_results_bloom

Generating embeddings:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

Hit rate Bloom italian: 0.024630541871921183
MRR Bloom italian: 0.006787082649151614


Generating embeddings:   0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/1342 [00:00<?, ?it/s]

Hit rate Bloom french: 0.03353204172876304
MRR Bloom french: 0.014294585196224539


Unnamed: 0,Hit Rate,MRR
Italian,0.024631,0.006787
French,0.033532,0.014295
