In [24]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [3]:
# Evaluation data: the stored gpt-4o-mini results CSV in the course repository.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
# Appending "?raw=1" to the GitHub "blob" URL requests the file contents
# instead of the HTML page.
url = github_url + "?raw=1"
# Only the first 300 rows are used by the rest of the notebook.
df = pd.read_csv(url).head(300)

## Q1. Getting the embeddings model

In [6]:
# Checkpoint name of the sentence-transformers model used for every embedding below.
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name_or_path=model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [11]:
# Embed the LLM answer of the first record and show the first component
# of the resulting vector.
first_record = df.iloc[0]
answer_llm = first_record["answer_llm"]
emb = embedding_model.encode(answer_llm)
emb[0]

-0.42244682

## Q2. Computing the dot product

In [14]:
# One dict per row (column name -> value) so the loops below can iterate records.
documents = df.to_dict("records")

In [25]:
# Q2: raw dot product between the embedding of the LLM answer and the
# embedding of the original answer, one score per record.
evaluations = []

for doc in tqdm(documents):
    # Encode the LLM answer first, then the original answer, with the same model.
    v_llm, v_orig = (
        embedding_model.encode(doc[field])
        for field in ("answer_llm", "answer_orig")
    )
    evaluations.append(np.dot(v_llm, v_orig))

100%|██████████| 300/300 [00:38<00:00,  7.85it/s]


In [30]:
# 75th percentile of the scores currently held in `evaluations`.
np.percentile(evaluations, q=75)

31.674309253692627

## Q3. Computing the cosine

In [35]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Numeric vector; converted with ``np.asarray`` so plain lists work too.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. A zero vector has no direction, so it
        is returned as zeros instead of the NaNs a 0/0 division would give.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by 1 keeps the zeros (and the same result dtype as the
        # normal path) while avoiding 0/0 -> NaN and a RuntimeWarning.
        norm = 1.0
    return v / norm

In [36]:
# Q3: cosine similarity, i.e. the dot product of the two embeddings after
# each has been scaled to unit length. This rebinds `evaluations`, replacing
# the scores stored there by the previous loop.
evaluations = []

for doc in tqdm(documents):
    v_llm = normalize(embedding_model.encode(doc["answer_llm"]))
    v_orig = normalize(embedding_model.encode(doc["answer_orig"]))
    evaluations.append(np.dot(v_llm, v_orig))

100%|██████████| 300/300 [00:38<00:00,  7.84it/s]


In [37]:
# 75th percentile of the scores currently held in `evaluations`.
np.percentile(evaluations, q=75)

0.836234912276268

## Q4. Rouge

In [40]:
from rouge import Rouge


rouge_scorer = Rouge()

# ROUGE for a single example: the record labelled 10, with the LLM answer as
# the hypothesis and the original answer as the reference.
r = df.loc[10]
# The result is indexed with [0] to take the entry for this one pair.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
# End on the bare name so the cell renders the scores dict; an assignment
# alone produces no output.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5. Average rouge score

In [47]:
# Average of the "f" values across all metrics in the single-pair `scores` dict.
np.mean([metric["f"] for metric in scores.values()])

0.35490034990035496

## Q6. Average rouge score for all the data points

In [50]:
# Score every row: LLM answers are the hypotheses, original answers the references.
# NOTE: this rebinds `scores`, which earlier held the dict for a single pair,
# to the per-row results; cells written for the single dict will not work
# if re-run after this one.
scores = rouge_scorer.get_scores(
    hyps=df['answer_llm'],
    refs=df['answer_orig'],
)

In [60]:
# Keep only the "f" value of each metric, one row per scored pair.
f_scores = [
    {metric: values["f"] for metric, values in s.items()}
    for s in scores
]
df_scores = pd.DataFrame(f_scores)
# Row-wise mean over the metric columns.
df_scores["rouge_avg"] = df_scores.mean(axis=1)
df_scores

Unnamed: 0,rouge-1,rouge-2,rouge-l,rouge_avg
0,0.095238,0.028169,0.095238,0.072882
1,0.125000,0.055556,0.093750,0.091435
2,0.415584,0.177778,0.389610,0.327658
3,0.216216,0.047059,0.189189,0.150821
4,0.142076,0.033898,0.120219,0.098731
...,...,...,...,...
295,0.654545,0.540984,0.618182,0.604570
296,0.590164,0.460432,0.557377,0.535991
297,0.654867,0.564516,0.637168,0.618851
298,0.304762,0.132231,0.304762,0.247252


In [61]:
# Mean of the rouge-2 column over all scored rows.
df_scores.loc[:, "rouge-2"].mean()

0.20696501983423318