In [1]:
import pandas as pd

# Evaluation results for gpt-4o-mini, read straight from the course repository.
# The `?raw=1` suffix asks GitHub for the raw CSV rather than the HTML "blob" page.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [2]:
# Restrict every computation below to the first 300 records.
df = df.head(300)

## Q1. Getting the embeddings model

In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for the embedding-based similarity scores below.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name_or_path=model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [4]:
# Embed the LLM-generated answer of the very first record.
answer_llm = df['answer_llm'].iloc[0]
vector_llm = embedding_model.encode(answer_llm)

In [5]:
# First component of the embedding computed in the previous cell. Reusing
# `vector_llm` avoids running the model a second time on the same text
# (assumes the notebook is executed top to bottom).
vector_llm[0]

np.float32(-0.42244655)

## Q2. Computing the dot product

In [6]:
# Dot product between the embedding of each LLM answer and the embedding of
# the matching original answer: one score per row of `df`.
evaluations = []
for answer_llm, answer_orig in zip(df['answer_llm'], df['answer_orig']):
    vector_llm = embedding_model.encode(answer_llm)
    vector_orig = embedding_model.encode(answer_orig)

    evaluations.append(vector_llm.dot(vector_orig))

In [7]:
# Summary statistics of the dot-product scores (the "75%" row is the 75th percentile).
evaluations = pd.Series(data=evaluations)
evaluations.describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
dtype: float64

## Q3. Computing the cosine similarity

In [8]:
import numpy as np

def normalize(v: np.ndarray) -> np.ndarray:
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Vector to normalise. Anything `np.asarray` accepts works, so plain
        Python lists are fine as well as NumPy arrays.

    Returns
    -------
    np.ndarray
        `v` divided by its L2 norm. A vector whose norm is zero has no
        direction, so it is returned as zeros instead of the NaNs (and
        RuntimeWarning) that a 0/0 division would produce.
    """
    v = np.asarray(v)
    norm = np.linalg.norm(v)

    # Dividing by 1.0 leaves a zero vector unchanged while still returning a
    # new floating-point array, exactly like the regular path does.
    safe_norm = norm if norm > 0 else 1.0

    return v / safe_norm

# Cosine similarity per record: both embeddings are scaled to unit length
# first, so their dot product is the cosine of the angle between them.
evaluations = []
for answer_llm, answer_orig in zip(df['answer_llm'], df['answer_orig']):
    vector_llm = normalize(embedding_model.encode(answer_llm))
    vector_orig = normalize(embedding_model.encode(answer_orig))

    evaluations.append(vector_llm.dot(vector_orig))

In [9]:
# Summary statistics of the cosine scores (the "75%" row is the 75th percentile).
evaluations = pd.Series(data=evaluations)
evaluations.describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
dtype: float64

## Q4. Rouge

In [11]:
from rouge import Rouge
# Plain list of row dicts, reused by the ROUGE cells that follow.
records = df.to_dict(orient='records')
rouge_scorer = Rouge()

# Score one record (index 10): the LLM answer is the hypothesis, the original
# answer is the reference. `get_scores` returns a list; take its first entry.
r = records[10]
scores = rouge_scorer.get_scores(
    r['answer_llm'],
    r['answer_orig'],
)[0]

In [12]:
# Show the rouge-1 / rouge-2 / rouge-l entries ('r', 'p', 'f') computed in the previous cell.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5. Average rouge score

In [13]:
# Average of the three 'f' scores for the record scored above.
rouge_1, rouge_2, rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3
rouge_avg

0.35490034990035496

## Q6. Average rouge score for all the data points

In [14]:
# ROUGE 'f' scores (and their plain average) for every record.
evaluations = []

for r in records:
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

    rouge_1, rouge_2, rouge_l = (
        scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )
    rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

    evaluations.append(
        {'rouge_1': rouge_1, 'rouge_2': rouge_2, 'rouge_l': rouge_l, 'rouge_avg': rouge_avg}
    )

In [15]:
# Mean rouge_2 score across all scored records.
pd.DataFrame(evaluations)['rouge_2'].mean()

np.float64(0.20696501983423318)