## Prep

In [44]:
import pandas as pd

# GitHub "blob" page of the results CSV, written in two pieces for readability.
github_url = (
    'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/'
    '04-monitoring/data/results-gpt4o-mini.csv'
)

# Appending `?raw=1` requests the raw file rather than the HTML blob page,
# so `read_csv` receives CSV text.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [49]:
# Keep only the first 300 records. `.copy()` makes the subset an independent
# DataFrame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df.iloc[:300].copy()

## Q1 Getting the Models

In [6]:
from sentence_transformers import SentenceTransformer

# Model identifier handed to SentenceTransformer.
model_name = 'multi-qa-mpnet-base-dot-v1'
# Created once here; later cells call `embedding_model.encode(...)` on it.
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [7]:
# Embed the LLM-generated answer of the first row (positional index 0).
answer_llm = df.iloc[0]['answer_llm']
answer_llm_embedding = embedding_model.encode(answer_llm)

In [9]:
# Display the first component of the embedding.
answer_llm_embedding[0]

-0.42244658

## Q2 Dot Product

In [13]:
def compute_similarity(record):
    """Return the dot product between the embeddings of a record's two answers.

    Args:
        record: Mapping with 'answer_orig' and 'answer_llm' entries; each value
            is passed to ``embedding_model.encode``.

    Returns:
        The dot product of the 'answer_llm' embedding with the 'answer_orig'
        embedding (not normalized).
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    reference_vec = embedding_model.encode(reference_text)

    return generated_vec.dot(reference_vec)

In [15]:
from tqdm.auto import tqdm

# One dot-product score per row, in row order; tqdm reports progress.
evaluations = [
    compute_similarity(record)
    for record in tqdm(df.to_dict(orient='records'))
]

100%|██████████| 300/300 [01:22<00:00,  3.64it/s]


In [17]:
# `assign` returns a new DataFrame carrying the extra column instead of writing
# into `df` in place, which avoids SettingWithCopyWarning when `df` is a slice
# of another frame.
df = df.assign(dot=evaluations)
df['dot'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dot'] = evaluations


count    300.000000
mean      27.495996
std        6.384743
min        4.547925
25%       24.307841
50%       28.336864
75%       31.674304
max       39.476013
Name: dot, dtype: float64

## Q3 Cosine

In [18]:
import numpy as np

def normalize(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (for example an embedding vector).

    Returns:
        ``v`` divided by its L2 norm. An all-zero input is returned as zeros
        rather than NaNs, because a zero vector has no direction to preserve.
    """
    v = np.asarray(v)
    norm = np.linalg.norm(v)
    if norm == 0:
        # Avoid 0/0 -> NaN (and the RuntimeWarning that comes with it).
        return np.zeros_like(v, dtype=np.result_type(v, norm))
    return v / norm

In [24]:
def compute_cosine_similarity(record):
    """Return the cosine similarity between a record's two answer embeddings.

    Args:
        record: Mapping with 'answer_orig' and 'answer_llm' entries; each value
            is passed to ``embedding_model.encode``.

    Returns:
        The dot product of the two embeddings after each has been passed
        through ``normalize``.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    reference_vec = embedding_model.encode(reference_text)

    generated_unit = normalize(generated_vec)
    reference_unit = normalize(reference_vec)
    return generated_unit.dot(reference_unit)

In [25]:
from tqdm.auto import tqdm

# One cosine-similarity score per row, in row order; tqdm reports progress.
evaluations_cosine = [
    compute_cosine_similarity(record)
    for record in tqdm(df.to_dict(orient='records'))
]

100%|██████████| 300/300 [00:19<00:00, 15.33it/s]


In [26]:
# `assign` returns a new DataFrame carrying the extra column instead of writing
# into `df` in place, which avoids SettingWithCopyWarning when `df` is a slice
# of another frame.
df = df.assign(cosine=evaluations_cosine)
df['cosine'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine'] = evaluations_cosine


count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine, dtype: float64

## Q4 Rouge

In [30]:
# Row at positional index 10; used as the single example for the ROUGE scores.
r = df.iloc[10]

In [31]:
from rouge import Rouge

rouge_scorer = Rouge()

# Score the LLM answer against the original answer for the selected row and
# keep the first entry of the result.
pair_scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])
scores = pair_scores[0]

In [33]:
# Display the ROUGE result for the example row.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5. Average rouge score

In [34]:
# Unweighted average of the 'f' value of every entry in `scores`.
np.average([metric['f'] for metric in scores.values()])

0.35490034990035496

## Q6. Average rouge score for all the data points

Reset `df` to the original data to use the entire dataset. Note: the recorded run below still shows 300 records being processed.

In [50]:
from tqdm.auto import tqdm

# Collect the 'rouge-2' entry for every row, in row order.
rouge_2s = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]['rouge-2']
    for record in tqdm(df.to_dict(orient='records'))
]

100%|██████████| 300/300 [00:00<00:00, 436.18it/s]


In [51]:
# Build a DataFrame from the collected per-row 'rouge-2' entries.
scores_df = pd.DataFrame(rouge_2s)

In [52]:
# Summary statistics of the ROUGE-2 columns.
scores_df.describe()

Unnamed: 0,r,p,f
count,300.0,300.0,300.0
mean,0.198613,0.258626,0.206965
std,0.164964,0.174559,0.15355
min,0.0,0.0,0.0
25%,0.074632,0.138093,0.097809
50%,0.159075,0.230769,0.178671
75%,0.260995,0.335366,0.286181
max,0.805556,1.0,0.73913
