### Getting the data


In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np 
from rouge import Rouge


  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# GitHub page URL of the GPT-4o-mini results CSV; appending `?raw=1`
# makes GitHub serve the raw file instead of the HTML viewer.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = github_url + '?raw=1'

# Download the CSV and keep only the first 300 rows.
df = pd.read_csv(url).iloc[:300]


### Q1. Getting the embeddings model



In [7]:
# Load the sentence-embedding model by name.
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

# LLM-generated answer from the first row of the results frame.
answer_llm = df['answer_llm'].iloc[0]


You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [16]:
# Embed the first LLM answer.
embedding_vector = embedding_model.encode(answer_llm)

# Report the leading component of the embedding.
first_value = embedding_vector[0]
print(first_value)


-0.42244655


### Q2. Computing the dot product

31.67430591583252

In [34]:
# Encode every LLM answer and every original answer, one text per call,
# keeping the two lists aligned row-by-row with `df`.
embeddings_llm = list(map(embedding_model.encode, df['answer_llm']))
embeddings_orig = list(map(embedding_model.encode, df['answer_orig']))


In [35]:
# Dot product of each (LLM answer, original answer) embedding pair.
evaluations = [
    np.dot(emb_llm, emb_orig)
    for emb_llm, emb_orig in zip(embeddings_llm, embeddings_orig)
]


In [36]:
# 75th percentile of the dot-product scores computed above.
percentile_75 = np.percentile(evaluations, q=75)
print(percentile_75)

31.67430591583252


### Q3. Computing the cosine

0.8362348228693008

In [19]:
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        Vector to normalize. Lists and tuples are accepted and converted
        with ``np.asarray``; NumPy arrays are used as-is (no copy, no
        dtype change), so results for array inputs are unchanged.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by the square root of the sum of its squared
        elements. If that norm is exactly zero (all-zero or empty input),
        a floating-point zero vector of the same shape is returned instead
        of dividing by zero and producing NaNs.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero vector has no direction; returning zeros keeps downstream
        # dot products finite (0.0) rather than NaN.
        return v / 1.0
    return v / norm

In [42]:
# Cosine similarity of each (LLM answer, original answer) pair:
# normalize both embeddings to unit length, then take their dot product.
evaluations = [
    np.dot(normalize_vector(emb_llm), normalize_vector(emb_orig))
    for emb_llm, emb_orig in zip(embeddings_llm, embeddings_orig)
]

In [43]:
# 75th percentile of the cosine-similarity scores computed above.
percentile_75_cosine = np.percentile(evaluations, q=75)
print(percentile_75_cosine)

0.8362348228693008


### Q4. Rouge

0.45454544954545456

In [48]:
rouge_scorer = Rouge()

# All rows whose document id is '5170565b'. More than one row may match;
# as before, only the first match is scored.
r = df.loc[df['document'] == '5170565b']
first_match = r.iloc[0]

# Score the first matching row's LLM answer against its original answer.
# Plain strings are passed rather than whole pandas Series, so the single
# pair being scored is explicit.
score_q4 = rouge_scorer.get_scores(
    first_match['answer_llm'], first_match['answer_orig']
)[0]

# Extract the F1 score for ROUGE-1.
# This previously read from `scores`, a variable only defined by a later
# cell, which raised NameError on a fresh kernel (or silently reported a
# different row's score when cells were run out of order).
f1_score_rouge_1 = score_q4['rouge-1']['f']
print(f1_score_rouge_1)


0.45454544954545456


### Q5. Average rouge score


In [49]:
# Pull the F1 component of each ROUGE variant from the Q4 score dict.
rouge_1, rouge_2, rouge_l = (
    score_q4[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

# Unweighted mean of the three F1 scores.
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3
print(rouge_avg)

0.35490034990035496


### Q6. Average rouge score for all the data points


In [51]:
# Per-row F1 scores, one list per ROUGE variant.
rouge_1_f_scores, rouge_2_f_scores, rouge_l_f_scores = [], [], []

# Score every (LLM answer, original answer) pair in the frame.
for llm_answer, orig_answer in zip(df['answer_llm'], df['answer_orig']):
    scores = rouge_scorer.get_scores(llm_answer, orig_answer)[0]
    rouge_1_f_scores.append(scores['rouge-1']['f'])
    rouge_2_f_scores.append(scores['rouge-2']['f'])
    rouge_l_f_scores.append(scores['rouge-l']['f'])

# Mean F1 per ROUGE variant across all rows.
average_rouge_1_f = sum(rouge_1_f_scores) / len(rouge_1_f_scores)
average_rouge_2_f = sum(rouge_2_f_scores) / len(rouge_2_f_scores)
average_rouge_l_f = sum(rouge_l_f_scores) / len(rouge_l_f_scores)

# Unweighted mean of the three per-variant averages.
average_rouge = (average_rouge_1_f + average_rouge_2_f + average_rouge_l_f) / 3

for label, value in (
    ("Average ROUGE-1 F1", average_rouge_1_f),
    ("Average ROUGE-2 F1", average_rouge_2_f),
    ("Average ROUGE-L F1", average_rouge_l_f),
    ("Overall Average ROUGE score", average_rouge),
):
    print(f"{label}: {value}")

Average ROUGE-1 F1: 0.37884361657741583
Average ROUGE-2 F1: 0.20696501983423318
Average ROUGE-L F1: 0.35380746560786525
Overall Average ROUGE score: 0.3132053673398381
