In [1]:
import pandas as pd

In [2]:
# GitHub "blob" URL of the results CSV used throughout this notebook.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

In [3]:
# The `?raw=1` query string is appended so that the CSV content itself is
# requested rather than the GitHub blob page.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [4]:
# Keep only the first 300 records for the evaluation below.
df = df.head(300)

In [5]:
from sentence_transformers import SentenceTransformer

# Model identifier handed to SentenceTransformer; every embedding in this
# notebook is produced by the resulting `embedding_model`.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Embed the LLM-generated answer of the first record.
answer_llm = df['answer_llm'].iloc[0]
embeddings = embedding_model.encode(answer_llm)

In [7]:
# Take element 0 of `embeddings` and report it.
first_value = embeddings[0]
print(f"The first value of the resulting vector is: {first_value}")

The first value of the resulting vector is: -0.4224468767642975


In [8]:
import numpy as np

def compute_dot_product(emb1, emb2):
    """Return the dot product of two embeddings.

    Args:
        emb1: First operand, passed straight to ``np.dot``.
        emb2: Second operand, passed straight to ``np.dot``.

    Returns:
        Whatever ``np.dot(emb1, emb2)`` produces (a scalar for two 1-D
        vectors of equal length).
    """
    product = np.dot(emb1, emb2)
    return product

In [9]:
# Show which columns the dataframe provides.
df.columns

Index(['answer_llm', 'answer_orig', 'document', 'question', 'course'], dtype='object')

In [10]:
# For every record, embed the LLM answer and the original answer (in that
# order) and score the pair with the raw dot product.
evaluations = [
    compute_dot_product(
        embedding_model.encode(record.answer_llm),
        embedding_model.encode(record.answer_orig),
    )
    for record in df.itertuples(index=False)
]

percentile_75 = np.percentile(evaluations, q=75)
print(f"The 75th percentile of the scores is: {percentile_75}")

The 75th percentile of the scores is: 31.67430353164673


In [11]:
def normalize_vector(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (NumPy array, list, tuple, ...).

    Returns:
        np.ndarray: ``v`` divided by its L2 norm. An all-zero input has no
        direction, so it is returned as zeros rather than NaNs.
    """
    # asarray lets plain lists/tuples through; `v * v` on a list would raise.
    arr = np.asarray(v)
    norm = np.linalg.norm(arr)
    # A zero norm would make the division produce NaN (with a RuntimeWarning),
    # and a single NaN score turns aggregates such as np.percentile into NaN.
    if norm == 0:
        norm = 1.0
    return arr / norm

def compute_cosine_similarity(emb1, emb2):
    """Return the cosine similarity of two embeddings.

    Each input is first passed through ``normalize_vector``; the result is
    the dot product of the two normalized vectors.

    Args:
        emb1: First embedding.
        emb2: Second embedding.

    Returns:
        The value of ``np.dot`` applied to the normalized inputs.
    """
    return np.dot(normalize_vector(emb1), normalize_vector(emb2))

In [12]:
# Same per-record comparison as before, but scored with cosine similarity
# so the result does not depend on the magnitude of the embeddings.
evaluations = [
    compute_cosine_similarity(
        embedding_model.encode(record.answer_llm),
        embedding_model.encode(record.answer_orig),
    )
    for record in df.itertuples(index=False)
]

percentile_75 = np.percentile(evaluations, q=75)
print(f"The 75th percentile of the cosine similarity scores is: {percentile_75}")

The 75th percentile of the cosine similarity scores is: 0.8362348079681396


In [14]:
from rouge import Rouge

# Scorer object used for the ROUGE metrics below.
rouge_scorer = Rouge()

# Record at position 10 of the dataframe.
r = df.iloc[10]

# The LLM answer is passed first and the original answer second; the call
# returns a list, of which the first entry is used.
hypothesis, reference = r['answer_llm'], r['answer_orig']
scores = rouge_scorer.get_scores(hypothesis, reference)[0]

# 'f' is the F1 component of the ROUGE-1 entry.
rouge_1_entry = scores['rouge-1']
rouge_1_f1 = rouge_1_entry['f']

print(f"The F1 score for ROUGE-1 is: {rouge_1_f1}")

The F1 score for ROUGE-1 is: 0.45454544954545456


In [15]:
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

# Pull the F1 component out of each of the three ROUGE variants.
rouge_1_f1, rouge_2_f1, rouge_l_f1 = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

# Unweighted mean of the three F1 values.
average_rouge = (rouge_1_f1 + rouge_2_f1 + rouge_l_f1) / 3

print(f"ROUGE-1 F1: {rouge_1_f1}")
print(f"ROUGE-2 F1: {rouge_2_f1}")
print(f"ROUGE-L F1: {rouge_l_f1}")
print(f"Average ROUGE F1: {average_rouge}")

ROUGE-1 F1: 0.45454544954545456
ROUGE-2 F1: 0.21621621121621637
ROUGE-L F1: 0.393939388939394
Average ROUGE F1: 0.35490034990035496


In [17]:
rouge_scorer = Rouge()
rouge_scores = []

# One ROUGE evaluation per record: LLM answer first, original answer second.
for record in df.itertuples(index=False):
    scores = rouge_scorer.get_scores(record.answer_llm, record.answer_orig)[0]
    entry = {
        'rouge_1': scores['rouge-1']['f'],
        'rouge_2': scores['rouge-2']['f'],
        'rouge_l': scores['rouge-l']['f'],
    }
    # Unweighted mean of the three F1 values for this record.
    entry['rouge_avg'] = (entry['rouge_1'] + entry['rouge_2'] + entry['rouge_l']) / 3
    rouge_scores.append(entry)

# Build the frame once from the collected records.
rouge_df = pd.DataFrame(rouge_scores)

average_rouge_2 = rouge_df['rouge_2'].mean()

print(f"The average ROUGE-2 score across all records is: {average_rouge_2:.4f}")

The average ROUGE-2 score across all records is: 0.2070
