In [1]:
import pandas as pd

# Location of the evaluation results CSV; '?raw=1' is appended to the page URL
# when the file is requested.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
df = pd.read_csv(github_url + '?raw=1')

# Only the first 300 records are used by the cells that follow.
df_300 = df.iloc[:300]

# Preview the first three rows of the full frame.
df.head(3)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp


In [2]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by model_name.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

# Embed the first LLM answer of the 300-record sample.
embedding_vector = embedding_model.encode(df_300['answer_llm'].iat[0])

# Show the first component of that embedding.
embedding_vector[0]

-0.42244658

In [3]:
import numpy as np
from tqdm.auto import tqdm

def normalize_vector(v):
    """Scale a vector to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        The vector to normalize. It is converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        ``v / ||v||``. When the norm is zero, a zero vector with the same
        shape as ``v`` is returned.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Return a vector, not the scalar 0: np.dot(0, other) would yield an
        # array instead of a scalar score, whereas np.dot(zeros, other) is 0.
        return np.zeros_like(v)
    return v / norm

def compute_similarity_percentile(df, normalize=False, percentile=75, model=None):
    """Return a percentile of the dot-product similarity between answer pairs.

    For every row, the ``answer_llm`` and ``answer_orig`` texts are embedded
    and scored with a dot product; the requested percentile of those scores
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``answer_llm`` and ``answer_orig``.
    normalize : bool, default False
        When true, each embedding is passed through ``normalize_vector``
        before the dot product is taken.
    percentile : float, default 75
        Percentile forwarded to ``np.percentile``.
    model : object, optional
        Any object exposing ``encode(text)``. Defaults to the notebook-level
        ``embedding_model``.

    Returns
    -------
    The result of ``np.percentile`` over the per-row scores.
    """
    if model is None:
        model = embedding_model

    answer_pairs = zip(df['answer_llm'], df['answer_orig'])
    evaluations = []

    # zip() has no length, so pass total= explicitly to let tqdm draw a
    # proper progress bar with an ETA instead of a bare iteration counter.
    for answer_llm, answer_orig in tqdm(answer_pairs, total=len(df)):
        embedding_llm = model.encode(answer_llm)
        embedding_orig = model.encode(answer_orig)

        if normalize:
            embedding_llm = normalize_vector(embedding_llm)
            embedding_orig = normalize_vector(embedding_orig)

        evaluations.append(np.dot(embedding_llm, embedding_orig))

    return np.percentile(evaluations, percentile)

In [5]:
# 75th percentile of the raw (unnormalized) dot-product scores.
compute_similarity_percentile(df_300, normalize=False, percentile=75)

300it [02:18,  2.16it/s]


31.674312114715576

In [6]:
# 75th percentile of the scores computed on normalized embeddings.
compute_similarity_percentile(df_300, normalize=True, percentile=75)

300it [02:21,  2.12it/s]


0.8362347930669785

In [8]:
from rouge import Rouge

rouge_scorer = Rouge()

# First record in the sample whose document id is 5170565b.
r = df_300.loc[df_300['document'] == '5170565b'].iloc[0]

# Score the LLM answer against the original answer; keep entry 0 of the result.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

# Pull the 'f' value out of each ROUGE variant.
rouge_1_f1, rouge_2_f1, rouge_l_f1 = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

print("The F1 score for ROUGE-1 is:", rouge_1_f1)

# Plain arithmetic mean of the three values.
average_f1_score = (rouge_1_f1 + rouge_2_f1 + rouge_l_f1) / 3

print("The average F1 score between ROUGE-1, ROUGE-2, and ROUGE-L is:", average_f1_score)

The F1 score for ROUGE-1 is: 0.45454544954545456
The average F1 score between ROUGE-1, ROUGE-2, and ROUGE-L is: 0.35490034990035496


In [9]:
import pandas as pd
from rouge import Rouge

# Scorer instance that compute_rouge_scores reads as a global.
rouge_scorer = Rouge()

def compute_rouge_scores(row):
    """Score one record's LLM answer against its original answer.

    ``row`` is a mapping with the keys ``answer_llm``, ``answer_orig`` and
    ``document``. The result is a dict holding the document id together with
    the 'f' value of the ROUGE-1, ROUGE-2 and ROUGE-L scores.
    """
    pair_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]

    result = {'document': row['document']}
    for column, metric in (
        ('rouge_1_f1', 'rouge-1'),
        ('rouge_2_f1', 'rouge-2'),
        ('rouge_l_f1', 'rouge-l'),
    ):
        result[column] = pair_scores[metric]['f']
    return result

# Score every record of the sample and gather the results into one frame.
scores_df = pd.DataFrame(
    [compute_rouge_scores(record) for record in df_300.to_dict('records')]
)

# Mean of the ROUGE-2 column over all scored records.
average_rouge_2_f1 = scores_df['rouge_2_f1'].mean()

print("The average ROUGE-2 F1 score across all records is:", average_rouge_2_f1)

The average ROUGE-2 F1 score across all records is: 0.20696501983423318
