In [59]:
import pandas as pd
import requests 
from tqdm.auto import tqdm
import numpy as np

In [2]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'
url = f'{base_url}/{relative_url}?raw=1'

In [3]:
df = pd.read_csv(url)

In [4]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [6]:
from sentence_transformers import SentenceTransformer
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
answer_llm = df.iloc[0].answer_llm
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

<b> Q1. Getting the embeddings model

In [10]:
embedding_model.encode(answer_llm)[:4]

array([-0.42244673, -0.22485629, -0.32405835, -0.2847585 ], dtype=float32)

<b> Q2. Computing the dot product

In [42]:
results_pair = df[['answer_llm','answer_orig']].to_dict(orient='records')

In [66]:
def compute_similarity(record, model):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = model.encode(answer_llm)
    v_orig = model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [67]:
similarity = []
for record in tqdm(results_pair):
    sim = compute_similarity(record, embedding_model)
    similarity.append(sim)

100%|██████████████████████████████████████████████████████████████████████████████| 1830/1830 [10:05<00:00,  3.02it/s]


In [68]:
df['cosine'] = similarity
df['cosine'].describe()

count    1830.000000
mean       28.015772
std         6.413295
min         3.511811
25%        24.631171
50%        28.897566
75%        32.389799
max        44.296772
Name: cosine, dtype: float64

In [69]:
def normalize_vec(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [70]:
def compute_cosine_normalized(record, model):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = normalize_vec(model.encode(answer_llm))
    v_orig = normalize_vec(model.encode(answer_orig))
    return v_llm.dot(v_orig)

In [71]:
similarity_norm = []

for record in tqdm(results_pair):
    sim = compute_cosine_normalized(record, embedding_model)

    similarity_norm.append(sim)


100%|██████████████████████████████████████████████████████████████████████████████| 1830/1830 [09:43<00:00,  3.13it/s]


In [72]:
df['cosine_norm'] = similarity_norm
df['cosine_norm'].describe()

count    1830.000000
mean        0.738963
std         0.156405
min         0.090286
25%         0.655194
50%         0.772145
75%         0.853112
max         0.993684
Name: cosine_norm, dtype: float64

<b> Q4. Rouge

In [73]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [97]:
df.iloc[11]

answer_llm     Yes, you can ask your questions in advance if ...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question       Can I ask questions in advance if I can't atte...
course                                 machine-learning-zoomcamp
cosine_norm                                             0.783566
cosine                                                 31.441833
Name: 11, dtype: object

In [86]:
answer_llm_10 = df.iloc[11]['answer_llm']
answer_orig_10 = df.iloc[11]['answer_orig']

In [87]:
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(df.iloc[11]['answer_llm'], df.iloc[11]['answer_orig'])[0]

There are three scores: rouge-1, rouge-2 and rouge-l, and precision, recall and F1 score for each.
<br>rouge-1 - the overlap of unigrams,
<br>rouge-2 - bigrams,
<br>rouge-l - the longest common subsequence
<br>https://klu.ai/glossary/rouge-score
<br>https://medium.com/@sthanikamsanthosh1994/understanding-bleu-and-rouge-score-for-nlp-evaluation-1ab334ecadcb

In [88]:
scores

{'rouge-1': {'r': 0.6060606060606061,
  'p': 0.6060606060606061,
  'f': 0.6060606010606061},
 'rouge-2': {'r': 0.43243243243243246,
  'p': 0.41025641025641024,
  'f': 0.42105262658241},
 'rouge-l': {'r': 0.5757575757575758,
  'p': 0.5757575757575758,
  'f': 0.5757575707575758}}

In [90]:
#What's the F score for rouge-1?
scores['rouge-1']['f']

0.6060606010606061

<b> Q5. Average rouge score

In [96]:
av_score = (scores['rouge-1']['f'] + scores['rouge-2']['f'] + scores['rouge-l']['f'])/3
av_score

0.5342902661335307

<b> Q6. Average rouge score for all the data points

In [102]:
scores

{'rouge-1': {'r': 0.061224489795918366,
  'p': 0.21428571428571427,
  'f': 0.09523809178130524},
 'rouge-2': {'r': 0.017543859649122806,
  'p': 0.07142857142857142,
  'f': 0.028169010918468917},
 'rouge-l': {'r': 0.061224489795918366,
  'p': 0.21428571428571427,
  'f': 0.09523809178130524}}

In [105]:
total_scores = {key: {'r': 0, 'p': 0, 'f': 0} for key in scores}
total_scores

{'rouge-1': {'r': 0, 'p': 0, 'f': 0},
 'rouge-2': {'r': 0, 'p': 0, 'f': 0},
 'rouge-l': {'r': 0, 'p': 0, 'f': 0}}

In [106]:
for record in tqdm(results_pair):   
    scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    total_scores['rouge-1']['f'] += scores['rouge-1']['f']
    total_scores['rouge-2']['f'] += scores['rouge-2']['f']
    total_scores['rouge-l']['f'] += scores['rouge-l']['f']


100%|█████████████████████████████████████████████████████████████████████████████| 1830/1830 [00:05<00:00, 313.10it/s]


In [107]:
total_scores

{'rouge-1': {'r': 0, 'p': 0, 'f': 643.6012007368518},
 'rouge-2': {'r': 0, 'p': 0, 'f': 323.39219597819846},
 'rouge-l': {'r': 0, 'p': 0, 'f': 599.4817512725124}}

In [108]:
#What's the agerage rouge_2 across all the records?
rouge_2_av = total_scores['rouge-2']['f']/len(results_pair)
rouge_2_av

0.1767170469826221