In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

url = f'{github_url}?raw=1'
df = pd.read_csv(url)

In [3]:
df = df.iloc[:300]

In [4]:
model_name = 'multi-qa-mpnet-base-dot-v1'

In [5]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [6]:
answer_llm = df.iloc[0].answer_llm

In [7]:
v = embedding_model.encode(answer_llm)

In [8]:
v[0]

-0.42244655

In [14]:
evaluations = []

for i, r in tqdm(df.iterrows()):
    answer_llm = r.answer_llm
    answer_orig = r.answer_orig
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    evaluations.append(v_llm.dot(v_orig))

300it [02:25,  2.07it/s]


In [18]:
np.percentile(evaluations, 75)

31.67430877685547

In [19]:
def normalized(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [34]:
pairs = [(embedding_model.encode(answer_orig), embedding_model.encode(answer_llm)) for answer_orig, answer_llm in tqdm(zip(df['answer_orig'], df['answer_llm']))]

300it [02:24,  2.08it/s]


In [37]:
evaluations_n = []

for v_orig, v_llm in pairs:
    v_orig = normalized(v_orig)
    v_llm = normalized(v_llm)
    evaluations_n.append(v_orig.dot(v_llm))

In [38]:
np.percentile(evaluations_n, 75)

0.8362348973751068

In [39]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [44]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df.iloc[10]

print(r['document'])

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

5170565b


In [47]:
scores['rouge-1']['f']

0.45454544954545456

In [49]:
rouge_1 = scores['rouge-1']['f']
rouge_2 = scores['rouge-2']['f']
rouge_l = scores['rouge-l']['f']
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

In [50]:
rouge_avg

0.35490034990035496

In [51]:
rouge_1 = []
rouge_2 = []
rouge_l = []

for i, r in tqdm(df.iterrows()):
    rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

    rouge_1.append( scores['rouge-1']['f'] )
    rouge_2.append( scores['rouge-2']['f'] )
    rouge_l.append( scores['rouge-l']['f'] )

300it [00:00, 325.61it/s]


In [60]:
np.average(rouge_2)

0.21621621121621634