In [1]:
import pandas as pd

In [2]:
# Load the results CSV; the path is relative to the notebook's working directory.
results_csv_path = '../results-gpt4o-mini.csv'
df_results_gpt_4o_mini = pd.read_csv(results_csv_path)

In [3]:
# Keep only the first 300 rows (positional slice) for the evaluation below.
df_results = df_results_gpt_4o_mini.iloc[:300]

In [4]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [5]:
# Name of the sentence-transformers model used for all embeddings below.
model_name = 'multi-qa-mpnet-base-dot-v1'
model = SentenceTransformer(model_name_or_path=model_name)

In [6]:
# LLM-generated answer from the first row (by position) of the evaluation set.
answer_llm = df_results['answer_llm'].iloc[0]

In [7]:
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [8]:
# Encode the single example answer string with the sentence-transformer model.
v = model.encode(answer_llm)

In [9]:
v[0]

-0.42244658

In [10]:
import numpy as np

In [11]:
# Dot product between the embedding of each LLM answer and the embedding of
# the corresponding original answer: one score per row of df_results.
evaluations = []

for llm_text, orig_text in zip(df_results['answer_llm'], df_results['answer_orig']):
    llm_vec = model.encode(llm_text)
    orig_vec = model.encode(orig_text)
    evaluations.append(np.dot(llm_vec, orig_vec))

In [12]:
# 75th percentile of the per-row dot-product scores.
percentile_75 = np.percentile(evaluations, q=75)

In [13]:
percentile_75

31.67430353164673

In [15]:
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (for example an embedding vector). Plain
            Python sequences are converted to a NumPy array first.

    Returns:
        A NumPy array equal to ``v`` divided by its L2 norm. If the norm is
        zero, the all-zero array is returned unchanged so callers receive
        zeros instead of NaNs.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())

    # A zero vector has no direction; dividing by its norm would produce NaNs
    # (and a RuntimeWarning) that propagate into downstream statistics.
    if norm == 0:
        return v

    return v / norm

In [16]:
# Cosine similarity per row: both embeddings are scaled with normalize_vector
# before taking their dot product.
evaluations = []

for llm_text, orig_text in zip(df_results['answer_llm'], df_results['answer_orig']):
    llm_vec = model.encode(llm_text)
    orig_vec = model.encode(orig_text)
    evaluations.append(
        np.dot(normalize_vector(llm_vec), normalize_vector(orig_vec))
    )

In [17]:
# 75th percentile of the per-row cosine-similarity scores.
percentile_75 = np.percentile(evaluations, q=75)

In [18]:
percentile_75

0.8362347632646561

In [19]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [20]:
from rouge import Rouge

In [21]:
# Single Rouge scorer instance, reused for every ROUGE computation below.
rouge_scorer = Rouge()

rouge_scorer

<rouge.rouge.Rouge at 0x3cee1f3b0>

In [32]:
# Row at position 10, used as a single worked example for the ROUGE scores.
r = df_results.iloc[10]

In [33]:
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [34]:
# Hypothesis (LLM answer) first, reference (original answer) second.
# get_scores returns a list; its first entry is a dict keyed by metric name.
pair_scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])
scores = pair_scores[0]

In [35]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [36]:
# 'f' entry (F-score) of the ROUGE-1 metric for the example row.
f_score_rouge_1 = scores['rouge-1']['f']

In [37]:
f_score_rouge_1

0.45454544954545456

In [42]:
# Pull the 'f' (F-score) entry out of each of the three ROUGE metrics.
rouge_1, rouge_2, rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

In [43]:
# Unweighted mean of the three ROUGE F-scores for the example row.
rouge_total = rouge_1 + rouge_2 + rouge_l
rouge_avg = rouge_total / 3

In [44]:
rouge_avg

0.35490034990035496

In [46]:
# Per-row ROUGE F-scores for every (LLM answer, original answer) pair in
# df_results, plus the unweighted mean of the three metrics.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
rouge_avg_scores = []

for _, row in df_results.iterrows():
    # Loop-local names carry a `row_` prefix so this cell does not overwrite
    # the single-example `scores`, `rouge_1`, `rouge_2`, `rouge_l` and
    # `rouge_avg` variables computed earlier in the notebook.
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]

    row_rouge_1 = row_scores['rouge-1']['f']
    row_rouge_2 = row_scores['rouge-2']['f']
    row_rouge_l = row_scores['rouge-l']['f']

    row_rouge_avg = (row_rouge_1 + row_rouge_2 + row_rouge_l) / 3

    rouge_1_scores.append(row_rouge_1)
    rouge_2_scores.append(row_rouge_2)
    rouge_l_scores.append(row_rouge_l)
    rouge_avg_scores.append(row_rouge_avg)

In [48]:
# Collect the per-row ROUGE F-scores into one table, one column per metric.
rouge_columns = {
    'rouge_1': rouge_1_scores,
    'rouge_2': rouge_2_scores,
    'rouge_l': rouge_l_scores,
    'rouge_avg': rouge_avg_scores,
}
df_rouge_scores = pd.DataFrame(rouge_columns)

In [52]:
# Mean ROUGE-2 F-score across all evaluated rows.
avg_rouge_2 = df_rouge_scores.loc[:, 'rouge_2'].mean()

In [53]:
avg_rouge_2

0.20696501983423318