# Homework

https://github.com/DataTalksClub/llm-zoomcamp/blob/main/cohorts/2024/04-monitoring/homework.md

In [24]:
# Load libraries
import numpy as np
import pandas as pd
from rouge import Rouge # pip install rouge
from sentence_transformers import SentenceTransformer # pip install sentence-transformers

In [2]:
# Get data: download the evaluation results CSV and keep the first 300 rows.
url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv?raw=1'
df = pd.read_csv(url).head(300)

# Preview five random rows (unseeded, so the rows shown differ between runs).
df.sample(5)

Unnamed: 0,answer_llm,answer_orig,document,question,course
263,"Yes, performing the inverse of X twice in your...","It’s possible that when you follow the videos,...",6f3bdd20,Can performing the inverse of X twice in my co...,machine-learning-zoomcamp
252,You can find the Week 2 homework for the cours...,Here are the crucial links for this Week 2 tha...,50d737e7,Where can I find the Week 2 homework for the c...,machine-learning-zoomcamp
15,The course will cover the bare minimum of theo...,The bare minimum. The focus is more on practic...,ecca790c,How much theoretical content is there in the c...,machine-learning-zoomcamp
107,"No, it is not possible to earn more than 7 poi...",When you post about what you learned from the ...,f7bc2f65,Is it possible to earn more than 7 points for ...,machine-learning-zoomcamp
228,"For scalar multiplication in numpy, you can us...","Note, that matrix multiplication (matrix-matri...",735e6c78,What functions can be used for scalar multipli...,machine-learning-zoomcamp


## Q1. Getting the embeddings model

In [4]:
# Sentence-embedding model used for every similarity metric in this notebook.
model_name = 'multi-qa-mpnet-base-dot-v1'

# NOTE(review): cache_folder is a machine-specific absolute path; adjust it
# when running this notebook on another machine.
embedding_model = SentenceTransformer(
    model_name,
    cache_folder='D:/.cache/',
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# LLM-generated answer from the first row of the dataframe.
answer_llm = df['answer_llm'].iloc[0]
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [7]:
# Embed the first LLM answer with the sentence-transformers model loaded above.
embeddings = embedding_model.encode(answer_llm)

In [11]:
# Q1. First value of the embedding vector.
# Print the shape first as a sanity check on the embedding dimensionality,
# then show the first component (the Q1 answer) as the cell output.
print(embeddings.shape)
embeddings[0]

(768,)


-0.4224466

## Q2. Computing the dot product

In [13]:
# Dot product between the embedding of each LLM answer and the embedding of
# its original (reference) answer, one score per dataframe row.
evaluations = [
    embedding_model.encode(llm_text).dot(embedding_model.encode(orig_text))
    for llm_text, orig_text in zip(df['answer_llm'], df['answer_orig'])
]

In [18]:
# Q2. Dot product: 75th percentile
# Summarise the per-row dot products and read off the 75% row.
dot_product_stats = pd.DataFrame(evaluations, columns=['dot_product']).describe()
dot_product_stats.loc['75%']

dot_product    31.674313
Name: 75%, dtype: float64

## Q3. Computing the cosine

In [21]:
def normalize(vector: np.ndarray) -> np.ndarray:
    """Scale ``vector`` to unit Euclidean (L2) length.

    Parameters
    ----------
    vector : np.ndarray
        Numeric array to normalise. The norm is computed over all elements.

    Returns
    -------
    np.ndarray
        ``vector`` divided by its L2 norm. If the norm is zero, an all-zero
        array of the same shape is returned instead, so callers get a
        similarity of 0 rather than NaN.
    """
    norm = np.sqrt((vector * vector).sum())
    if norm == 0:
        # A zero vector has no direction; dividing would yield NaNs (0/0)
        # and a RuntimeWarning, which would silently poison later statistics.
        return np.zeros_like(vector)
    return vector / norm

In [22]:
# Cosine similarity per row: dot product of the unit-normalised embeddings of
# the LLM answer and the original (reference) answer.
evaluations_norm = [
    normalize(embedding_model.encode(llm_text)).dot(
        normalize(embedding_model.encode(orig_text))
    )
    for llm_text, orig_text in zip(df['answer_llm'], df['answer_orig'])
]

In [23]:
# Q3. Cosine 75th percentile
# Summarise the per-row cosine similarities and read off the 75% row.
cos_similarity_stats = pd.DataFrame(evaluations_norm, columns=['cos_similarity']).describe()
cos_similarity_stats.loc['75%']

cos_similarity    0.836235
Name: 75%, dtype: float64

## Q4. Rouge
https://github.com/DataTalksClub/llm-zoomcamp/blob/main/cohorts/2024/04-monitoring/homework.md#q4-rouge

In [49]:
rouge_scorer = Rouge()

# All rows that belong to document 5170565b.
r = df.loc[df['document'] == '5170565b']

# Score every (LLM answer, original answer) pair of that document, then keep
# the first result, which corresponds to the row with index 10.
document_scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])
scores = document_scores[0]

In [50]:
# Show the rows selected for document 5170565b.
r

Unnamed: 0,answer_llm,answer_orig,document,question,course
10,"Yes, all sessions are recorded, so if you miss...","Everything is recorded, so you won’t miss anyt...",5170565b,Are sessions recorded if I miss one?,machine-learning-zoomcamp
11,"Yes, you can ask your questions in advance if ...","Everything is recorded, so you won’t miss anyt...",5170565b,Can I ask questions in advance if I can't atte...,machine-learning-zoomcamp
12,"If you miss a session, don't worry! Everything...","Everything is recorded, so you won’t miss anyt...",5170565b,How will my questions be addressed if I miss a...,machine-learning-zoomcamp
13,"Yes, there is a way to catch up on a missed se...","Everything is recorded, so you won’t miss anyt...",5170565b,Is there a way to catch up on a missed session?,machine-learning-zoomcamp
14,"Yes, you can still interact with instructors a...","Everything is recorded, so you won’t miss anyt...",5170565b,Can I still interact with instructors after mi...,machine-learning-zoomcamp


In [51]:
scores  # ROUGE scores for the row with index 10

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [52]:
# Q4. ROUGE-1 F-score for the row with index 10, rounded to 4 decimals.
rouge_1_f = scores['rouge-1']['f']
round(rouge_1_f, 4)

0.4545

## Q5. Average rouge score

In [58]:
# Q5. Average Rouge
f_scores = [r['f'] for r in scores.values()]
round(np.mean(f_scores), 4)

0.3549

## Q6. Average rouge score for all the data points

In [68]:
# Q6. Average Rouge 2
all_rouge_vals = rouge_scorer.get_scores(df['answer_llm'], df['answer_orig'])
round(pd.DataFrame([r['rouge-2'] for r in all_rouge_vals]).describe().loc['mean', 'f'], 4)

0.207