In [None]:
# Install sentence-transformers (provides SentenceTransformer and util used below).
# NOTE(review): the version is unpinned, so results may differ between runs — consider pinning.
%pip install sentence-transformers

In [3]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Zero-shot baseline: semantic similarity between each zero-shot generation
# and the review stored on the same row of the predictions file.
df = pd.read_csv("zero_shot_predictions.csv")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Both columns come from the same frame, so row i of one tensor corresponds
# to row i of the other.
emb_review = model.encode(df["review"].tolist(), convert_to_tensor=True)
emb_output = model.encode(df["zero_shot_review"].tolist(), convert_to_tensor=True)

# Only the matched (output_i, review_i) pairs are needed. Score them directly
# instead of building the full N x N similarity matrix and keeping its diagonal.
cos_scores = util.pairwise_cos_sim(emb_output, emb_review).cpu().numpy()
print(f"[ZS vs REVIEW] Mean Cosine Similarity: {np.mean(cos_scores):.4f}")

[ZS vs REVIEW] Mean Cosine Similarity: 0.6944


In [None]:

# 1. Fine-tuned output vs. original human review
# How closely the model imitates the style and content of the original reviews.
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Fine-tuned generations compared with the review stored on the same row.
df = pd.read_csv("finetuned_predictions.csv")
# Loaded once here; the later sections of this cell can use the same encoder.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Both columns come from the same frame, so the two tensors are row-aligned.
emb_review = model.encode(df["review"].tolist(), convert_to_tensor=True)
emb_output = model.encode(df["generated_review"].tolist(), convert_to_tensor=True)

# Score only the matched pairs; the full N x N matrix is never needed here,
# since only its diagonal was being kept.
cos_scores = util.pairwise_cos_sim(emb_output, emb_review).cpu().numpy()
print(f"[FT vs REVIEW] Mean Cosine Similarity: {np.mean(cos_scores):.4f}")


# 2. Zero-shot output vs. original review
# How well the base model already generalizes without any training.
df = pd.read_csv("zero_shot_predictions.csv")

# `model` is the all-MiniLM-L6-v2 encoder already loaded earlier in this cell.
# Reloading the same checkpoint would only cost time, so it is reused.
emb_review = model.encode(df["review"].tolist(), convert_to_tensor=True)
emb_output = model.encode(df["zero_shot_review"].tolist(), convert_to_tensor=True)

# Score only the matched (output_i, review_i) pairs rather than computing the
# full N x N similarity matrix and discarding everything off the diagonal.
cos_scores = util.pairwise_cos_sim(emb_output, emb_review).cpu().numpy()
print(f"[ZS vs REVIEW] Mean Cosine Similarity: {np.mean(cos_scores):.4f}")


# 3. Fine-tuned output vs. structured review (i.e., the true target)
# How well the model learned the task it was fine-tuned for.

df = pd.read_csv("train_structured.csv")  # contains structured_review
df_pred = pd.read_csv("finetuned_predictions.csv")  # contains generated_review

# Merge to align outputs with targets
# NOTE(review): this is pandas' default inner join on "title". Titles missing
# from either file are dropped, and a title repeated in both files produces every
# matching combination, which would skew the mean — confirm titles are unique.
merged = pd.merge(df_pred, df[["title", "structured_review"]], on="title")

# `model` is the all-MiniLM-L6-v2 encoder already loaded earlier in this cell.
# Reloading the same checkpoint would only cost time, so it is reused.
emb_target = model.encode(merged["structured_review"].tolist(), convert_to_tensor=True)
emb_output = model.encode(merged["generated_review"].tolist(), convert_to_tensor=True)

# Both tensors come from the same merged rows, so score pair-by-pair instead
# of computing the full N x N matrix and keeping only its diagonal.
cos_scores = util.pairwise_cos_sim(emb_output, emb_target).cpu().numpy()
print(f"[FT vs STRUCTURED] Mean Cosine Similarity: {np.mean(cos_scores):.4f}")

# 4. Fine-tuned vs. zero-shot output (same input, different model)
# How much fine-tuning changed the style/output relative to the base model.

df_ft = pd.read_csv("finetuned_predictions.csv")
df_zs = pd.read_csv("zero_shot_predictions.csv")

# NOTE(review): default inner join on "title". Any other column present in both
# files gets _x/_y suffixes, and repeated titles multiply rows — confirm that
# titles are unique and that only "title" is meant to be shared.
merged = pd.merge(df_ft, df_zs, on="title")  # align predictions

# `model` is the all-MiniLM-L6-v2 encoder already loaded earlier in this cell.
# Reloading the same checkpoint would only cost time, so it is reused.
emb_zs = model.encode(merged["zero_shot_review"].tolist(), convert_to_tensor=True)
emb_ft = model.encode(merged["generated_review"].tolist(), convert_to_tensor=True)

# Both tensors come from the same merged rows, so score pair-by-pair instead
# of computing the full N x N matrix and keeping only its diagonal.
cos_scores = util.pairwise_cos_sim(emb_ft, emb_zs).cpu().numpy()
print(f"[FT vs ZS] Mean Cosine Similarity: {np.mean(cos_scores):.4f}")


| **Confronto**                             | **Interpretazione dei valori**                                                                                                                                                        |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `generated_review` vs `review`            | **0.6–0.7**: buona fedeltà allo **stile e contenuto umano**, anche se non erano il target diretto. <br>**>0.7**: possibile overfitting stilistico. <br>**<0.5**: troppo diverso.      |
| `generated_review` vs `structured_review` | ✅ Questo è il **vero obiettivo del fine-tuning**. <br>**>0.8** = ottimo apprendimento.<br>**0.6–0.8** = sufficiente. <br>**<0.6** = segno che il modello non ha appreso bene il task. |
| `zero_shot_review` vs `review`            | Se il valore è **simile a quello di `generated_review` vs `review`**, il base model svolge già bene il compito. <br>**>0.7**: base model forte. <br>**\~0.5**: base model debole, quindi il fine-tuning può portare valore. |
| `generated_review` vs `zero_shot_review`  | **Alta similarità (>0.8)** = il fine-tuning ha cambiato poco lo stile. <br>**Moderata (\~0.6)** = il modello ha raffinato la risposta. <br>**Bassa (<0.4)** = ha cambiato approccio.  |
