# Evaluation scores calculation

In [None]:
%pip install datasets evaluate rouge_score sacrebleu sentence_transformers -q

In [None]:
from datasets import load_dataset
# Load the PHI-3 response dataset; later cells read columns from its 'train' split.
ds = load_dataset("romikgosai/academicMCQ-phi3-response")

In [None]:
# Echo the dataset object to inspect what was loaded.
ds

## ROUGE

In [None]:
from datasets import load_dataset
import evaluate

# Fetch the response dataset and the ROUGE scorer
ds = load_dataset("romikgosai/academicMCQ-phi3-response")
rouge = evaluate.load('rouge')

# Pull the reference texts and the three sets of model responses out of the
# train split in one pass over the column names.
(
    references,
    finetuned_predictions,
    not_finetuned_oneshot_predictions,
    non_finetuned_zeroshot_predictions,
) = (
    ds['train'][column]
    for column in (
        'target',
        'finetuned_response',
        'one_shot_response',
        'zero_shot_response',
    )
)

# Define a function to compute ROUGE metrics
def compute_rouge(predictions, references):
    """Score `predictions` against `references` with the module-level `rouge` metric.

    Prints ROUGE-1, ROUGE-2 and ROUGE-L to four decimal places, then prints
    the complete result mapping, and finally returns that mapping.
    """
    scores = rouge.compute(predictions=predictions, references=references)

    # (output template, key in the result mapping), in display order
    report_lines = (
        ("ROUGE-1: {:.4f}", 'rouge1'),
        ("ROUGE-2: {:.4f}", 'rouge2'),
        ("ROUGE-L: {:.4f}", 'rougeL'),
    )
    for template, key in report_lines:
        print(template.format(scores[key]))

    print(scores)
    return scores

# Score each response set against the same references. Every result is kept in
# its own variable: zero-shot and one-shot scores used to share one name, so
# the zero-shot scores were overwritten as soon as the one-shot run finished.

# Compute ROUGE metrics for fine-tuned responses
print("Metrics for Fine-tuned PHI-3 Responses (ROUGE):")
rogue_finetuned = compute_rouge(finetuned_predictions, references)

# Compute ROUGE metrics for not fine-tuned zero shot responses
print("\nMetrics for Not Fine-tuned zero shot PHI-3 Responses (ROUGE):")
rogue_not_finetuned_zeroshot = compute_rouge(non_finetuned_zeroshot_predictions, references)

# Compute ROUGE metrics for not fine-tuned one shot responses
print("\nMetrics for Not Fine-tuned one shot PHI-3 Responses (ROUGE):")
rogue_not_finetuned_oneshot = compute_rouge(not_finetuned_oneshot_predictions, references)

# Backward-compatible alias: this name previously ended up holding the
# one-shot scores, so keep it pointing at them.
rogue_not_finetuned = rogue_not_finetuned_oneshot


## BLEU

In [None]:
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
import nltk

# Download NLTK tokenizer data. The BLEU computation below tokenizes with
# str.split() and does not need it, but later cells call nltk.word_tokenize.
nltk.download('punkt')

# `ds` was loaded in an earlier cell; the responses live in its 'train' split.
test_dataset = ds['train']

# Per-example BLEU scores for each kind of response
finetuned_bleu_scores = []
non_finetuned_zeroshot_bleu_scores = []
non_finetuned_oneshot_bleu_scores = []

# Iterate over the rows directly so every example is fetched once instead of
# being re-indexed for each field.
# NOTE(review): sentence_bleu is called without a smoothing_function, exactly
# as before, to keep the reported numbers unchanged; presumably short answers
# with no higher-order n-gram overlap score (close to) zero - confirm whether
# smoothing is wanted.
for example in test_dataset:
    reference = [example['target'].split()]  # whitespace-tokenized reference answer

    # Finetuned response
    finetuned_bleu_scores.append(
        sentence_bleu(reference, example['finetuned_response'].split())
    )

    # Non-finetuned zero shot response
    non_finetuned_zeroshot_bleu_scores.append(
        sentence_bleu(reference, example['zero_shot_response'].split())
    )

    # Non-finetuned one shot response
    non_finetuned_oneshot_bleu_scores.append(
        sentence_bleu(reference, example['one_shot_response'].split())
    )

# Calculate the average BLEU scores (NaN instead of ZeroDivisionError when the
# dataset is empty)
average_finetuned_bleu = (
    sum(finetuned_bleu_scores) / len(finetuned_bleu_scores)
    if finetuned_bleu_scores else float('nan')
)
average_non_finetuned_zeroshot_bleu = (
    sum(non_finetuned_zeroshot_bleu_scores) / len(non_finetuned_zeroshot_bleu_scores)
    if non_finetuned_zeroshot_bleu_scores else float('nan')
)
average_non_finetuned_oneshot_bleu = (
    sum(non_finetuned_oneshot_bleu_scores) / len(non_finetuned_oneshot_bleu_scores)
    if non_finetuned_oneshot_bleu_scores else float('nan')
)
print(f"Average BLEU score for finetuned responses: {average_finetuned_bleu:.4f}")
print(f"Average BLEU score for non-finetuned zero shot responses: {average_non_finetuned_zeroshot_bleu:.4f}")
print(f"Average BLEU score for non-finetuned one shot responses: {average_non_finetuned_oneshot_bleu:.4f}")

In [None]:
# Re-display the three BLEU averages, one per line, with a single print call.
print(
    f"Average BLEU score for finetuned responses: {average_finetuned_bleu:.4f}",
    f"Average BLEU score for non-finetuned zero shot responses: {average_non_finetuned_zeroshot_bleu:.4f}",
    f"Average BLEU score for non-finetuned one shot responses: {average_non_finetuned_oneshot_bleu:.4f}",
    sep="\n",
)

## Cosine Similarity

In [None]:
from datasets import load_dataset
from gensim import downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk

# get_sentence_embedding() below relies on nltk.word_tokenize, which needs the
# Punkt tokenizer data. Fetch it here so this cell does not depend on another
# cell having been run first. NLTK >= 3.9 looks up 'punkt_tab' instead of
# 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Reuse the dataset loaded earlier in the notebook
test_dataset = ds['train']

# Load the Word2Vec model
model_vec = api.load('word2vec-google-news-300')

def get_sentence_embedding(sentence, model):
    """Return the mean of `model`'s vectors over the tokens of `sentence`.

    The sentence is lower-cased and split with nltk.word_tokenize. Tokens
    that are not in `model` are skipped. When no token is found in `model`,
    a zero vector of length `model.vector_size` is returned instead.
    """
    tokens = nltk.word_tokenize(sentence.lower())

    known_vectors = []
    for token in tokens:
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros((model.vector_size,))
    return np.mean(known_vectors, axis=0)

# Set batch size
batch_size = 100
finetuned_similarities = []
non_finetuned_zeroshot_similarities = []
non_finetuned_oneshot_similarities = []

# Read every column once, up front. Indexing test_dataset[<column>] inside the
# batch loop would fetch the whole column again for every batch only to slice
# a small window out of it.
reference_column = test_dataset['target']

# (response column, list that collects its per-example similarities)
response_columns = [
    (test_dataset['finetuned_response'], finetuned_similarities),
    (test_dataset['zero_shot_response'], non_finetuned_zeroshot_similarities),
    (test_dataset['one_shot_response'], non_finetuned_oneshot_similarities),
]

for start_idx in range(0, len(test_dataset), batch_size):
    batch = slice(start_idx, start_idx + batch_size)

    # The reference embeddings are shared by all three comparisons
    ref_embeddings = np.array(
        [get_sentence_embedding(text, model_vec) for text in reference_column[batch]]
    )

    for responses, similarities in response_columns:
        response_embeddings = np.array(
            [get_sentence_embedding(text, model_vec) for text in responses[batch]]
        )
        # Row i of the references pairs with row i of the responses, so only
        # the diagonal of the batch-by-batch similarity matrix is wanted.
        similarities.extend(
            cosine_similarity(ref_embeddings, response_embeddings).diagonal()
        )

# Calculate the average cosine similarities
average_finetuned_similarity_Word2Vec = np.mean(finetuned_similarities)
average_non_finetuned_zeroshot_similarity_Word2Vec = np.mean(non_finetuned_zeroshot_similarities)
average_non_finetuned_oneshot_similarity_Word2Vec = np.mean(non_finetuned_oneshot_similarities)

print(f"Average Cosine Similarity (Word2Vec) for finetuned responses: {average_finetuned_similarity_Word2Vec:.4f}")
print(f"Average Cosine Similarity (Word2Vec) for non-finetuned zero shot responses: {average_non_finetuned_zeroshot_similarity_Word2Vec:.4f}")
print(f"Average Cosine Similarity (Word2Vec) for non-finetuned one shot responses: {average_non_finetuned_oneshot_similarity_Word2Vec:.4f}")


## WMD

In [None]:
!pip install pyemd
!pip install POT

In [None]:
import gensim.downloader as api

# Load the pre-trained Word2Vec model. The cosine-similarity cell already
# loads the same vectors as `model_vec`; reuse that object when it exists
# rather than holding a second copy of the model in memory.
try:
    model = model_vec
except NameError:
    model = api.load('word2vec-google-news-300')

In [None]:
from datasets import load_dataset
from gensim import downloader as api
import nltk

# nltk.word_tokenize needs the Punkt tokenizer data. NLTK >= 3.9 looks up
# 'punkt_tab' instead of 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')


def _tokenize_all(texts):
    """Lower-case and word-tokenize every string in `texts`."""
    return [nltk.word_tokenize(text.lower()) for text in texts]


# Preprocess the texts. `test_dataset` (the train split) and the Word2Vec
# `model` are defined by earlier cells.
reference_texts = _tokenize_all(test_dataset['target'])
finetuned_texts = _tokenize_all(test_dataset['finetuned_response'])
non_finetuned_zeroshot_texts = _tokenize_all(test_dataset['zero_shot_response'])
non_finetuned_oneshot_texts = _tokenize_all(test_dataset['one_shot_response'])

In [None]:
import math


def _pairwise_wmd(references, candidates):
    """Return the Word Mover's Distance of each (reference, candidate) pair.

    The distance is computed one pair at a time by `model.wmdistance`, so
    splitting the inputs into batches would not change the work done.
    """
    return [model.wmdistance(ref, cand) for ref, cand in zip(references, candidates)]


def _finite_mean(distances, label):
    """Average the finite entries of `distances`.

    wmdistance reports an infinite distance when a document has no token in
    the model's vocabulary; a single such pair would turn a plain mean into
    `inf`. Those pairs are left out of the average and their count is
    printed under `label`. Returns NaN when no finite distance is left.
    """
    finite = [d for d in distances if math.isfinite(d)]
    skipped = len(distances) - len(finite)
    if skipped:
        print(f"Skipped {skipped} non-finite WMD value(s) for {label} responses")
    return sum(finite) / len(finite) if finite else float('nan')


# Calculate WMD for finetuned responses
finetuned_wmd_distances = _pairwise_wmd(reference_texts, finetuned_texts)

# Calculate WMD for non-finetuned zero shot responses
non_finetuned_zeroshot_wmd_distances = _pairwise_wmd(reference_texts, non_finetuned_zeroshot_texts)

# Calculate WMD for non-finetuned one shot responses
non_finetuned_oneshot_wmd_distances = _pairwise_wmd(reference_texts, non_finetuned_oneshot_texts)

# Calculate average WMD (the raw per-pair lists above are left untouched)
average_finetuned_wmd = _finite_mean(finetuned_wmd_distances, "finetuned")
average_non_finetuned_zeroshot_wmd = _finite_mean(non_finetuned_zeroshot_wmd_distances, "non-finetuned zeroshot")
average_non_finetuned_oneshot_wmd = _finite_mean(non_finetuned_oneshot_wmd_distances, "non-finetuned oneshot")

print(f"Average WMD for finetuned responses: {average_finetuned_wmd:.4f}")
print(f"Average WMD for non-finetuned zeroshot responses: {average_non_finetuned_zeroshot_wmd:.4f}")
print(f"Average WMD for non-finetuned oneshot responses: {average_non_finetuned_oneshot_wmd:.4f}")

In [None]:
# Re-display the three WMD averages, one per line, with a single print call.
print(
    f"Average WMD for finetuned responses: {average_finetuned_wmd:.4f}",
    f"Average WMD for non-finetuned zeroshot responses: {average_non_finetuned_zeroshot_wmd:.4f}",
    f"Average WMD for non-finetuned oneshot responses: {average_non_finetuned_oneshot_wmd:.4f}",
    sep="\n",
)

# Evaluation Results