This notebook was run in Google Colab. To run it, connect to a GPU runtime (such as a V100) and upload the necessary datasets. Access to the Llama-2-7b model is controlled by Meta. To request access, follow this link: https://huggingface.co/meta-llama/Llama-2-7b. Once permission is granted, you will need to log in to Hugging Face with a valid token. The evaluation code was partially taken from other papers and merged together. Boilerplate code was typically generated using Claude 3 Opus and GPT-4 and then adjusted for our specific use case. Sources of pre-built functions are referenced within the relevant cells.


In [None]:
#%%capture
!pip install accelerate peft bitsandbytes transformers trl sacrebleu rouge


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Tuple

from huggingface_hub import login
login()

In [None]:
# Check if a GPU is available
# Fail fast: the model below is loaded in float16 with automatic device placement.
if not torch.cuda.is_available():
    raise EnvironmentError("This script requires a GPU to run.")

# Constants
# NOTE(review): neither constant is referenced in the cells visible here —
# presumably used by generation code elsewhere; confirm before removing.
MAX_INPUT_TOKEN_LENGTH = 4096
DEFAULT_MAX_NEW_TOKENS = 50

# Load the model and tokenizer
# `model` and `tokenizer` are module-level; the perplexity cells further down reuse them.
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): presumably stops the Llama-2 chat template from injecting its
# default system prompt — confirm against the installed transformers version.
tokenizer.use_default_system_prompt = False


In [None]:
# IMPORTING LLAMA OUTPUTS FILE
import csv

model_response_flattened = []
conversation_golden_responses_flattened = []

filename = "llama_outputs.csv"

# Layout read below: one header row, then one row per example with the model
# response in column 0 and the golden (reference) response in column 1.
with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    # Skip the header; the None default keeps an empty file from raising StopIteration.
    next(csvreader, None)

    for row in csvreader:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip them instead of failing with IndexError on row[0].
        if not row:
            continue
        model_response_flattened.append(row[0])
        conversation_golden_responses_flattened.append(row[1])

print("CSV file data imported successfully.")

In [None]:
# Sanity-check the import: show how many golden responses were loaded and a
# short preview, rather than dumping the entire list into the cell output.
print(len(conversation_golden_responses_flattened))
print(conversation_golden_responses_flattened[:5])

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu_scores(model_responses, golden_responses, is_corpus=False):
    """
    Compute smoothed BLEU-1 to BLEU-4 for paired model/golden responses.

    Both inputs are whitespace-tokenized; each model response is scored against
    its single golden response with NLTK's `SmoothingFunction().method1`.

    :param model_responses: list of model output strings (the hypotheses).
    :param golden_responses: list of reference strings, aligned by index.
    :param is_corpus: if False (default), return the mean of the per-sentence
        BLEU scores; if True, return corpus-level BLEU (`corpus_bleu`), which
        pools n-gram counts over all pairs instead of averaging sentence scores.
    :return: tuple `(bleu_1, bleu_2, bleu_3, bleu_4)` of floats in [0, 1];
        all zeros when there are no pairs.
    :raises AssertionError: if the two lists differ in length.
    """
    assert len(model_responses) == len(golden_responses), "The lengths of model responses and golden responses should match."

    # Nothing to score: avoid dividing by zero when averaging.
    if len(model_responses) == 0:
        return 0.0, 0.0, 0.0, 0.0

    # Uniform weights over the first k n-gram orders. BLEU-3 uses exact thirds
    # so the weights sum to 1 (0.33 * 3 = 0.99 slightly inflates the score).
    ngram_weights = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    smoothing = SmoothingFunction().method1

    # Tokenize once; every BLEU order reuses the same token lists.
    references = [[golden_response.split()] for golden_response in golden_responses]
    candidates = [model_response.split() for model_response in model_responses]

    if is_corpus:
        return tuple(
            corpus_bleu(references, candidates, weights=weights, smoothing_function=smoothing)
            for weights in ngram_weights
        )

    averages = []
    for weights in ngram_weights:
        sentence_scores = [
            sentence_bleu(reference, candidate, weights=weights, smoothing_function=smoothing)
            for reference, candidate in zip(references, candidates)
        ]
        averages.append(sum(sentence_scores) / len(sentence_scores))

    return tuple(averages)

In [None]:
# Score the whole evaluation set once, then report each BLEU order as a percentage.
bleu_averages = calculate_bleu_scores(model_response_flattened, conversation_golden_responses_flattened)
avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4 = bleu_averages

bleu_labels = (
    "Average BLEU-1 score:",
    "Average BLEU-2 score:",
    "Average BLEU-3 score:",
    "Average BLEU-4 score:",
)
for bleu_label, bleu_average in zip(bleu_labels, bleu_averages):
    print(bleu_label, bleu_average * 100)

In [None]:
#%%capture
!pip install accelerate rouge_score
from rouge_score import rouge_scorer

# Initialize the scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# zip() silently truncates to the shorter list while the averages below divide
# by a fixed count, so verify the inputs before scoring.
num_pairs = len(model_response_flattened)
assert num_pairs == len(conversation_golden_responses_flattened), "The lengths of model responses and golden responses should match."
assert num_pairs > 0, "No responses to score."

# Initialize sums for each ROUGE score
sum_rougeL_precision, sum_rougeL_recall, sum_rougeL_fmeasure = 0, 0, 0

# Calculate scores for each sentence pair
for m, c in zip(model_response_flattened, conversation_golden_responses_flattened):
    # RougeScorer.score takes (target, prediction): the golden response is the
    # target and the model output is the prediction. Passing them the other way
    # round swaps precision and recall (the F-measure is unaffected).
    score = scorer.score(c, m)

    # Accumulate the scores
    sum_rougeL_precision += score["rougeL"].precision
    sum_rougeL_recall += score["rougeL"].recall
    sum_rougeL_fmeasure += score["rougeL"].fmeasure

# Calculate the averages
avg_rougeL_precision = sum_rougeL_precision / num_pairs
avg_rougeL_recall = sum_rougeL_recall / num_pairs
avg_rougeL_fmeasure = sum_rougeL_fmeasure / num_pairs

# Print the average scores
print(f'Average ROUGE-L Precision: {avg_rougeL_precision}')
print(f'Average ROUGE-L Recall: {avg_rougeL_recall}')
print(f'Average ROUGE-L F-measure: {avg_rougeL_fmeasure}')


In [None]:
# METEOR:
#%%capture
!pip install nltk

import nltk
nltk.download("wordnet")
nltk.download("punkt")

from nltk.translate import meteor
from nltk.tokenize import word_tokenize


def calculate_meteor(candidate, reference):
    """
    Compute the average sentence-level METEOR score over paired responses.

    candidate: list of model output strings (untokenized).
    reference: list of gold reference strings, aligned index-by-index with
        `candidate`.

    Each string is tokenized with `word_tokenize` before scoring. The average
    is printed and returned; it is 0.0 when there are no pairs.
    """
    mt_list = []
    for c, r in zip(candidate, reference):
        r_tokenized = word_tokenize(r)
        c_tokenized = word_tokenize(c)
        # nltk's meteor() takes (references, hypothesis): a list of tokenized
        # references first, then the tokenized hypothesis. METEOR weights
        # recall over precision, so swapping the two changes the score.
        # Scores are kept unrounded so the average is not distorted.
        mt_list.append(meteor([r_tokenized], c_tokenized))

    # Guard the division so an empty input reports 0.0 instead of crashing.
    avg_mt = sum(mt_list) / len(mt_list) if mt_list else 0.0
    print(f"Total average meteor score: {str(avg_mt)}")

    # Return the average that was just reported, not the last pair's score.
    return avg_mt

# Score every model response against its golden reference. The function prints
# the average itself; the value it returns is shown as the cell output.
calculate_meteor(model_response_flattened, conversation_golden_responses_flattened)

In [None]:
# PERPLEXITY: https://huggingface.co/docs/transformers/perplexity

def calculate_perplexity(response: str, model, tokenizer, max_length, stride=512):
    """
    Compute the perplexity of `response` under `model` with a sliding window.

    The text is tokenized once and scored in windows of at most `max_length`
    tokens that advance by `stride` tokens. In each window only the tokens not
    already scored by an earlier window contribute to the loss; the rest serve
    as context. The result is the exponential of the token-weighted mean
    negative log-likelihood.

    :param response: text to score.
    :param model: causal LM that accepts `model(input_ids, labels=...)` and
        returns an object with a mean `.loss`; must expose `.device`.
    :param tokenizer: callable returning an object with `.input_ids` of shape
        (1, seq_len) when called with `return_tensors="pt"`.
    :param max_length: maximum number of tokens per window.
    :param stride: number of tokens the window advances each step (clamped to
        `max_length` so no token is skipped between windows).
    :return: perplexity as a float; NaN when there is no token to predict
        (fewer than two tokens).
    :raises ValueError: if `max_length` or `stride` is not positive.
    """
    if max_length < 1 or stride < 1:
        raise ValueError("max_length and stride must be positive integers.")
    # A stride larger than the window would leave tokens between windows unscored.
    stride = min(stride, max_length)

    encodings = tokenizer(response, return_tensors="pt")
    seq_len = encodings.input_ids.size(1)

    nll_total = 0.0    # sum of per-token negative log-likelihoods
    scored_total = 0   # number of tokens that contributed to nll_total
    prev_end_loc = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Tokens in this window that no earlier window has scored yet.
        trg_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        # Mask the overlap with the previous window: context only, no loss.
        target_ids[:, :-trg_len] = -100

        # The model shifts labels by one internally, so position 0 is never a
        # prediction target; count only labels that actually enter the loss.
        num_scored = int((target_ids[:, 1:] != -100).sum())
        if num_scored > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is a mean over the scored tokens; re-weight it so
            # windows of different sizes contribute proportionally.
            nll_total += outputs.loss.item() * num_scored
            scored_total += num_scored

        prev_end_loc = end_loc
        if end_loc == seq_len:
            # The sequence is fully covered; later windows would only rescore.
            break

    if scored_total == 0:
        return float("nan")

    return torch.exp(torch.tensor(nll_total / scored_total, dtype=torch.float64)).item()

In [None]:
# One perplexity value per model response, each scored with a 512-token window.
perplexities = [
    calculate_perplexity(text, model, tokenizer, max_length=512)
    for text in model_response_flattened
]

print(perplexities)

In [None]:
import math

# A response too short to score can yield NaN, and a single NaN would turn the
# whole average into NaN. Average only the defined values and report how many
# were left out so the omission is visible.
valid_perplexities = [p for p in perplexities if not math.isnan(p)]
num_skipped = len(perplexities) - len(valid_perplexities)
if num_skipped:
    print(f"Skipped {num_skipped} NaN perplexity value(s) when averaging.")

# NaN (rather than ZeroDivisionError) when there is nothing to average.
avg_perplexity = sum(valid_perplexities) / len(valid_perplexities) if valid_perplexities else float("nan")
print(avg_perplexity)

In [None]:
# Clone the Distinct-N repository
# NOTE(review): this cell is not idempotent — re-running it attempts a second
# clone and another `cd` relative to the already-changed directory.
!git clone https://github.com/neural-dialogue-metrics/Distinct-N.git
# Presumably what makes `distinct_n` importable below (the checkout becomes the
# working directory). It also changes the working directory for every later
# cell, so relative paths such as the CSV filename resolve differently afterwards.
%cd Distinct-N

from distinct_n.utils import ngrams

def distinct_n_sentence_level(sentence, n):
    """
    Distinct-N of one sentence: unique n-grams divided by sentence length.
    :param sentence: a list of words.
    :param n: int, the n-gram order.
    :return: float, the metric value (0.0 for an empty sentence).
    """
    token_count = len(sentence)
    if not token_count:
        # An empty sentence has no n-grams; short-circuit to avoid dividing by zero.
        return 0.0
    unique_ngrams = {gram for gram in ngrams(sentence, n)}
    return len(unique_ngrams) / token_count

def distinct_n_corpus_level(sentences, n):
    """
    Compute average distinct-N of a list of sentences (the corpus).
    :param sentences: a list of sentences, each a list of words.
    :param n: int, ngram.
    :return: float, the average of the per-sentence values; 0.0 for an empty corpus.
    """
    if len(sentences) == 0:
        return 0.0  # Prevent a zero division, as the sentence-level metric does
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

In [None]:
distinct_1_list = []

for response in model_response_flattened:
    # distinct_n_sentence_level expects a list of words. Passing the raw string
    # would count character n-grams instead, so tokenize on whitespace first
    # (the same tokenization the BLEU cell uses).
    d_1 = distinct_n_sentence_level(response.split(), 1)
    distinct_1_list.append(d_1)

# 'distinct_1_list' now holds the distinct-1 score of each response.
print(distinct_1_list)

In [None]:
distinct_2_list = []

for response in model_response_flattened:
    # distinct_n_sentence_level expects a list of words. Passing the raw string
    # would count character bigrams instead, so tokenize on whitespace first
    # (the same tokenization the BLEU cell uses).
    d_2 = distinct_n_sentence_level(response.split(), 2)
    distinct_2_list.append(d_2)

# 'distinct_2_list' now holds the distinct-2 score of each response.
print(distinct_2_list)

In [None]:
# Mean distinct-1 over all responses.
# NOTE: `d_1` was the per-response loop variable in the distinct-1 cell and is
# rebound here to the overall average.
d_1 = sum(distinct_1_list) / len(distinct_1_list)
print(d_1)

In [None]:
# Mean distinct-2 over all responses.
# NOTE: `d_2` was the per-response loop variable in the distinct-2 cell and is
# rebound here to the overall average.
d_2 = sum(distinct_2_list) / len(distinct_2_list)
print(d_2)