This notebook was run in Google Colab. To run it, connect to a GPU runtime (such as a V100) and upload the necessary datasets. Access to the Llama-2-7b model is controlled by Meta. To request access, follow this link: https://huggingface.co/meta-llama/Llama-2-7b. Once permission is granted, you will need to log in to Hugging Face with a valid token. The evaluation code was partially taken from other papers and merged together. Boilerplate code was typically generated using Claude 3 Opus and GPT-4 and then adjusted for our specific use case. Sources of pre-built functions are referenced within the cells that use them.



In [None]:
#%%capture
!pip install accelerate peft bitsandbytes transformers trl sacrebleu rouge


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Tuple

from huggingface_hub import login
login()

In [None]:
# Check if a GPU is available
# Stop immediately when no CUDA device is visible instead of failing later during model loading.
if not torch.cuda.is_available():
    raise EnvironmentError("This script requires a GPU to run.")

# Constants
# NOTE(review): neither constant is referenced in the cells visible here — confirm they are still needed.
MAX_INPUT_TOKEN_LENGTH = 4096
DEFAULT_MAX_NEW_TOKENS = 50

# Load the model and tokenizer
# The checkpoint is loaded with float16 weights; device_map="auto" leaves device placement to the library.
model_id = "benschlagman/llama-2-7b-chat-esconv"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Disable the tokenizer's default system prompt
# (presumably so no implicit system message is injected into prompts — TODO confirm).
tokenizer.use_default_system_prompt = False


In [None]:
# IMPORTING LLAMA OUTPUTS FILE

import csv

# Parallel lists: entry i of both lists comes from the same CSV row
# (column 0 -> model response, column 1 -> golden response).
model_response_flattened = []
conversation_golden_responses_flattened = []

filename = "raw_finetuned_output_1e.csv"

# newline='' is the mode the csv module expects, so quoted fields that contain
# line breaks are parsed as a single field.
with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(csvreader, None)

    for row in csvreader:
        # csv.reader yields [] for blank lines (e.g. trailing newlines at the end of the file).
        if not row:
            continue
        # Read both columns before appending so a malformed (single-column) row
        # cannot leave the two lists with different lengths.
        generated, golden = row[0], row[1]
        model_response_flattened.append(generated)
        conversation_golden_responses_flattened.append(golden)

print("CSV file data imported successfully.")

In [None]:
# Sanity check: print the full list of golden responses collected from the CSV.
print(conversation_golden_responses_flattened)


In [None]:
# PERPLEXITY: https://huggingface.co/docs/transformers/perplexity

def calculate_perplexity(response: str, model, tokenizer, max_length, stride=512):
    """Compute the perplexity of ``response`` under ``model`` using strided windows.

    The token sequence is cut into windows of at most ``max_length`` tokens that
    start every ``stride`` tokens. Within a window, only the tokens that no
    earlier window has already scored are used as targets; the tokens before
    them act purely as context. The result is ``exp(total NLL / scored tokens)``,
    so every scored token carries the same weight regardless of its window.

    Args:
        response: Text to score.
        model: Causal language model invoked as ``model(input_ids, labels=...)``.
            It must expose ``.device`` and return an object whose ``.loss`` is
            assumed to be the mean negative log-likelihood over the shifted
            labels that are not ``-100`` (the Hugging Face convention).
        tokenizer: Callable invoked as ``tokenizer(response, return_tensors="pt")``;
            the result must expose a 2-D ``input_ids`` tensor (batch, tokens).
        max_length: Maximum number of tokens fed to the model per window.
        stride: Distance between consecutive window starts. It should not exceed
            ``max_length``; otherwise tokens between windows are never seen.

    Returns:
        The perplexity as a Python float, or ``nan`` when the text contains no
        token that can be scored (fewer than two tokens).
    """
    encodings = tokenizer(response, return_tensors="pt")
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0      # summed negative log-likelihood over all scored tokens
    n_tokens = 0       # number of tokens that contributed to nll_sum
    prev_end_loc = 0   # end of the previous window; tokens before it are already scored

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Only the tokens past the previous window's end are new targets.
        # This differs from `stride` on the first window and usually on the last.
        trg_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # context-only tokens are ignored by the loss

        # The model shifts labels left by one, so position 0 is never a target.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())

        # A window without any target would produce a NaN mean loss; skip it.
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is a per-token mean; convert it back to a sum so windows
            # of different sizes are weighted by their token count.
            nll_sum += outputs.loss.float().item() * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            # The whole sequence is covered; further windows would re-score the tail.
            break

    if n_tokens == 0:
        return float("nan")

    return torch.exp(torch.tensor(nll_sum / n_tokens)).item()

In [None]:
# One perplexity score per generated response, in the same order as the input list.
perplexities = [
    calculate_perplexity(generated, model, tokenizer, max_length=512)
    for generated in model_response_flattened
]

print(perplexities)

In [None]:
# Display the per-response perplexity list as this cell's output.
perplexities

In [None]:
import math

# Drop NaN scores so they cannot poison the mean.
filtered_perplexities = list(filter(lambda score: not math.isnan(score), perplexities))

# Mean of the remaining scores; NaN when every score was filtered out.
if filtered_perplexities:
    average = sum(filtered_perplexities) / len(filtered_perplexities)
else:
    average = float('nan')
print(average)

In [None]:
# Unfiltered mean over every response: a single NaN score makes the result NaN.
# An empty list yields NaN instead of raising ZeroDivisionError.
avg_perplexity = sum(perplexities) / len(perplexities) if perplexities else float('nan')
print(avg_perplexity)