In [None]:
%pip install -q -U torch==2.0.1 bitsandbytes==0.40.2
%pip install -q -U transformers==4.35.2 peft==0.4.0 accelerate==0.21.0
%pip install -q -U datasets py7zr einops tensorboardX
!pip install evaluate
# Add installed cuda runtime to path for bitsandbytes
import os
import nvidia

cuda_install_dir = '/'.join(nvidia.__file__.split('/')[:-1]) + '/cuda_runtime/lib/'
os.environ['LD_LIBRARY_PATH'] =  cuda_install_dir

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import pandas as pd
import os
import torch
import numpy as np
from scipy import stats
# Path to your env.txt file
env_file_path = 'data/env.txt'

# Read KEY=VALUE pairs from the file and export them as environment variables.
with open(env_file_path, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        entry = line.strip()
        # Blank lines and '#' comment lines carry no assignment.
        if not entry or entry.startswith('#'):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, separator, value = entry.partition('=')
        if not separator:
            # Report the position only; the line content may be a secret.
            raise ValueError(
                f"{env_file_path}:{line_number}: expected KEY=VALUE"
            )
        os.environ[key.strip()] = value.strip()
# Hugging Face access token; raises KeyError if the file did not define it.
token = os.environ['huggingface_token']

In [None]:
model_path = "meta-llama/Llama-2-70b-hf"   # Specify the path to the model
tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)
# Load the scoring model in 4-bit and let accelerate place it on the available
# devices.
#  * `max_memory` used to be passed here but is never defined in this notebook,
#    which raised NameError on a fresh kernel; omitting it leaves the per-device
#    budget to the library default.
#  * The same access token is used as for the tokenizer, since both downloads
#    come from the same repository.
model = AutoModelForCausalLM.from_pretrained(
  model_path,
  token=token,
  device_map='auto',
  load_in_4bit=True,
  do_sample=True,  # NOTE(review): generation setting; presumably irrelevant to the loss-only use below — confirm
  torch_dtype="auto"
)

In [17]:
# --- Raw tables -------------------------------------------------------------
df = pd.read_csv('data/all_models.csv')
gsm8k_all = pd.read_csv('data/gsm8k_all.csv')
gsm8k_questions = pd.read_csv('data/gsm8k_questions.csv')
mathwell_all = pd.read_csv('data/mathwell_annotations.csv')

# --- Annotated MATHWELL rows flagged good == 1 --------------------------------
mathwell_all_good = mathwell_all.loc[mathwell_all['good'].eq(1)]

# --- Per-model subsets of the combined table, each with its good == 1 slice ---
llama = df.loc[df['model'].eq('llama')]
llama_good = llama.loc[llama['good'].eq(1)]

llema = df.loc[df['model'].eq('llema')]
llema_good = llema.loc[llema['good'].eq(1)]

mathwell = df.loc[df['model'].eq('mathwell')]
mathwell_good = mathwell.loc[mathwell['good'].eq(1)]

mammoth = df.loc[df['model'].eq('mammoth')]
mammoth_good = mammoth.loc[mammoth['good'].eq(1)]

In [None]:
def _score_texts(texts):
    """Return one perplexity (a Python float) per text in ``texts``.

    Each text is tokenized on its own with the notebook-level ``tokenizer`` and
    passed through the notebook-level ``model`` with ``labels`` equal to the
    input ids; the perplexity is ``exp`` of the loss the model reports.

    Args:
        texts: iterable of strings to score.

    Returns:
        list[float]: perplexities in input order (empty list for no texts).
    """
    scores = []
    # Scoring is inference only. Without no_grad every stored result would keep
    # the autograd graph of its forward pass alive.
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors = "pt")
            loss = model(input_ids = inputs["input_ids"], labels = inputs["input_ids"]).loss
            # Exponentiate in float32 (a half-precision exp overflows early) and
            # store a plain float so the list can go straight into NumPy.
            scores.append(torch.exp(loss.float()).item())
    return scores


def perplexity(df):
    """Perplexity of each row's question and solution, scored as one text.

    Args:
        df: DataFrame with string columns ``question`` and ``solution``.

    Returns:
        list[float]: one perplexity per row, in row order.
    """
    texts = [
        "Question: " + question + "\n" + "Solution:\n" + solution
        for question, solution in zip(df['question'], df['solution'])
    ]
    return _score_texts(texts)


def perplexity_question(df):
    """Perplexity of each row's ``question`` column on its own.

    Returns:
        list[float]: one perplexity per row, in row order.
    """
    return _score_texts(df['question'])


def perplexity_gsm(df):
    """Perplexity of each row's ``output`` column.

    Returns:
        list[float]: one perplexity per row, in row order.
    """
    return _score_texts(df['output'])


def perplexity_gsm_question(df):
    """Perplexity of each row's ``instruction`` column.

    Returns:
        list[float]: one perplexity per row, in row order.
    """
    return _score_texts(df['instruction'])

## GSM8K Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
gsm8k_ppl = [float(score) for score in perplexity_gsm(gsm8k_all)]
gsm8k_question_ppl = [float(score) for score in perplexity_gsm_question(gsm8k_questions)]
print(f'Average overall perplexity: {np.mean(gsm8k_ppl)} Standard Deviation: {np.std(gsm8k_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(gsm8k_question_ppl)} Standard Deviation: {np.std(gsm8k_question_ppl)}')

## MATHWELL Annotated Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mathwell_all_ppl = [float(score) for score in perplexity(mathwell_all)]
mathwell_all_question_ppl = [float(score) for score in perplexity_question(mathwell_all)]
print(f'Average overall perplexity: {np.mean(mathwell_all_ppl)} Standard Deviation: {np.std(mathwell_all_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mathwell_all_question_ppl)} Standard Deviation: {np.std(mathwell_all_question_ppl)}')

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mathwell_all_good_ppl = [float(score) for score in perplexity(mathwell_all_good)]
mathwell_all_good_question_ppl = [float(score) for score in perplexity_question(mathwell_all_good)]
print(f'Average overall perplexity: {np.mean(mathwell_all_good_ppl)} Standard Deviation: {np.std(mathwell_all_good_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mathwell_all_good_question_ppl)} Standard Deviation: {np.std(mathwell_all_good_question_ppl)}')

## MATHWELL Final Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mathwell_ppl = [float(score) for score in perplexity(mathwell)]
mathwell_question_ppl = [float(score) for score in perplexity_question(mathwell)]
print(f'Average overall perplexity: {np.mean(mathwell_ppl)} Standard Deviation: {np.std(mathwell_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mathwell_question_ppl)} Standard Deviation: {np.std(mathwell_question_ppl)}')

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mathwell_good_ppl = [float(score) for score in perplexity(mathwell_good)]
mathwell_good_question_ppl = [float(score) for score in perplexity_question(mathwell_good)]
print(f'Average overall perplexity: {np.mean(mathwell_good_ppl)} Standard Deviation: {np.std(mathwell_good_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mathwell_good_question_ppl)} Standard Deviation: {np.std(mathwell_good_question_ppl)}')

## Llama Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
llama_ppl = [float(score) for score in perplexity(llama)]
llama_question_ppl = [float(score) for score in perplexity_question(llama)]
print(f'Average overall perplexity: {np.mean(llama_ppl)} Standard Deviation: {np.std(llama_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(llama_question_ppl)} Standard Deviation: {np.std(llama_question_ppl)}')

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
llama_good_ppl = [float(score) for score in perplexity(llama_good)]
llama_good_question_ppl = [float(score) for score in perplexity_question(llama_good)]
print(f'Average overall perplexity: {np.mean(llama_good_ppl)} Standard Deviation: {np.std(llama_good_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(llama_good_question_ppl)} Standard Deviation: {np.std(llama_good_question_ppl)}')

## Llemma Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
llema_ppl = [float(score) for score in perplexity(llema)]
llema_question_ppl = [float(score) for score in perplexity_question(llema)]
print(f'Average overall perplexity: {np.mean(llema_ppl)} Standard Deviation: {np.std(llema_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(llema_question_ppl)} Standard Deviation: {np.std(llema_question_ppl)}')

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
llema_good_ppl = [float(score) for score in perplexity(llema_good)]
llema_good_question_ppl = [float(score) for score in perplexity_question(llema_good)]
print(f'Average overall perplexity: {np.mean(llema_good_ppl)} Standard Deviation: {np.std(llema_good_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(llema_good_question_ppl)} Standard Deviation: {np.std(llema_good_question_ppl)}')

## Mammoth Perplexity

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mammoth_ppl = [float(score) for score in perplexity(mammoth)]
mammoth_question_ppl = [float(score) for score in perplexity_question(mammoth)]
print(f'Average overall perplexity: {np.mean(mammoth_ppl)} Standard Deviation: {np.std(mammoth_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mammoth_question_ppl)} Standard Deviation: {np.std(mammoth_question_ppl)}')

In [None]:
# Coerce every score to a plain Python float: NumPy cannot reduce a list of
# torch tensors that live on a GPU or track gradients.
mammoth_good_ppl = [float(score) for score in perplexity(mammoth_good)]
mammoth_good_question_ppl = [float(score) for score in perplexity_question(mammoth_good)]
print(f'Average overall perplexity: {np.mean(mammoth_good_ppl)} Standard Deviation: {np.std(mammoth_good_ppl)}')
print(f'Average overall perplexity for questions only: {np.mean(mammoth_good_question_ppl)} Standard Deviation: {np.std(mammoth_good_question_ppl)}')