## ICL - GPT2

In [1]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import json
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd

MAX_LENGTH = 1024


  from .autonotebook import tqdm as notebook_tqdm





  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd

MAX_LENGTH = 1024

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)

Device being used: cuda


In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def sent_similarity(sent1, sent2):
    sentences = [sent1, sent2]
    embeddings = sent_model.encode(sentences)
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix[0][1]


def format_examples(ds, ds_name='ni'):
    prompts = []
    if ds_name == 'ni':
        for example in ds:
            # prompt = f"### Question: {example['input']} \n ###Targets: {example['output']}"
            prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets: {example['targets']}"
            prompts.append(prompt)
    elif ds_name == 'medmcq':
         for example in ds:
            prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Targets: {example['output']}"
            prompts.append(prompt)
    elif ds_name == 'finance_sent':
        for example in ds:
            prompt = f"### Text: {example['text']}\n ### Targets: {example['label']}"
            prompts.append(prompt)

    return prompts

def select_characters_before_target(string):
    target_phrase = "### Targets:"
    target_index = string.find(target_phrase)
    if target_index != -1:  # If the phrase is found
        return string[:target_index] + target_phrase
    else:
        return string 

def group_examples_random(ds, n): #this is where we group examples into a larger prompt
    random.seed()
    samples = random.sample(ds, n)
    new_prompt = ""
    for i in range(n):
        new_prompt += samples[i]
        new_prompt += "\n"
    return new_prompt

def group_by_similarity(prompt, ds, n_egs, m_choices):
    choices = random.sample(ds, m_choices)
    cos_sim_dict = {}
    for c in choices:
        cos_sim_dict[c] = sent_similarity(prompt, select_characters_before_target(c))

    sorted_cos_sim = sorted(cos_sim_dict.items(), key=lambda x: x[1], reverse=True)
    top_egs = ""
    for item in sorted_cos_sim[:n_egs]:
        top_egs += item[0]
        top_egs += "\n"

    top_egs = "".join([item[0] for item in sorted_cos_sim[:n_egs]])
    return top_egs

def count_tokens(tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    return len(input_ids)


def evaluate_example(model, tokenizer, prompt, model_name, max_tokens):
    if model_name == 'gpt2':
        tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(device)
        if len(tokenized_prompt['input_ids'][0]) > MAX_LENGTH: #currently just checking if random prompt is too big or not
            return None 
        outputs =model.generate(**tokenized_prompt, pad_token_id=tokenizer.eos_token_id, max_length=1024)
        decoded_output = tokenizer.decode(outputs[0][len(tokenized_prompt['input_ids'][0]):], skip_special_tokens=True)
        return decoded_output
    elif model_name == 'mistral':
        num_tokens = count_tokens(tokenizer, prompt)
        if num_tokens > 3500:
            return None
        model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
        outputs =model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, do_sample=True, max_new_tokens=max_tokens)
        decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True) #only get output
        return decoded_output
  

def evaluate_icl(train_dataset, test_dataset, model, tokenizer, num_egs, model_name, ds_name='ni', method='similarity'):
    reals = []
    preds = []
    counter = 0
    for example in test_dataset:
        # print(example)
        # prompt = group_examples(train_dataset, num_egs) + f"### Question: {example['input']} \n ###Targets:"
        if ds_name == 'ni':
            curr_prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets:"
            real = f"{example['targets']}"
        elif ds_name == 'medqa':
            curr_prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Targets:"
            real = f"{example['output']}"
        elif ds_name == 'finance_sent':
            curr_prompt = f"### Text: {example['text']}\n ### Targets:"
            real = f"{example['label']}"

        tokens = tokenizer(real, return_tensors='pt').to(device)
        max_tokens = len(tokens['input_ids'][0])
    
        if method == 'similarity':
            icl_prompt = group_by_similarity(curr_prompt,train_dataset, num_egs, 100) + curr_prompt
        elif method == 'random':
            icl_prompt = group_examples_random(train_dataset, num_egs) + curr_prompt

        # print("MAX TOKENS:\n", max_tokens)

        pred = evaluate_example(model, tokenizer, icl_prompt, model_name, max_tokens)

        if counter % 50 == 0:
            print("PROMPT:\n", icl_prompt)
            print("REAL ANSWER:\n", real)
            print("PREDICTION:\n", pred)
        if pred:
            reals.append(real)
            preds.append(pred)
        counter+=1

    return reals, preds

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig




# model_plain =  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# tokenizer_plain = GPT2Tokenizer.from_pretrained("gpt2")
# print("models retrieved")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)



model_mist = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=bnb_config, device_map="auto")
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")


Loading checkpoint shards: 100%|██████████| 2/2 [00:38<00:00, 19.19s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_gpt2=  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

In [6]:
def evaluate_example2(model, tokenizer, prompt):
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # print(prompt)
    # if len(tokenized_prompt['input_ids'][0]) > MAX_LENGTH: #currently just checking if random prompt is too big or not
    #     return None 
    outputs =model.generate(**model_inputs, pad_token_id= tokenizer.eos_token_id, do_sample=False, max_length = 10)
    decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)
    # print("prediction: ",decoded_output)/
    return decoded_output


prompt = """ "featuring an oscar-worthy performance => positive\n"
    "completely messed up => negative\n"
    "masterpiece => positive\n"
    "the action is stilted => negative\n"
    "by far the worst movie of the year =>" """
pred = evaluate_example2(model_mist, tokenizer_mist, prompt) 
print(pred)



"


### Test on Natural Instructions Data

In [None]:
data = load_from_disk('data/1000_per_task')

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  5   #natural instructions are just too big

bleu = evaluate.load('bleu')

# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

train_dataset = data['train']
test_dataset = data['test'].select(range(5))

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset)
print("Length of train set", len(train_list))

icl_method = 'random'
model_name = 'mistral'
ds_name = 'ni'

bert_scores = []
results_data = []
for i in range(max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print(order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order) #set order to mean of real values
    results_data.append({'num_samples' : len(preds), 'num_demonstrations':i, 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu']})

# results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score') 
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_ni.png')



In [73]:
# print(int(sum(len(s) for s in refs)/len(refs)))
# test = bleu.compute(predictions=['No', 'Yes', 'Yes', 'yes', 'Yes', 'yes', 'yes', 'No', 'yes', 'No'],
#                     references=['No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.'], max_order=int(sum(len(s) for s in refs)/len(refs)))
# test

1


{'bleu': 0.07357588823428847,
 'precisions': [0.2],
 'brevity_penalty': 0.36787944117144233,
 'length_ratio': 0.5,
 'translation_length': 10,
 'reference_length': 20}

### Test on Medical MCQ Dataset

In [None]:
bleu = evaluate.load('bleu')

max_num_egs = 5

data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medqa')
print("Length of train set", len(train_list))

icl_method = 'random'
model_name = 'mistral'
ds_name = 'medmcq'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy})

# results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score')
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_medqa.png')


In [43]:
results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)
results_df.head()

Unnamed: 0,num_samples,num_demonstrations,bert_score,bleu_score,accuracy
0,98,0,0.814683,0.1104,0.020408
1,100,1,0.911485,0.407569,0.24
2,100,2,0.917994,0.4329,0.3
3,100,3,0.923827,0.432153,0.28
4,100,4,0.911291,0.406805,0.19


### Test on Transcript Sentiment Analysis

In [None]:

max_num_egs = 5

data = load_dataset('jlh-ibm/earnings_call', 'transcript-sentiment')['train']
bleu = evaluate.load('bleu')

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='finance_sent')
print("Length of train set", len(train_list))

icl_method = 'random'
model_name = 'mistral'
ds_name = 'finance_sent'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

In [7]:
results_df.head()

Unnamed: 0,num_samples,num_demonstrations,bert_score,bleu_score,accuracy
0,99,0,0.727094,0.0,0.0
1,100,1,0.953518,0.396396,0.45
2,100,2,0.964684,0.471698,0.52
3,98,3,0.963729,0.49505,0.530612
4,97,4,0.970722,0.587629,0.597938
