## ICL - GPT2

In [1]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from encodeinstruction import encodeinstruction
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd

# Token budget shared by the prompt-length check and `generate(max_length=...)`
# in evaluate_example.
MAX_LENGTH = 1024

# Single seed used for every RNG so a fresh "Restart & Run All" is repeatable.
SEED = 42

random.seed(SEED)        # drives random.sample when picking in-context examples
torch.manual_seed(SEED)  # seeds torch's RNG as well

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)

  from .autonotebook import tqdm as notebook_tqdm



Device being used: cuda


In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Sentence-embedding model shared by sent_similarity(); loaded once at module
# level so it is not re-instantiated on every similarity call.
sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def sent_similarity(sent1, sent2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both sentences are encoded in one call with the module-level
    ``sent_model``; the score is read from the pairwise cosine-similarity
    matrix of the two resulting embeddings.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        The off-diagonal entry ``[0][1]`` of the 2x2 similarity matrix.
    """
    pair_embeddings = sent_model.encode([sent1, sent2])
    # Row 0 / column 1 of the pairwise matrix is the sent1-vs-sent2 score.
    return cosine_similarity(pair_embeddings)[0][1]


def format_examples(ds, ds_name='ni'):
    """Render every record of a dataset as a solved prompt string.

    Args:
        ds: Iterable of mapping-like records. For ``'ni'`` each record is read
            through the keys ``definition``, ``inputs`` and ``targets``; for
            ``'medqa'`` through ``input`` and ``output``.
        ds_name: Which prompt template to use, ``'ni'`` (default) or ``'medqa'``.

    Returns:
        A list with one formatted prompt per record, in iteration order. Any
        other ``ds_name`` yields an empty list.
    """
    if ds_name == 'ni':
        return [
            f"### Task: {row['definition']}\n ### Inputs: {row['inputs']} \n ### Targets: {row['targets']}"
            for row in ds
        ]

    if ds_name == 'medqa':
        return [
            f"### Question: {row['input']} \n ###Targets: {row['output']}"
            for row in ds
        ]

    # Unrecognised dataset names produce no prompts.
    return []

def select_characters_before_target(string):
    """Cut a formatted example right after its targets marker, dropping the answer.

    The Natural Instructions template writes the marker as ``"### Targets:"``
    while the MedQA template writes ``"###Targets:"`` (no space). Both
    spellings are recognised so the answer is stripped for either dataset.
    The spaced form is tried first, which keeps the result unchanged for any
    string that contains it.

    Args:
        string: A formatted example prompt.

    Returns:
        Everything up to and including the first occurrence of the matched
        marker, or ``string`` unchanged when neither marker is present.
    """
    for target_phrase in ("### Targets:", "###Targets:"):
        target_index = string.find(target_phrase)
        if target_index != -1:  # marker found: keep the text up to and including it
            return string[:target_index] + target_phrase
    return string

def group_examples_random(ds, n):  # builds the in-context block from randomly drawn examples
    """Concatenate ``n`` examples drawn at random from ``ds`` into one string.

    Args:
        ds: Sequence of formatted example strings.
        n: How many examples to draw (without replacement, via ``random.sample``).

    Returns:
        The drawn examples joined back to back with no separator, in the
        order ``random.sample`` returned them.
    """
    return "".join(random.sample(ds, n))

def group_by_similarity(prompt, ds, n_egs, m_choices):
    """Build an in-context block from the examples most similar to ``prompt``.

    A random pool of candidates is drawn from ``ds``; each candidate is scored
    against ``prompt`` with ``sent_similarity`` after its answer has been cut
    off by ``select_characters_before_target``. The best-scoring candidates
    are concatenated, answers included.

    Args:
        prompt: The unanswered query prompt to match against.
        ds: Sequence of formatted example strings.
        n_egs: Number of top-scoring examples to keep.
        m_choices: Size of the random candidate pool. Clamped to ``len(ds)``
            so a small dataset no longer makes ``random.sample`` raise.

    Returns:
        The ``n_egs`` highest-scoring examples joined back to back with no
        separator, most similar first. Empty string when ``ds`` is empty.
    """
    # random.sample raises ValueError when asked for more items than ds holds.
    pool_size = min(m_choices, len(ds))
    choices = random.sample(ds, pool_size)

    # Keyed by example text, so a text that occurs twice in the pool is scored
    # and kept once.
    cos_sim_dict = {}
    for c in choices:
        cos_sim_dict[c] = sent_similarity(prompt, select_characters_before_target(c))

    sorted_cos_sim = sorted(cos_sim_dict.items(), key=lambda x: x[1], reverse=True)
    return "".join(example for example, _score in sorted_cos_sim[:n_egs])


def evaluate_example(model, tokenizer, prompt):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        model: Model exposing ``generate`` (called with the tokenized prompt,
            ``pad_token_id`` and ``max_length``).
        tokenizer: Tokenizer used both to encode the prompt and to decode the
            generated ids; its ``eos_token_id`` is used as the pad token.
        prompt: Full prompt text (in-context examples plus the query).

    Returns:
        The decoded continuation with special tokens removed, or ``None`` when
        the prompt leaves no room for generation within ``MAX_LENGTH`` tokens.
    """
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')
    prompt_len = len(tokenized_prompt['input_ids'][0])

    # `max_length` caps prompt + generated tokens, so a prompt that already
    # fills the budget (not only one that exceeds it) cannot produce output.
    if prompt_len >= MAX_LENGTH:
        return None

    # Only move the prompt to the device once we know it will be used.
    tokenized_prompt = tokenized_prompt.to(device)
    outputs = model.generate(**tokenized_prompt, pad_token_id=tokenizer.eos_token_id, max_length=MAX_LENGTH)

    # The returned sequence starts with the prompt ids; slice them off.
    decoded_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return decoded_output
  

def evaluate_icl(train_dataset, test_dataset, model, tokenizer, num_egs, ds_name='ni', method='similarity', m_choices=100):
    """Run in-context-learning inference over a test set.

    For every test record an unanswered query prompt is built, prefixed with
    ``num_egs`` solved examples taken from ``train_dataset``, and passed to
    ``evaluate_example``.

    Args:
        train_dataset: Sequence of already formatted example strings.
        test_dataset: Iterable of records. ``'ni'`` reads the keys
            ``definition``, ``inputs`` and ``targets``; ``'medqa'`` reads
            ``input`` and ``output``.
        model: Passed through to ``evaluate_example``.
        tokenizer: Passed through to ``evaluate_example``.
        num_egs: Number of in-context examples placed before the query.
        ds_name: Prompt template to use, ``'ni'`` or ``'medqa'``.
        method: ``'similarity'`` picks examples with ``group_by_similarity``;
            ``'random'`` picks them with ``group_examples_random``.
        m_choices: Candidate pool size handed to ``group_by_similarity``
            (only used when ``method='similarity'``). Defaults to 100.

    Returns:
        ``(reals, preds)``: parallel lists of reference answers and model
        outputs. Records whose prediction is ``None`` or an empty string are
        left out of both lists.

    Raises:
        ValueError: If ``ds_name`` or ``method`` is not one of the supported
            values. Checked up front instead of failing inside the loop on an
            unbound local variable.
    """
    if ds_name not in ('ni', 'medqa'):
        raise ValueError(f"Unsupported ds_name {ds_name!r}; expected 'ni' or 'medqa'")
    if method not in ('similarity', 'random'):
        raise ValueError(f"Unsupported method {method!r}; expected 'similarity' or 'random'")

    reals = []
    preds = []
    for example in test_dataset:
        if ds_name == 'ni':
            curr_prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']} \n ### Targets:"
            real = f"{example['targets']}"
        else:  # 'medqa'
            curr_prompt = f"### Question: {example['input']} \n ###Targets:"
            real = f"{example['output']}"

        if method == 'similarity':
            demonstrations = group_by_similarity(curr_prompt, train_dataset, num_egs, m_choices)
        else:  # 'random'
            demonstrations = group_examples_random(train_dataset, num_egs)
        icl_prompt = demonstrations + curr_prompt

        pred = evaluate_example(model, tokenizer, icl_prompt)
        # Falsy predictions (None for skipped prompts, or an empty string) are dropped.
        if pred:
            reals.append(real)
            preds.append(pred)

    return reals, preds

In [3]:
# Bound on the number of in-context examples per prompt; the experiment cells
# below use it as the (exclusive) stop value of their range(...) loops.
max_num_egs = 3 #natural instructions are just too big

# Off-the-shelf "gpt2" checkpoint and its tokenizer; the model is moved to the
# device selected in the configuration cell.
model_plain =  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer_plain = GPT2Tokenizer.from_pretrained("gpt2")
print("models retrieved")


models retrieved


### Test on Natural Instructions Data

In [7]:
# Natural Instructions experiment: similarity-selected in-context examples,
# scored with BERTScore F1 for each number of examples.
data = load_from_disk('data/1000_per_task')

# NOTE: the filtering step is currently disabled, so the message below is
# printed even though no filtering happens.
# data = filter_icl(data, max_num_egs, tokenizer_plain)

print("Filtering done")


# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

train_dataset = data['train']
test_dataset = data['test'].select(range(10))  # small smoke-test subset

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset)
print("Length of train set", len(train_list))

bert_scores = []
results_data = []
for i in range(1, max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model_plain, tokenizer_plain, i, method='similarity')
    if not preds:
        # Every prompt was skipped or produced nothing: there is nothing to
        # score, and averaging would divide by zero.
        print(f"No predictions for ICL ({i} eg); skipping scoring")
        continue
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = F1.mean()
    print(f"Average F1 score for ICL ({i} eg) with {len(preds)} test cases: ", average_F1)
    bert_scores.append(average_F1)
    # 'num_egs' is the number of in-context examples (i); the number of scored
    # test cases is recorded separately.
    results_data.append({'num_egs' : i, 'num_test_cases' : len(preds), 'bert_score' : float(average_F1)})

results_df = pd.DataFrame(results_data)
results_df.to_csv('icl_results_ni_similarity.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(1, max_num_egs), bert_scores)  # x-range matches the loop above
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score') 
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(1, max_num_egs))
# plt.savefig('BERT_scores_icl_ni.png')



Filtering done
Length of test set:  10
Length of train set 575481


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 score for ICL (1 eg) with 10 test cases:  tensor(0.7873)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 score for ICL (2 eg) with 10 test cases:  tensor(0.7909)


### Test on Medical QA Dataset

In [9]:
# Medical QA experiment: randomly selected in-context examples, scored with
# BERTScore F1 and BLEU for each number of examples (including zero-shot).
max_num_egs=3
bleu = evaluate.load('bleu')

data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(10))  # small smoke-test subset


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medqa')
print("Length of train set", len(train_list))

bert_scores = []
results_data = []
for i in range(max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model_plain, tokenizer_plain, i, ds_name='medqa', method='random')
    if not preds:
        # Every prompt was skipped or produced nothing: there is nothing to
        # score, and averaging would divide by zero.
        print(f"No predictions for ICL ({i} eg); skipping scoring")
        continue
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = F1.mean()
    print(f"Average F1 score for ICL ({i} eg) with {len(preds)} test cases: ", average_F1)
    bert_scores.append(average_F1)
    bleu_score = bleu.compute(predictions=preds, references=reals)
    # 'num_egs' is the number of in-context examples (i); the number of scored
    # test cases is recorded separately.
    results_data.append({'num_egs' : i, 'num_test_cases' : len(preds), 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu']})

results_df = pd.DataFrame(results_data)
results_df.to_csv('icl_results_med_mea_qa_random.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score')
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_medqa.png')


Length of test set:  10
Length of train set 8142


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 score for ICL (0 eg) with 10 test cases:  tensor(0.7270)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 score for ICL (1 eg) with 10 test cases:  tensor(0.7703)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 score for ICL (2 eg) with 10 test cases:  tensor(0.7694)
