## In-Context Learning (ICL) (Done by Zaki)

In [1]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)
# BLEU metric loaded through the `evaluate` library; later scoring cells call bleu.compute().
bleu = evaluate.load('bleu')

  from .autonotebook import tqdm as notebook_tqdm



Device being used: cuda


In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def sent_similarity(sent1, sent2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both sentences are encoded together with the module-level ``sent_model``;
    the off-diagonal entry of the resulting pairwise similarity matrix is the
    similarity between ``sent1`` and ``sent2``.
    """
    pair_embeddings = sent_model.encode([sent1, sent2])
    return cosine_similarity(pair_embeddings)[0][1]

# def sent_similarity(prompt, sentences):
#     P, R, F1 = bert_score.score([prompt]*len(sentences),sentences, lang="en")
#     return F1.tolist()


def format_examples(ds, ds_name='ni'):
    """Render every example in ``ds`` as a prompt string.

    ``ds_name`` selects the template: 'ni', 'medmcq' / 'medmcq_2tok',
    'finance_sent' / 'finance_sent_2tok', 'medqa' (uses the example's 'text'
    field verbatim), 'lawqa' or 'alpaca'. An unrecognised name yields an
    empty list.
    """
    if ds_name == 'ni':
        def render(ex):
            return f"### Task: {ex['definition']}\n ### Inputs: {ex['inputs']}\n ### Targets: {ex['targets']}"
    elif ds_name in ('medmcq', 'medmcq_2tok'):
        def render(ex):
            return f"### Task: {ex['instruction']}\n ### Question: {ex['input']}\n ### Answer: {ex['output']}"
    elif ds_name in ('finance_sent', 'finance_sent_2tok'):
        def render(ex):
            return f"### Text: {ex['text']}\n ### Sentiment: {ex['label']}"
    elif ds_name == 'medqa':
        def render(ex):
            return ex['text']
    elif ds_name == 'lawqa':
        def render(ex):
            return f"### Question: {ex['question']}\n ### Answer: {ex['answer']}"
    elif ds_name == 'alpaca':
        def render(ex):
            return f"### Instruction: {ex['instruction']}\n ### Input: {ex['input']}\n ### Output: {ex['output']}"
    else:
        # Unknown dataset names produce no prompts.
        return []

    return [render(ex) for ex in ds]

def select_characters_before_target(string, target_phrase="\n ### Targets:"):
    """Cut ``string`` off right after the first occurrence of ``target_phrase``.

    This removes the actual target values from a training example so that
    similarity matching only sees the part that precedes the answer. When the
    phrase does not occur, the string is returned unchanged.
    """
    cut_at = string.find(target_phrase)
    if cut_at == -1:
        return string
    return string[:cut_at] + target_phrase
    
def extract_response_content(string, target_phrase):
    """Return the text that follows the first ``target_phrase`` in ``string``.

    Leading and trailing whitespace is stripped from the result. When
    ``target_phrase`` does not occur there is no response to extract, so an
    empty string is returned.
    """
    marker_at = string.find(target_phrase)
    if marker_at == -1:
        # find() signals "not found" with -1; using that as an index would
        # return an arbitrary tail of the string instead of a response.
        return ""
    return string[marker_at + len(target_phrase):].strip()

def group_examples_random(ds, n):
    """Concatenate ``n`` randomly drawn examples from ``ds`` into one prompt prefix.

    The RNG is re-seeded without an explicit seed on every call, and each
    selected example is followed by a newline.
    """
    random.seed()
    picked = random.sample(ds, n)
    return "".join(example + "\n" for example in picked)

def create_similarity_dict(prompt, train_ds, n_egs=5, target_phrase="\n ### Targets:"):
    """Return the ``n_egs`` training examples most similar to ``prompt``.

    Every example in ``train_ds`` is scored against ``prompt`` with
    ``sent_similarity`` after its target has been cut off at ``target_phrase``.
    Despite the function name, the result is a list of example strings ordered
    from most to least similar.
    """
    scores = {
        example: sent_similarity(prompt, select_characters_before_target(example, target_phrase))
        for example in train_ds
    }
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [example for example, _ in ranked[:n_egs]]


def group_by_similarity(prompt, ds, n_egs, m_choices, target_phrase="\n ### Targets:"):
    """Build a demonstration prefix from the examples most similar to ``prompt``.

    A candidate pool of up to ``m_choices`` examples is drawn from ``ds``, each
    candidate is scored against ``prompt`` with ``sent_similarity`` (after its
    target has been cut off at ``target_phrase``), and the ``n_egs`` best
    candidates are concatenated, each followed by a newline.

    Args:
        prompt: Query text the demonstrations are matched against.
        ds: List of formatted training examples (strings).
        n_egs: Number of demonstrations to keep.
        m_choices: Requested size of the random candidate pool; clamped to
            ``len(ds)`` so that a small training set no longer makes
            ``random.sample`` raise.
        target_phrase: Marker after which a candidate's answer starts.

    Returns:
        The selected demonstrations as one string ("" when nothing is selected).
    """
    # The fixed seed makes the candidate pool identical on every call for the
    # same ``ds`` and pool size. It also resets the global ``random`` state.
    random.seed(42)
    pool_size = min(m_choices, len(ds))
    candidates = random.sample(ds, pool_size)

    scores = {}
    for candidate in candidates:
        scores[candidate] = sent_similarity(
            prompt, select_characters_before_target(candidate, target_phrase)
        )

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return "".join(text + "\n" for text, _ in ranked[:n_egs])

def count_tokens(tokenizer, prompt):
    """Return how many token ids ``tokenizer`` produces for ``prompt``.

    Special tokens are requested from the tokenizer and included in the count.
    """
    return len(tokenizer.encode(prompt, add_special_tokens=True))


def evaluate_example(model, tokenizer, prompt, model_name, max_tokens):
    """Generate a continuation of ``prompt`` and return only the new text.

    For 'gpt2_small' / 'gpt2_medium', prompts of 900 tokens or more are skipped
    (``None`` is returned) and generation is called without a ``do_sample``
    flag. For 'mistral', generation is called with ``do_sample=True``. Any
    other ``model_name`` returns ``None``. ``max_tokens`` is passed on as
    ``max_new_tokens``.
    """
    if model_name == 'mistral':
        model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
        print("Max number of tokens: ", max_tokens)
        sampling = {'do_sample': True}
    elif model_name in ('gpt2_small', 'gpt2_medium'):
        num_tokens = count_tokens(tokenizer, prompt)
        if num_tokens >= 900:
            # Over-long prompts are not evaluated at all.
            return None
        print(max_tokens, num_tokens)
        model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
        sampling = {}
    else:
        return None

    outputs = model.generate(
        **model_inputs,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_tokens,
        **sampling,
    )
    # Drop the echoed prompt so only the newly generated tokens are decoded.
    prompt_len = len(model_inputs['input_ids'][0])
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
  

_ICL_DATASET_NAMES = (
    'ni', 'medmcq', 'medmcq_2tok', 'finance_sent', 'finance_sent_2tok',
    'medqa', 'lawqa', 'alpaca',
)
_ICL_METHODS = ('similarity', 'random')


def _build_icl_query(example, ds_name, tokenizer, max_tokens_dict=None):
    """Build the query for one test example.

    Returns a tuple ``(curr_prompt, real, max_tokens, target_phrase)``: the
    prompt ending at the answer marker, the reference answer, the generation
    budget, and the marker used to cut answers off demonstrations before
    similarity matching.
    """
    def reference_length(text):
        # Prefer the caller's pre-computed count; otherwise count directly so
        # a missing dictionary (or entry) does not abort the run.
        if max_tokens_dict is not None and text in max_tokens_dict:
            return max_tokens_dict[text]
        return count_tokens(tokenizer, text)

    if ds_name == 'ni':
        curr_prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets:"
        real = f"{example['targets']}"
        return curr_prompt, real, reference_length(real) + 100, "\n ### Targets:"

    if ds_name in ('medmcq', 'medmcq_2tok'):
        curr_prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer:"
        real = f"{example['output']}"
        answer_tokens = count_tokens(tokenizer, real)
        # The '_2tok' variant allows exactly the reference length, no slack.
        max_tokens = answer_tokens if ds_name == 'medmcq_2tok' else answer_tokens + 100
        return curr_prompt, real, max_tokens, "### Answer:"

    if ds_name in ('finance_sent', 'finance_sent_2tok'):
        curr_prompt = f"### Text: {example['text']}\n ### Sentiment:"
        real = f"{example['label']}"
        max_tokens = 2 if ds_name == 'finance_sent_2tok' else count_tokens(tokenizer, real) + 100
        # The demonstrations use "### Sentiment:"; the old default marker never
        # matched, so the label stayed in the text used for similarity matching.
        return curr_prompt, real, max_tokens, "### Sentiment:"

    if ds_name == 'medqa':
        curr_prompt = select_characters_before_target(example['text'], "### Response:")
        real = extract_response_content(example['text'], "### Response:")
        # Same marker as the prompt, so demonstrations are matched on their
        # question part only.
        return curr_prompt, real, reference_length(real) + 100, "### Response:"

    if ds_name == 'lawqa':
        curr_prompt = f"### Question: {example['question']}\n ### Answer:"
        real = example['answer']
        return curr_prompt, real, reference_length(real) + 100, "### Answer:"

    if ds_name == 'alpaca':
        curr_prompt = f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Output:"
        real = example['output']
        return curr_prompt, real, reference_length(real) + 100, "### Output:"

    raise ValueError(f"Unknown ds_name {ds_name!r}; expected one of {_ICL_DATASET_NAMES}")


def evaluate_icl(train_dataset, test_dataset, model, tokenizer, num_egs, model_name, ds_name='ni', method='similarity', max_tokens_dict=None, num_candidates=250):
    """Run in-context-learning evaluation over ``test_dataset``.

    For every test example a query prompt is built, ``num_egs`` demonstrations
    from ``train_dataset`` are prepended, and the model's continuation is
    collected together with the reference answer.

    Args:
        train_dataset: List of formatted training examples (strings).
        test_dataset: Iterable of test examples (dict-like rows).
        model: Model handed to ``evaluate_example``.
        tokenizer: Tokenizer handed to ``evaluate_example`` and used for
            token counting.
        num_egs: Number of demonstrations per prompt.
        model_name: Model key understood by ``evaluate_example``.
        ds_name: Dataset key selecting the prompt template.
        method: 'similarity' (most similar demonstrations) or 'random'.
        max_tokens_dict: Optional mapping from reference answer to its token
            count; when omitted, or lacking an entry, the count is computed
            with ``count_tokens``.
        num_candidates: Size of the random candidate pool scored by the
            'similarity' method.

    Returns:
        ``(reals, preds)``: lower-cased references and predictions, aligned by
        index. Examples without a usable prediction are left out of both.

    Raises:
        ValueError: If ``ds_name`` or ``method`` is not supported.
    """
    # Fail before any generation instead of hitting an unbound variable
    # part-way through the loop.
    if ds_name not in _ICL_DATASET_NAMES:
        raise ValueError(f"Unknown ds_name {ds_name!r}; expected one of {_ICL_DATASET_NAMES}")
    if method not in _ICL_METHODS:
        raise ValueError(f"Unknown method {method!r}; expected one of {_ICL_METHODS}")

    reals = []
    preds = []
    for counter, example in enumerate(test_dataset):
        curr_prompt, real, max_tokens, target_phrase = _build_icl_query(
            example, ds_name, tokenizer, max_tokens_dict
        )

        if method == 'similarity':
            demonstrations = group_by_similarity(curr_prompt, train_dataset, num_egs, num_candidates, target_phrase)
        else:
            demonstrations = group_examples_random(train_dataset, num_egs)
        icl_prompt = demonstrations + curr_prompt

        print("ICL prompt complete")
        pred = evaluate_example(model, tokenizer, icl_prompt, model_name, max_tokens)
        print("Prediction complete")

        # Dump every other example for manual inspection.
        if counter % 2 == 0:
            print("PROMPT:\n", icl_prompt)
            print("REAL ANSWER:\n", real)
            print("PREDICTION:\n", pred)

        # Skips None (e.g. prompt too long for GPT-2) and empty generations,
        # keeping the two lists aligned.
        if pred:
            reals.append(real.lower())
            preds.append(pred.lower())

    return reals, preds

#### Mistral in 8-bit is too large for memory (the 4-bit configuration below is used instead)

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# tokenizer_mist_8= AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# model_mist_8 = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",  load_in_8bit=True, device_map='cuda')

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig




# model_plain =  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# tokenizer_plain = GPT2Tokenizer.from_pretrained("gpt2")
# print("models retrieved")
# Quantisation settings for loading Mistral: 4-bit weights, NF4 quant type,
# double quantisation enabled, compute dtype set to bfloat16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)



# Load Mistral-7B with the quantisation config above, plus its matching tokenizer.
model_mist = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=bnb_config, device_map="auto")
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards: 100%|██████████| 2/2 [00:43<00:00, 21.66s/it]


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# GPT-2 ("gpt2" checkpoint) moved to the selected device, plus its tokenizer.
# Referred to as 'gpt2_small' in the experiment cells.
model_gpt2=  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:


# GPT-2 medium ("gpt2-medium" checkpoint) moved to the selected device, plus its tokenizer.
# Referred to as 'gpt2_medium' in the experiment cells.
model_gpt2_med=  GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)
tokenizer_gpt2_med = GPT2Tokenizer.from_pretrained("gpt2-medium")

In [None]:
def evaluate_example2(model, tokenizer, prompt):
    """Generate at most five new tokens for ``prompt`` with sampling disabled.

    Only the newly generated tokens are decoded and returned; the echoed
    prompt is dropped.
    """
    encoded = tokenizer([prompt], return_tensors="pt").to(device)
    generated = model.generate(
        **encoded,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        max_new_tokens=5,
    )
    prompt_len = len(encoded['input_ids'][0])
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)


# Smoke test: a few-shot sentiment prompt whose last line is left open for the
# model to complete.
# NOTE(review): this is a single triple-quoted string, so the inner double
# quotes and the indentation are part of the prompt text, and each "\n" adds a
# newline on top of the literal line break — confirm this is intended.
prompt = """ "featuring an oscar-worthy performance => positive\n"
    "completely messed up => negative\n"
    "masterpiece => positive\n"
    "the action is stilted => negative\n"
    "by far the worst movie of the year =>" """
# Complete the prompt with the quantised Mistral model and show the continuation.
pred = evaluate_example2(model_mist, tokenizer_mist, prompt) 
print(pred)

In [5]:

# BLEURT metric ('bleurt-large-512' configuration) loaded through `evaluate`.
bleurt = evaluate.load("bleurt",'bleurt-large-512')



INFO:tensorflow:Reading checkpoint C:\Users\zakit\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\3f937bb8d45f43db16ed64e68427a81be6250c9c6b0704e2e5ce3e3099d274c8\bleurt-large-512.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.

INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


### Test on Natural Instructions Data

In [None]:


# Model under test in this cell: GPT-2 medium.
tokenizer = tokenizer_gpt2_med
model = model_gpt2_med

# Dataset saved on disk; its 'train' and 'test' splits are used below.
data = load_from_disk('data/1000_per_task')

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  3   #natural instructions are just too big



# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

train_dataset = data['train']
test_dataset = data['test']

# Keep at most 10 test rows per task, then cap the evaluation set at the first 100 rows.
grouped_test_dataset = test_dataset.to_pandas().groupby('task_name').apply(lambda x: x.head(10)).reset_index(drop=True) #pick an array of tasks

test_dataset = Dataset.from_pandas(grouped_test_dataset.head(100))


icl_method = 'similarity'
model_name = 'gpt2_medium'
ds_name = 'ni'

def filter_example(example):
    # Keep only training examples whose fully formatted prompt is at most 300 tokens.
    return count_tokens(tokenizer, f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets: {example['targets']}") <= 300 #speeds up process

train_dataset = train_dataset.filter(filter_example)

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name="ni")
print("Length of train set", len(train_list))

# Token count of every reference answer, keyed by the answer text; evaluate_icl
# looks it up to size the generation budget.
max_token_dict = {}
avg_token_size = 0
for eg in test_dataset:
    max_token_dict[eg['targets']] = count_tokens(tokenizer, eg['targets'])
    tokens = count_tokens(tokenizer, f"### Task: {eg['definition']}\n ### Inputs: {eg['inputs']}\n ### Targets: {eg['targets']}")
    avg_token_size+= tokens

# Average token count of a fully formatted test example.
print(avg_token_size/len(test_dataset))

bert_scores = []
results_data = []
# NOTE(review): the file name carries an '8bit' suffix although model_name is
# 'gpt2_medium'; the reader cells open '{icl_method}_{ds_name}_{model_name}.txt'
# without it — confirm the suffix is intended.
# Mode 'w' truncates any existing results file.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}8bit.txt', 'w', encoding='utf-8') as file:
    # One evaluation pass per demonstration count i = 0 .. max_num_egs - 1.
    for i in range(max_num_egs):
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)

        # Three lines per pass: references, predictions, blank separator.
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')

### Test on Alpaca Dataset

In [None]:
# Model under test in this cell: GPT-2 medium.
tokenizer = tokenizer_gpt2_med
model = model_gpt2_med

# Alpaca ships a single 'train' split; it is split into train/test below.
data = load_dataset('tatsu-lab/alpaca')['train']

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  3   #exclusive upper bound on the demonstration count used in the loop below

bleu = evaluate.load('bleu')

def filter_example(example):
    # Keep only examples whose fully formatted prompt is at most 300 tokens.
    return count_tokens(tokenizer, f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Output: {example['output']}") <= 300

data = data.filter(filter_example)


# Fixed-seed 80/20 split; only the first 100 test rows are evaluated.
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

# train_dataset = data['train']
# test_dataset = data['test'].select(range(100))


icl_method = 'random'
model_name = 'gpt2_medium'
ds_name = 'alpaca'

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name="alpaca")
print("Length of train set", len(train_list))

# Token count of every reference output, keyed by the output text; evaluate_icl
# looks it up to size the generation budget.
max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['output']] = count_tokens(tokenizer, eg['output'])

avg_token_size = 0
for eg in test_dataset:
    print(eg)
    tokens = count_tokens(tokenizer, f"### Instruction: {eg['instruction']}\n ### Input: {eg['input']}\n ### Output: {eg['output']}")
    avg_token_size+= tokens

# Average token count of a fully formatted test example.
print(avg_token_size/len(test_dataset))

bert_scores = []
results_data = []
# Mode 'w' truncates any existing results file.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'w', encoding='utf-8') as file:

    # Starts at 2, so with max_num_egs = 3 only the 2-demonstration setting runs.
    for i in range(2, max_num_egs):
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
        # P, R, F1 = bert_score.score(preds, reals, lang="en")
        # average_F1 = sum(F1) / len(F1)
        # bert_scores.append(average_F1)
        # refs = [[r] for r in reals]
        # bleu_score = bleu.compute(predictions=preds, references=refs, max_order=1)  # set order to mean of real values
        # bleurt_score = bleurt.compute(predictions=preds, references=reals)
        # avg_bleurt = sum(bleurt_score['scores']) / len(bleurt_score['scores'])
        
        # Write real values and predictions to the file
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')


In [None]:
# print(int(sum(len(s) for s in refs)/len(refs)))
# test = bleu.compute(predictions=['No', 'Yes', 'Yes', 'yes', 'Yes', 'yes', 'yes', 'No', 'yes', 'No'],
#                     references=['No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.'], max_order=int(sum(len(s) for s in refs)/len(refs)))
# test

### Test on Medical MCQ Dataset

In [7]:
bleu = evaluate.load('bleu')

# Model under test in this cell: the quantised Mistral model.
tokenizer = tokenizer_mist
model = model_mist

max_num_egs = 3

# Fixed-seed 80/20 split; only the first 100 test rows are evaluated.
data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

def filter_example(example):
    # Keep only examples whose fully formatted prompt is at most 300 tokens.
    return count_tokens(tokenizer, f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer: {example['output']}") <= 300

avg_token_size = 0
for example in test_dataset:
    tokens = count_tokens(tokenizer, f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer: {example['output']}")
    avg_token_size+= tokens

# Average token count of a fully formatted test example.
print(avg_token_size/len(test_dataset))

# Only the training split is length-filtered; the test rows are kept as selected above.
train_dataset = train_dataset.filter(filter_example)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medmcq')
print("Length of train set", len(train_list))

icl_method = 'similarity'
model_name = 'mistral'
ds_name = 'medmcq'

bert_scores = []
results_data = []

with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'w', encoding='utf-8') as file: #when opened, replaces everything -- be careful
    # One evaluation pass per demonstration count i = 0 .. max_num_egs - 1.
    for i in range(max_num_egs):
        accuracy = 0
        # No max_tokens_dict is passed: the 'medmcq' branch of evaluate_icl
        # tokenizes the reference answer itself to size the generation budget.
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method)
       

        # Three lines per pass: references, predictions, blank separator.
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')



293.23
Length of test set:  100
Length of train set 4826
ICL prompt complete
Max number of tokens:  110


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Prediction complete
PROMPT:
 ### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 35-year-old woman comes to your office with a variety of complaints. As part of her evaluation, she undergoes laboratory testing which reveals the presence of anti-centromere antibodies. All of the following symptoms and signs would be expected to be present EXCEPT:? 
{'A': 'Pallor, cyanosis, and erythema of the hands', 'B': 'Calcium deposits on digits', 'C': 'Blanching vascular abnormalities', 'D': 'Hypercoagulable state', 'E': 'Heartburn and regurgitation'},
 ### Answer:
REAL ANSWER:
 D: Hypercoagulable state
PREDICTION:
 'D' hypercoagulable state
 ### Explanation:
 #### Q:A 35-year-old woman comes to your office with a variety of complaints. As part of her evaluation, she undergoes laboratory testing which reveals the presence of anti-centromere antibodies.
 #### Answer: 'D'
 #### Hypercoagulable state would not be expected during the patient's evaluation
 #### Cytokatin C1 

In [None]:
preds

In [None]:
# BERTScore for the predictions/references left in `preds` / `reals` by the last
# evaluation pass; the call returns three values, unpacked as P, R and F1.
P, R, F1 = bert_score.score(preds, reals, lang="en")
# Mean F1 over all scored pairs.
average_F1 = sum(F1) / len(F1)
average_F1

In [None]:
# results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)
# results_df.head()

### Test on Transcript Sentiment Analysis

In [None]:

max_num_egs = 3

# Fixed-seed 80/20 split; only the first 100 test rows are evaluated.
data = load_dataset('jlh-ibm/earnings_call', 'transcript-sentiment')['train']
bleu = evaluate.load('bleu')

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

# Model under test in this cell: GPT-2 (small).
tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    # Keep only training examples whose fully formatted prompt is at most 300 tokens.
    return count_tokens(tokenizer, f"### Text: {example['text']}\n ### Sentiment: {example['label']}") <= 300

train_dataset = train_dataset.filter(filter_example)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='finance_sent_2tok')
print("Length of train set", len(train_list))

icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'finance_sent_2tok'

bert_scores = []
results_data = []

# Mode 'w' truncates any existing results file, so every pass must be written
# back. With the writes disabled, each run left an empty file behind.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'w', encoding='utf-8') as file:
    # One evaluation pass per demonstration count i = 0 .. max_num_egs - 1.
    for i in range(max_num_egs):
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method)

        # Three lines per pass: references, predictions, blank separator.
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')



In [None]:
average_F1

### Testing on Medicine QA

In [5]:

max_num_egs = 3

data = load_dataset('Laurent1/MedQuad-MedicalQnADataset_128tokens_max')['train']
bleu = evaluate.load('bleu')

# Model under test in this cell: GPT-2 (small).
tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    # Unlike the other cells, this counts only the tokens of the response part
    # (the text after "### Response:"), not of the whole formatted example.
    return count_tokens(tokenizer, extract_response_content(example['text'], "### Response:")) <= 300

data = data.filter(filter_example)

print(len(data))

# Fixed-seed 80/20 split; only the first 100 test rows are evaluated.
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


# Token count of every reference response, keyed by the response text;
# evaluate_icl looks it up to size the generation budget.
max_token_dict = {}
for example in test_dataset:
    real = extract_response_content(example['text'], "### Response:")
    max_token_dict[real] = count_tokens(tokenizer, real)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medqa')
print("Length of train set", len(train_list))

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 


icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'medqa'

bert_scores = []
results_data = []

# Mode 'w' truncates any existing results file.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'w', encoding='utf-8') as file:
    # One evaluation pass per demonstration count i = 0 .. max_num_egs - 1.
    for i in range(max_num_egs):
        # accuracy = 0
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
        # P, R, F1 = bert_score.score(preds, reals, lang="en")
        # average_F1 = sum(F1) / len(F1)
        # bert_scores.append(average_F1)
        # refs = [[r] for r in reals]
        # order = int(sum(len(s) for s in refs)/len(refs))
        # print("order: ", order)
        # bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
        # print("DOING BLEURT")
        # bleurt_score = bleurt.compute(predictions=preds, references=reals)
        # print("DONE BLEURT")
        # avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
        # results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'bleurt' : avg_bleurt})

        # Three lines per pass: references, predictions, blank separator.
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')

# results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

15549
Length of test set:  100
Length of train set 12439
ICL prompt complete
152 36
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL ANSWER:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
                                                                                                                                                         
ICL prompt complete
158 41
Prediction complete
ICL prompt complete
170 41
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    What is (are) isolated Duane retraction syndrome ?
    ### Response:
REAL ANSWER:
 Isolated Duane retraction syndrome is a disorder of eye move

### Law QA

In [None]:

max_num_egs = 3

data = load_dataset('dzunggg/legal-qa-v1')['train']
bleu = evaluate.load('bleu')

# Model under test in this cell: GPT-2 (small).
tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    # Keep only examples whose fully formatted prompt is at most 300 tokens.
    return count_tokens(tokenizer, f"### Question: {example['question']}\n ### Answer: {example['answer']}") <= 300

data = data.filter(filter_example)

# Fixed-seed 80/20 split; only the first 100 test rows are evaluated.
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='lawqa')
print("Length of train set", len(train_list))

# Token count of every reference answer, keyed by the answer text; evaluate_icl
# looks it up to size the generation budget.
max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['answer']] = count_tokens(tokenizer, eg['answer'])

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 



# print("Token Dict complete")

icl_method = 'random'
model_name = 'gpt2_small'
ds_name = 'lawqa'

bert_scores = []
results_data = []

# Mode 'w' truncates any existing results file.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'w', encoding='utf-8') as file:
    # One evaluation pass per demonstration count i = 0 .. max_num_egs - 1.
    for i in range(max_num_egs):
        accuracy = 0
        reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
        # P, R, F1 = bert_score.score(preds, reals, lang="en")
        # average_F1 = sum(F1) / len(F1)
        # bert_scores.append(average_F1)
        # refs = [[r] for r in reals]
        # order = int(sum(len(s) for s in refs)/len(refs))
        # bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
        # print("DOING BLEURT")
        # bleurt_score = bleurt.compute(predictions=preds, references=reals)
        # print("DONE BLEURT")
        # avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
        # results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'bleurt':avg_bleurt})

        # Three lines per pass: references, predictions, blank separator.
        file.write(f'Reals ({i}): {reals}\n')
        file.write(f'Preds ({i}): {preds}\n\n')

# results_data is never appended to in this cell (the metric code above is
# commented out), so this DataFrame is empty.
results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

### Reading in ICL Results

In [9]:
import ast

reals_list = []
preds_list = []

icl_method = 'similarity'
ds_name = 'finance_sent'
model_name='gpt2_small'

results_data = []


def _parse_labeled_list(line):
    """Parse one 'Reals (k): [...]' / 'Preds (k): [...]' line into ``(k, values)``.

    The label is located by its delimiters instead of a fixed 11-character
    slice, so multi-digit demonstration counts parse correctly.
    """
    label, payload = line.split(': ', 1)
    num_demonstrations = int(label[label.index('(') + 1:label.index(')')])
    return num_demonstrations, ast.literal_eval(payload.strip())


# Read the whole results file up front; scoring happens after it is closed.
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
print(len(lines))

# Each pass was written as three lines: references, predictions, blank separator.
for start in range(0, len(lines) - 1, 3):
    # Take the demonstration count from the label written in the file: a file
    # whose first pass is not 0 would be mislabelled by a position-based count.
    num_demonstrations, reals = _parse_labeled_list(lines[start])
    _, preds = _parse_labeled_list(lines[start + 1])
    print(reals)
    print(preds)

    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)

    # BLEU is given one single-element reference list per prediction.
    refs = [[r] for r in reals]
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=1)
    bleu_score2 = bleu.compute(predictions=preds, references=refs, max_order=2)
    # bleurt_score = bleurt.compute(predictions=preds, references=reals)
    # avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({
        'num_samples': len(preds),
        'num_demonstrations': num_demonstrations,
        'bert_score': float(average_F1),
        'bleu_score-1': bleu_score['bleu'],
        'bleu-2': bleu_score2['bleu'],
    })

results_df = pd.DataFrame(results_data)
results_df.head()

9
['positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', '

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
results_df.head()

### Looking at some individual responses

In [None]:
import ast

reals_list = []
preds_list = []

icl_method = 'similarity'
ds_name = 'medmcq'
model_name='gpt2_small'

results_data = []

# Read the ICL results file; each pass was written as three lines
# (references, predictions, blank separator).
with open(f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

for start in range(0, len(lines) - 1, 3):
    # Split on the first ': ' rather than slicing a fixed 11 characters, so
    # labels with a multi-digit demonstration count still parse.
    reals = ast.literal_eval(lines[start].split(': ', 1)[1].strip())
    preds = ast.literal_eval(lines[start + 1].split(': ', 1)[1].strip())
    reals_list.append(reals)
    preds_list.append(preds)


In [None]:
import ast

reals_list2 = []
preds_list2 = []


results_data = []

# Read the results file under 'it_results'; it uses the same three-line layout
# per pass (references, predictions, blank separator).
with open(f'it_results/outputs/medmcq_gpt2ni.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

for start in range(0, len(lines) - 1, 3):
    # Split on the first ': ' rather than slicing a fixed 11 characters, so
    # labels with a multi-digit count still parse.
    reals = ast.literal_eval(lines[start].split(': ', 1)[1].strip())
    preds = ast.literal_eval(lines[start + 1].split(': ', 1)[1].strip())
    reals_list2.append(reals)
    preds_list2.append(preds)

In [None]:
# Unigram BLEU (max_order=1) over the first pass stored in preds_list2 / reals_list2.
bleu_score2 = bleu.compute(predictions=preds_list2[0], references= reals_list2[0], max_order=1)
bleu_score2

In [None]:
# Unigram BLEU for one prediction/reference pair (index 5) from preds_list2 / reals_list2.
bleu_score = bleu.compute(predictions=[preds_list2[0][5]], references= [reals_list2[0][5]], max_order=1)
bleu_score

In [None]:
# BLEU for the same example index (5), taken from the third pass in preds_list / reals_list.
# No max_order is given here, so the metric's default order applies.
bleu_score = bleu.compute(predictions=[preds_list[2][5]], references= [reals_list[2][5]])
bleu_score

In [None]:
# Side-by-side view of example 5: the prediction from preds_list (labelled ICL),
# the prediction from preds_list2 (labelled IT), and the reference from reals_list2.
print("PREDICTION (ICL): ", preds_list[2][5])
print("PREDICTION (IT): ", preds_list2[0][5])
print("REAL: ", reals_list2[0][5])

In [None]:
# First-character accuracy: a prediction counts as correct when its first
# non-whitespace character equals that of the reference (e.g. the option letter).
acc = 0
for pred, real in zip(preds_list2[0], reals_list2[0]):
    # Slicing with [:1] yields '' for an empty or whitespace-only string,
    # where indexing with [0] raised IndexError.
    pred_letter = pred.strip()[:1]
    real_letter = real.strip()[:1]
    print(pred, real_letter)
    # An empty prediction never counts as a match.
    if pred_letter and pred_letter == real_letter:
        acc+=1
# Guard against an empty result list instead of dividing by zero.
acc = acc/len(reals_list2[0]) if reals_list2[0] else 0.0
acc

### Averaging ICL Results

In [None]:
import os
import pandas as pd

# Folder holding the result CSVs of the 'random' demonstration method
directory = 'icl_results/random'

# One DataFrame per matching CSV file
dfs = []

for filename in os.listdir(directory):
    # Skip everything that is not a gpt2_small result CSV.
    if not filename.endswith('.csv') or 'gpt2_small' not in filename:
        continue
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)
    dfs.append(df)

# Stack all runs, then average each metric per demonstration count.
concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = (
    concatenated_df
    .groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']]
    .mean()
)
print(grouped_df.head())


In [None]:
import os
import pandas as pd

# Folder holding the result CSVs of the 'similarity' demonstration method
directory = 'icl_results/similarity'

# One DataFrame per matching CSV file
dfs = []

for filename in os.listdir(directory):
    # Skip everything that is not a gpt2_small result CSV.
    if not filename.endswith('.csv') or 'gpt2_small' not in filename:
        continue
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)
    dfs.append(df)

# Stack all runs, then average each metric per demonstration count.
concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = (
    concatenated_df
    .groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']]
    .mean()
)
print(grouped_df.head())


In [None]:
import os
import pandas as pd

# Folder holding the result CSVs of the 'random' demonstration method
directory = 'icl_results/random'

# One DataFrame per matching CSV file
dfs = []

for filename in os.listdir(directory):
    # Skip everything that is not a mistral result CSV.
    if not filename.endswith('.csv') or 'mistral' not in filename:
        continue
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)
    dfs.append(df)

# Stack all runs, then average each metric per demonstration count.
concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = (
    concatenated_df
    .groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']]
    .mean()
)
print(grouped_df.head())


In [None]:
import os
import pandas as pd

# Folder holding the result CSVs of the 'similarity' demonstration method
directory = 'icl_results/similarity'

# One DataFrame per matching CSV file
dfs = []

for filename in os.listdir(directory):
    # Skip everything that is not a mistral result CSV.
    if not filename.endswith('.csv') or 'mistral' not in filename:
        continue
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)
    dfs.append(df)

# Stack all runs, then average each metric per demonstration count.
concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = (
    concatenated_df
    .groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']]
    .mean()
)
print(grouped_df.head())
