## ICL - GPT2

In [1]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)

  from .autonotebook import tqdm as notebook_tqdm



Device being used: cuda


In [17]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def sent_similarity(sent1, sent2):
    sentences = [sent1, sent2]
    embeddings = sent_model.encode(sentences)
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix[0][1]

# def sent_similarity(prompt, sentences):
#     P, R, F1 = bert_score.score([prompt]*len(sentences),sentences, lang="en")
#     return F1.tolist()


def format_examples(ds, ds_name='ni'):
    prompts = []
    if ds_name == 'ni':
        for example in ds:
            # prompt = f"### Question: {example['input']} \n ###Targets: {example['output']}"
            prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets: {example['targets']}"
            prompts.append(prompt)
    elif ds_name == 'medmcq':
         for example in ds:
            prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer: {example['output']}"
            prompts.append(prompt)
    elif ds_name == 'finance_sent':
        for example in ds:
            prompt = f"### Text: {example['text']}\n ### Targets: {example['label']}"
            prompts.append(prompt)
    elif ds_name == 'medqa':
        for example in ds:
            prompts.append(example['text'])
    elif ds_name == 'lawqa':
        for example in ds:
            prompt = f"### Question: {example['question']}\n ### Answer: {example['answer']}"
            prompts.append(prompt)
    elif ds_name == 'alpaca':
        for example in ds:
            prompt = f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Text: {example['text']} \n ### Output: {example['output']}"
            prompts.append(prompt)

    return prompts

def select_characters_before_target(string, target_phrase="\n ### Targets:"): #this is a function to remove the actual target values from the train example so that the matching can be improved
    target_index = string.find(target_phrase)
    if target_index != -1:  # If the phrase is found
        return string[:target_index] + target_phrase
    else:
        return string 
    
def extract_response_content(string, target_phrase):
    response_index = string.find(target_phrase)
    return string[response_index + len(target_phrase):].strip()

def group_examples_random(ds, n): #this is where we group examples into a larger prompt
    random.seed()
    samples = random.sample(ds, n)
    new_prompt = ""
    for i in range(n):
        new_prompt += samples[i]
        new_prompt += "\n"
    return new_prompt

def create_similarity_dict(prompt, train_ds, n_egs=5, target_phrase="\n ### Targets:"):
    similarity_dict = {}
    for eg in train_ds:
        similarity_dict[eg] = sent_similarity(prompt, select_characters_before_target(eg, target_phrase))
    sorted_dict = sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True)
    top_egs = []
    for item in sorted_dict[:n_egs]:
        top_egs.append(item[0])
    return top_egs


def group_by_similarity(prompt, ds, n_egs, m_choices, target_phrase="\n ### Targets:"):
    random.seed(42)
    choices = random.sample(ds, m_choices)
    cos_sim_dict = {}
    # bert_scores = sent_similarity(prompt, choices)
    # choices_with_scores = list(zip(choices, bert_scores))

    # sorted_choices = sorted(choices_with_scores, key=lambda x: x[1], reverse=True)
    # top_egs = ""
    # for choice, _ in sorted_choices[:n_egs]:
    #     top_egs += choice
    #     top_egs += "\n"

    # return top_egs
    

    for c in choices:
        cos_sim_dict[c] = sent_similarity(prompt, select_characters_before_target(c, target_phrase))

    sorted_cos_sim = sorted(cos_sim_dict.items(), key=lambda x: x[1], reverse=True)
    top_egs = ""
    for item in sorted_cos_sim[:n_egs]:
        top_egs += item[0]
        top_egs += "\n"

    # top_egs = "".join([item[0] for item in sorted_cos_sim[:n_egs]])
    return top_egs

def count_tokens(tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    return len(input_ids)


def evaluate_example(model, tokenizer, prompt, model_name, max_tokens):
    if model_name == 'gpt2_small':
        num_tokens = count_tokens(tokenizer, prompt)
        if num_tokens >= 900:
            return None
        print(max_tokens, num_tokens)
        model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs =model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=max_tokens) #set max_length to 1024 since GPT2 doesnt take nearly as long with ICL
        decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)
        return decoded_output
    elif model_name == 'mistral':
        # num_tokens = count_tokens(tokenizer, prompt)
        # print("Num tokens in prompt: ", num_tokens)
        # if num_tokens > 3400:
        #     return None
        model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
        print("Max number of tokens: ", max_tokens)
        outputs =model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, do_sample=True, max_new_tokens=max_tokens)
        decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True) #only get output
        return decoded_output
  

def evaluate_icl(train_dataset, test_dataset, model, tokenizer, num_egs, model_name, ds_name='ni', method='similarity', max_tokens_dict=None):
    reals = []
    preds = []
    counter = 0
    for example in test_dataset:
        # prompt = group_examples(train_dataset, num_egs) + f"### Question: {example['input']} \n ###Targets:"
        target_phrase = "\n ### Targets:"
        if ds_name == 'ni':
            curr_prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets:"
            real = f"{example['targets']}"
            max_tokens = max_tokens_dict[real] + 100
        elif ds_name == 'medmcq':
            curr_prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer:"
            real = f"{example['output']}"
            tokens = tokenizer(real, return_tensors='pt').to(device)
            max_tokens = len(tokens['input_ids'][0]) + 10
            target_phrase="### Answer:"
        elif ds_name == 'finance_sent':
            curr_prompt = f"### Text: {example['text']}\n ### Targets:"
            real = f"{example['label']}"
            tokens = tokenizer(real, return_tensors='pt').to(device)
            max_tokens = len(tokens['input_ids'][0]) + 10
        elif ds_name == 'medqa':
            curr_prompt = select_characters_before_target(example['text'], "### Response:")
            real = extract_response_content(example['text'], "### Response:")
            max_tokens = max_tokens_dict[real] + 100
        elif ds_name == 'lawqa':
            curr_prompt = f"### Question: {example['question']}\n ### Answer:"
            real = example['answer']
            max_tokens = max_tokens_dict[real] + 100
            target_phrase = "### Answer:"
        elif ds_name == 'alpaca':
            curr_prompt = f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Text: {example['text']}\n ### Output:"
            real = example['output']
            max_tokens = max_tokens_dict[real]+100
            target_phrase="### Output:"

        # tokens = tokenizer(real, return_tensors='pt').to(device)
        # max_tokens = len(tokens['input_ids'][0])
    
        if method == 'similarity':
            icl_prompt = group_by_similarity(curr_prompt, train_dataset, num_egs, 250, target_phrase) + curr_prompt
        elif method == 'random':
            icl_prompt = group_examples_random(train_dataset, num_egs) + curr_prompt

        # print("MAX TOKENS:\n", max_tokens)
        # print("\n ICL Prompt: ",icl_prompt)
        print("ICL prompt complete")
        pred = evaluate_example(model, tokenizer, icl_prompt, model_name, max_tokens)
        print("Prediction complete")

        if counter % 50 == 0:
            print("PROMPT:\n", icl_prompt)
            print("REAL ANSWER:\n", real)
            print("PREDICTION:\n", pred)
        if pred:
            reals.append(real.lower())
            preds.append(pred.lower())
        counter+=1

    return reals, preds

In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
tokenizer_mist_8= AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model_mist_8 = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",  load_in_8bit=True, device_map='cuda')

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:   0%|          | 0/2 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig




# model_plain =  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# tokenizer_plain = GPT2Tokenizer.from_pretrained("gpt2")
# print("models retrieved")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)



model_mist = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=bnb_config, device_map="auto")
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards: 100%|██████████| 2/2 [00:37<00:00, 18.70s/it]


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_gpt2=  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
def evaluate_example2(model, tokenizer, prompt):
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # print(prompt)
    # if len(tokenized_prompt['input_ids'][0]) > MAX_LENGTH: #currently just checking if random prompt is too big or not
    #     return None 
    outputs =model.generate(**model_inputs, pad_token_id= tokenizer.eos_token_id, do_sample=False, max_new_tokens = 5)
    decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)
    # print("prediction: ",decoded_output)/
    return decoded_output


prompt = """ "featuring an oscar-worthy performance => positive\n"
    "completely messed up => negative\n"
    "masterpiece => positive\n"
    "the action is stilted => negative\n"
    "by far the worst movie of the year =>" """
pred = evaluate_example2(model_mist_8, tokenizer_mist_8, prompt) 
print(pred)

In [6]:

bleurt = evaluate.load("bleurt",'bleurt-large-512')


INFO:tensorflow:Reading checkpoint C:\Users\zakit\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\3f937bb8d45f43db16ed64e68427a81be6250c9c6b0704e2e5ce3e3099d274c8\bleurt-large-512.


INFO:tensorflow:Reading checkpoint C:\Users\zakit\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\3f937bb8d45f43db16ed64e68427a81be6250c9c6b0704e2e5ce3e3099d274c8\bleurt-large-512.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


### Test on Natural Instructions Data

In [19]:


tokenizer = tokenizer_gpt2
model = model_gpt2

data = load_from_disk('data/1000_per_task')

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  3   #natural instructions are just too big

bleu = evaluate.load('bleu')

# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

train_dataset = data['train']
test_dataset = data['test']

grouped_test_dataset = test_dataset.to_pandas().groupby('task_name').apply(lambda x: x.head(10)).reset_index(drop=True) #pick an array of tasks

test_dataset = Dataset.from_pandas(grouped_test_dataset.head(100))


icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'ni'

def filter_example(example):
    return count_tokens(tokenizer, f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets: {example['targets']}") <= 300 #speeds up process

train_dataset = train_dataset.filter(filter_example)

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name="ni")
print("Length of train set", len(train_list))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['targets']] = count_tokens(tokenizer, eg['targets'])

# csv_file = f'icl_results/similarity_dicts_{ds_name}.csv'
# if os.path.isfile(csv_file):
#     df = pd.read_csv(csv_file)
#     test_similarity_dict = dict(zip(df['Prompt'], df['Similar_Prompts']))
# else:
#     test_similarity_dict = {}
#     for eg in test_dataset:
#         prompt = f"### Task: {eg['definition']}\n ### Inputs: {eg['inputs']}\n ### Targets:"
#         test_similarity_dict[eg] = create_similarity_dict(prompt, train_list)
#     print(test_similarity_dict)
#     df = pd.DataFrame(list(test_similarity_dict.items()), columns=['Prompt', 'Similar_Prompts'])
#     df.to_csv(csv_file, index=False)

# print(test_similarity_dict)


bert_scores = []
results_data = []
for i in range(max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print(order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order) #set order to mean of real values
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations':i, 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt_score' : avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

  grouped_test_dataset = test_dataset.to_pandas().groupby('task_name').apply(lambda x: x.head(10)).reset_index(drop=True) #pick an array of tasks


Length of test set:  100
Length of train set 487107
ICL prompt complete
102 112
Prediction complete
PROMPT:
 ### Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'.
 ### Inputs: Sentence: Jerry goes out to the pier and casts his favorite bait : cheese . 
Question: How much time did Jerry spend at the pier?
 ### Targets:
REAL ANSWER:
 No.
PREDICTION:
  Jerry, Jerry's wife, Jerry's girlfriend, Jerry's best friend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend, Jerry's best friend's girlfriend,

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
102 217
Prediction complete
PROMPT:
 ### Task: You are given a sentence, a question and two answer options ('A' and 'B'). Your task is to find the correct answer (return the string of the correct option, not 'A' or 'B') for the given question.
 ### Inputs: Sentence: Stabbing a fork into a steak causes it to move slower then stabbing it into a muffin. Question: Which surface provides less resistance? (A) muffin (B) steak
 ### Targets: muffin
### Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'.
 ### Inputs: Sentence: Jerry goes out to the pier and casts his favorite bait : cheese . 
Question: How much time did Jerry spend at the pier?
 ### Targets:
REAL ANSWER:
 No.
PREDICTION:
  cheese
##

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
102 317
Prediction complete
PROMPT:
 ### Task: You are given a sentence, a question and two answer options ('A' and 'B'). Your task is to find the correct answer (return the string of the correct option, not 'A' or 'B') for the given question.
 ### Inputs: Sentence: Stabbing a fork into a steak causes it to move slower then stabbing it into a muffin. Question: Which surface provides less resistance? (A) muffin (B) steak
 ### Targets: muffin
### Task: In this task, You are given an open-domain question that can be answered based on factual information. Your task is to provide \*short\* answer (in a few words only) for the given question. The short answer can be one or more entities or it can also be boolean \*yes\* or \*no\*.
 ### Inputs: who played scarlet o'hara in gone with the wind
 ### Targets: Vivien Leigh
### Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be '

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


### Test on Alpaca Dataset

In [21]:
tokenizer = tokenizer_mist
model = model_mist

data = load_dataset('tatsu-lab/alpaca')['train']

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  3   #natural instructions are just too big

bleu = evaluate.load('bleu')

def filter_example(example):
    return count_tokens(tokenizer, f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Text: {example['text']} \n ### Output: {example['output']}") <= 300

data = data.filter(filter_example)


train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

# train_dataset = data['train']
# test_dataset = data['test'].select(range(100))


icl_method = 'similarity'
model_name = 'mistral'
ds_name = 'alpaca'

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name="alpaca")
print("Length of train set", len(train_list))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['output']] = count_tokens(tokenizer, eg['output'])
print(max_token_dict)


bert_scores = []
results_data = []
for i in range(max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print(order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order) #set order to mean of real values
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations':i, 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt_score' : avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score') 
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_ni.png')



Length of test set:  100
Length of train set 33633
{'The maximum number of steps in a Fibonacci sequence is 93.': 18, 'An example of the Second Law of Thermodynamics is when heat flows from a hot object to a cold object. The hot object cools down, and the cold object warms up, but the energy is not conserved and some of the heat energy is lost.': 56, "It's time to get ready for the thrilling new movie! Get ready to experience edge-of-your-seat suspense, nail-biting suspense, and clever twists. #MovieNight #UpcomingMovie": 52, 'Prime numbers: 5, 31\nComposite numbers: 16, 9, 18': 25, 'Simile.': 4, 'A good exercise routine should incorporate aerobic exercise, strength training, and flexibility exercises. Examples of aerobic activities include walking, jogging, biking, or swimming. Strength training exercises include lifting weights, using resistance bands and bodyweight exercises. Flexibility exercises include stretching and yoga. A well structured routine should also have rest and recov

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
Max number of tokens:  118
Prediction complete
PROMPT:
 ### Instruction: Identify the starting and ending point of this sequence
 ### Input: 2, 4, 8, 16
 ### Text: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the starting and ending point of this sequence

### Input:
2, 4, 8, 16

### Response:
The starting point of the sequence is 2 and the ending point is 16. 
 ### Output: The starting point of the sequence is 2 and the ending point is 16.
### Instruction: What is the maximum number of steps in a Fibonacci sequence?
 ### Input: No input.
 ### Text: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is the maximum number of steps in a Fibonacci sequence?

### Input:
No input.

### Response:
The maxi

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
Max number of tokens:  118
Prediction complete
PROMPT:
 ### Instruction: Identify the starting and ending point of this sequence
 ### Input: 2, 4, 8, 16
 ### Text: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the starting and ending point of this sequence

### Input:
2, 4, 8, 16

### Response:
The starting point of the sequence is 2 and the ending point is 16. 
 ### Output: The starting point of the sequence is 2 and the ending point is 16.
### Instruction: Generate the third term in the sequence 2, 5, 9, 14.
 ### Input: 
 ### Text: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Generate the third term in the sequence 2, 5, 9, 14.

### Response:
18 
 ### Output: 18
### Instruction: What is the maximum number of steps in a Fibonacci sequence?
 ### Inpu

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


In [None]:
# print(int(sum(len(s) for s in refs)/len(refs)))
# test = bleu.compute(predictions=['No', 'Yes', 'Yes', 'yes', 'Yes', 'yes', 'yes', 'No', 'yes', 'No'],
#                     references=['No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.'], max_order=int(sum(len(s) for s in refs)/len(refs)))
# test

### Test on Medical MCQ Dataset

In [24]:
bleu = evaluate.load('bleu')

tokenizer = tokenizer_gpt2
model = model_gpt2

max_num_egs = 3

data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

def filter_example(example):
    return count_tokens(tokenizer, f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer: {example['output']}") <= 300

train_dataset = train_dataset.filter(filter_example)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medmcq')
print("Length of train set", len(train_list))

icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'medmcq'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    print("DOING BLEURT")
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    print("DONE BLEURT")
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy, 'bleurt_score' : avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score')
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_medqa.png')


Length of test set:  100
Length of train set 6086
ICL prompt complete
20 160
Prediction complete
PROMPT:
 ### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 35-year-old woman comes to your office with a variety of complaints. As part of her evaluation, she undergoes laboratory testing which reveals the presence of anti-centromere antibodies. All of the following symptoms and signs would be expected to be present EXCEPT:? 
{'A': 'Pallor, cyanosis, and erythema of the hands', 'B': 'Calcium deposits on digits', 'C': 'Blanching vascular abnormalities', 'D': 'Hypercoagulable state', 'E': 'Heartburn and regurgitation'},
 ### Answer:
REAL ANSWER:
 D: Hypercoagulable state
PREDICTION:
 D
 ### Explanation:
 ### Answer: D
 ### Explanation:
 ###
ICL prompt complete
15 188
Prediction complete
ICL prompt complete
23 309
Prediction complete
ICL prompt complete
17 386
Prediction complete
ICL prompt complete
18 334
Prediction complete
ICL prompt complete
18 478
Prediction

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
20 392
Prediction complete
PROMPT:
 ### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 51-year-old white female presents to her primary care physician for a regular check-up. She endorses eating a healthy diet with a balance of meat and vegetables. She also states that she has a glass of wine each night with dinner. As part of the evaluation, a complete blood count and blood smear were performed and are remarkable for: Hemoglobin 8.7 g/dL, Hematocrit 27%, MCV 111 fL, and a smear showing macrocytes and several hypersegmented neutrophils. Suspecting an autoimmune condition with anti-intrinsic factor antibodies, what other finding might you expect in this patient?? 
{'A': 'High serum TSH', 'B': 'Psorasis', 'C': 'Cheilosis', 'D': 'Bleeding gums', 'E': 'Abdominal colic'},
 ### Answer: A: High serum TSH
### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 35-year-old woman comes t

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
20 624
Prediction complete
PROMPT:
 ### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 51-year-old white female presents to her primary care physician for a regular check-up. She endorses eating a healthy diet with a balance of meat and vegetables. She also states that she has a glass of wine each night with dinner. As part of the evaluation, a complete blood count and blood smear were performed and are remarkable for: Hemoglobin 8.7 g/dL, Hematocrit 27%, MCV 111 fL, and a smear showing macrocytes and several hypersegmented neutrophils. Suspecting an autoimmune condition with anti-intrinsic factor antibodies, what other finding might you expect in this patient?? 
{'A': 'High serum TSH', 'B': 'Psorasis', 'C': 'Cheilosis', 'D': 'Bleeding gums', 'E': 'Abdominal colic'},
 ### Answer: A: High serum TSH
### Task: Please answer with one of the option in the bracket
 ### Question: Q:A 40-year-old man presents 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT


In [None]:
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)
results_df.head()

### Test on Transcript Sentiment Analysis

In [26]:

max_num_egs = 3

data = load_dataset('jlh-ibm/earnings_call', 'transcript-sentiment')['train']
bleu = evaluate.load('bleu')

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    return count_tokens(tokenizer, f"### Text: {example['text']}\n ### Targets: {example['label']}") <= 300

train_dataset = train_dataset.filter(filter_example)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='finance_sent')
print("Length of train set", len(train_list))

icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'finance_sent'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    print("DOING BLEURT")
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    print("DONE BLEURT")
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy, 'bleurt' : avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

Length of test set:  100
Length of train set 4618
ICL prompt complete
11 125
Prediction complete
PROMPT:
 ### Text:  Mark Lipacis,  Jefferies LLC - Analyst    [4]  That's very helpful, thank you.  And then, last question. On the new  so, you're just starting to ship Pascal now.  I guess my understanding is that historically as you're shipping a new product, the yields have opportunity for improvement, and the more volume you ship the more you climb down the yield curve. What classically happens to here on the yield?  And, does that positively impact gross margins over the next three or four quarters? Thank you.
 ### Targets:
REAL ANSWER:
 positive
PREDICTION:
            
ICL prompt complete
11 244
Prediction complete
ICL prompt complete
11 151
Prediction complete
ICL prompt complete
11 100
Prediction complete
ICL prompt complete
11 107
Prediction complete
ICL prompt complete
11 59
Prediction complete
ICL prompt complete
11 483
Prediction complete
ICL prompt complete
11 134
Prediction 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
11 298
Prediction complete
PROMPT:
 ### Text: I think, if we look at it from an overall standpoint, in Q2, we did 41%. In the third quarter in a row, 41% gross margin. And in Q3, you're right that the decline in semi-custom, there is benefit to the margin. And the margin guide for Q3, that's approximately 43%. I can tell you that the richer product mix, especially with the new products ramping in Q3, are going to drive the gross margin. Although there is a benefit from the decline of semi-custom also, the margin benefit is more weighted towards the non-semi-custom business, and that's where we end up with the 43% in Q3. We've also updated our guidance for 2019 and now are projecting 42% for the year 2019.
 ### Targets: negative
### Text:  Mark Lipacis,  Jefferies LLC - Analyst    [4]  That's very helpful, thank you.  And then, last question. On the new  so, you're just starting to ship Pascal now.  I guess my understanding is that 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
11 430
Prediction complete
PROMPT:
 ### Text: I think, if we look at it from an overall standpoint, in Q2, we did 41%. In the third quarter in a row, 41% gross margin. And in Q3, you're right that the decline in semi-custom, there is benefit to the margin. And the margin guide for Q3, that's approximately 43%. I can tell you that the richer product mix, especially with the new products ramping in Q3, are going to drive the gross margin. Although there is a benefit from the decline of semi-custom also, the margin benefit is more weighted towards the non-semi-custom business, and that's where we end up with the 43% in Q3. We've also updated our guidance for 2019 and now are projecting 42% for the year 2019.
 ### Targets: negative
### Text:  Stacy Aaron Rasgon,  Sanford C. Bernstein & Co., LLC., Research Division - Senior Analyst    [9]  I wanted to follow-up on that 10-nanometer point. So as the volume production pushes out into 2019

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT


In [None]:
results_df.head()

### Testing on Medicine QA

In [28]:

max_num_egs = 3

data = load_dataset('Laurent1/MedQuad-MedicalQnADataset_128tokens_max')['train']
bleu = evaluate.load('bleu')

tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    return count_tokens(tokenizer, extract_response_content(example['text'], "### Response:")) <= 300

data = data.filter(filter_example)

print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


max_token_dict = {}
for example in test_dataset:
    real = extract_response_content(example['text'], "### Response:")
    max_token_dict[real] = count_tokens(tokenizer, real)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medqa')
print("Length of train set", len(train_list))

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 


icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'medqa'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    print("DOING BLEURT")
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    print("DONE BLEURT")
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'bleurt' : avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

15549
Length of test set:  100
Length of train set 12439
ICL prompt complete
152 36
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL ANSWER:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
                                                                                                                                                         
ICL prompt complete
158 41
Prediction complete
ICL prompt complete
170 41
Prediction complete
ICL prompt complete
165 41
Prediction complete
ICL prompt complete
182 41
Prediction complete
ICL prompt complete
190 39
Prediction complete
ICL prompt complete
155 39
Prediction complete
ICL prompt complete
167 39
Prediction compl

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
152 148
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Fatal familial insomnia inherited ?
    ### Response:
    How is fatal familial insomnia inherited? Fatal familial insomnia (FFI) is inherited in an autosomal dominant manner. This means that to be affected, a person only needs a change (mutation) in one copy of the responsible gene in each cell. In some cases, an affected person inherits the mutation from an affected parent.
    
Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL ANSWER:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
 
    How is anencephaly inherited

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT
ICL prompt complete
152 255
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Fatal familial insomnia inherited ?
    ### Response:
    How is fatal familial insomnia inherited? Fatal familial insomnia (FFI) is inherited in an autosomal dominant manner. This means that to be affected, a person only needs a change (mutation) in one copy of the responsible gene in each cell. In some cases, an affected person inherits the mutation from an affected parent.
    
Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Protein C deficiency inherited ?
    ### Response:
    How is protein C deficiency inherited? Hereditary protein C deficiency is inherited in an autosomal dominant manner. This means that having only one mutated copy of the responsible gene in each cell is enough to cause mild protein C deficiency. A mutated copy of the gene can be inherited from a person's 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
DOING BLEURT
DONE BLEURT


### Law QA

In [31]:

max_num_egs = 3

data = load_dataset('dzunggg/legal-qa-v1')['train']
bleu = evaluate.load('bleu')

tokenizer = tokenizer_gpt2
model = model_gpt2

def filter_example(example):
    return count_tokens(tokenizer, f"### Question: {example['question']}\n ### Answer: {example['answer']}") <= 300

data = data.filter(filter_example)

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='lawqa')
print("Length of train set", len(train_list))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['answer']] = count_tokens(tokenizer, eg['answer'])

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 



# print("Token Dict complete")

icl_method = 'similarity'
model_name = 'gpt2_small'
ds_name = 'lawqa'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    print("DOING BLEURT")
    bleurt_score = bleurt.compute(predictions=preds, references=reals)
    print("DONE BLEURT")
    avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'bleurt':avg_bleurt})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/{icl_method}/icl_results_{ds_name}_{model_name}.csv', index=False)

Length of test set:  100
Length of train set 1579
ICL prompt complete
261 113
Prediction complete
PROMPT:
 ### Question: Q: My son was jumped by 6 students and the school has done nothing. I want to sue for failure to provide a safe environment. The school has know about the situation for three weeks but have done nothing. They don’t believe my child was jumped but has also failed to provide the surveillance footage of the attack. They told me one child came forward and said he hit my child too hard. The school expects my child to continue his education in an environment where he doesn’t feel safe or protected. 
 ### Answer:
REAL ANSWER:
 A:As a parent, it's distressing to hear that your child's safety is at risk. If the school is not addressing your concerns adequately, you might consider taking legal action. Schools have a duty to provide a safe environment for students. You can request the surveillance footage formally through a written request; if the school refuses, this may be so

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DOING BLEURT
DONE BLEURT
ICL prompt complete
261 290
Prediction complete
PROMPT:
 ### Question: Q: if a child is at school and throws a rock and it hits a vehicle who is responsible the parent or the school?. a child threw a rock and it accidentally stuck a vehicle during school under school supervision who is responsible for the cost of the damages? 
 ### Answer: A:if a child throws a rock and it hits a vehicle while the child is at school and under school supervision, the school district is typically responsible for the cost of the damages. This is because the school district has a duty to supervise its students and to take reasonable steps to prevent them from harming others. However, there are some exceptions to this rule. For example, if the child's actions were intentional or malicious, the school district may not be liable for the damages. Additionally, if the child's parents have homeowners insurance, their policy may cover the damages.
### Question: Q: My son was jumped by 6 s

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DOING BLEURT
DONE BLEURT
ICL prompt complete
261 513
Prediction complete
PROMPT:
 ### Question: Q: if a child is at school and throws a rock and it hits a vehicle who is responsible the parent or the school?. a child threw a rock and it accidentally stuck a vehicle during school under school supervision who is responsible for the cost of the damages? 
 ### Answer: A:if a child throws a rock and it hits a vehicle while the child is at school and under school supervision, the school district is typically responsible for the cost of the damages. This is because the school district has a duty to supervise its students and to take reasonable steps to prevent them from harming others. However, there are some exceptions to this rule. For example, if the child's actions were intentional or malicious, the school district may not be liable for the damages. Additionally, if the child's parents have homeowners insurance, their policy may cover the damages.
### Question: Q: Can I sue for emotional 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DOING BLEURT
DONE BLEURT


### Averaging ICL Results

In [19]:
import os
import pandas as pd

# Specify the directory containing your CSV files
directory = 'icl_results/random'

# Initialize an empty list to store DataFrames
dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.csv') and 'gpt2_small' in filename:
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = concatenated_df.groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']].mean()
print(grouped_df.head())


                    bert_score  bleu_score    bleurt
num_demonstrations                                  
0                     0.670628    0.087276 -1.232413
1                     0.834500    0.129618 -1.137657
2                     0.833667    0.131596 -1.056782
3                     0.861578    0.124963 -1.378967
4                     0.840916    0.051867 -1.773363


In [18]:
import os
import pandas as pd

# Specify the directory containing your CSV files
directory = 'icl_results/similarity'

# Initialize an empty list to store DataFrames
dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.csv') and 'gpt2_small' in filename:
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = concatenated_df.groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']].mean()
print(grouped_df.head())


                    bert_score  bleu_score    bleurt
num_demonstrations                                  
0                     0.670628    0.087276 -0.924396
1                     0.833230    0.139398 -0.577644
2                     0.837789    0.136516 -0.537595


In [20]:
import os
import pandas as pd

# Specify the directory containing your CSV files
directory = 'icl_results/random'

# Initialize an empty list to store DataFrames
dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.csv') and 'mistral' in filename:
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = concatenated_df.groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']].mean()
print(grouped_df.head())


                    bert_score  bleu_score    bleurt
num_demonstrations                                  
0                     0.827385    0.124019 -1.116392
1                     0.847577    0.153620 -1.065689
2                     0.850429    0.149902 -1.055004
3                     0.861899    0.116885 -1.338390
4                     0.861151    0.112602 -1.381159


In [21]:
import os
import pandas as pd

# Specify the directory containing your CSV files
directory = 'icl_results/similarity'

# Initialize an empty list to store DataFrames
dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.csv') and 'mistral' in filename:
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)
grouped_df = concatenated_df.groupby('num_demonstrations')[['bert_score', 'bleu_score', 'bleurt']].mean()
print(grouped_df.head())


                    bert_score  bleu_score    bleurt
num_demonstrations                                  
0                     0.825989    0.121709 -0.751007
1                     0.848177    0.154961 -0.636657
2                     0.854141    0.159165 -0.573506
