## ICL - GPT2

In [1]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import json
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm





  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
import torch
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
import bert_score
import evaluate
import random
import matplotlib.pyplot as plt
import pandas as pd

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)

Device being used: cuda


In [12]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def sent_similarity(sent1, sent2):
    sentences = [sent1, sent2]
    embeddings = sent_model.encode(sentences)
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix[0][1]


def format_examples(ds, ds_name='ni'):
    prompts = []
    if ds_name == 'ni':
        for example in ds:
            # prompt = f"### Question: {example['input']} \n ###Targets: {example['output']}"
            prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets: {example['targets']}"
            prompts.append(prompt)
    elif ds_name == 'medmcq':
         for example in ds:
            prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Targets: {example['output']}"
            prompts.append(prompt)
    elif ds_name == 'finance_sent':
        for example in ds:
            prompt = f"### Text: {example['text']}\n ### Targets: {example['label']}"
            prompts.append(prompt)
    elif ds_name == 'medqa':
        for example in ds:
            prompts.append(example['text'])
    elif ds_name == 'lawqa':
        for example in ds:
            prompt = f"### Question: {example['question']}\n ### Answer: {example['answer']}"
            prompts.append(prompt)

    return prompts

def select_characters_before_target(string, target_phrase="\n ### Targets:"): #this is a function to remove the actual target values from the train example so that the matching can be improved
    target_index = string.find(target_phrase)
    if target_index != -1:  # If the phrase is found
        return string[:target_index] + target_phrase
    else:
        return string 
    
def extract_response_content(string, target_phrase):
    response_index = string.find(target_phrase)
    return string[response_index + len(target_phrase):].strip()

def group_examples_random(ds, n): #this is where we group examples into a larger prompt
    random.seed()
    samples = random.sample(ds, n)
    new_prompt = ""
    for i in range(n):
        new_prompt += samples[i]
        new_prompt += "\n"
    return new_prompt

def group_by_similarity(prompt, ds, n_egs, m_choices, target_phrase="\n ### Targets:"):
    choices = random.sample(ds, m_choices)
    cos_sim_dict = {}
    for c in choices:
        cos_sim_dict[c] = sent_similarity(prompt, select_characters_before_target(c, target_phrase))

    sorted_cos_sim = sorted(cos_sim_dict.items(), key=lambda x: x[1], reverse=True)
    top_egs = ""
    for item in sorted_cos_sim[:n_egs]:
        top_egs += item[0]
        top_egs += "\n"

    # top_egs = "".join([item[0] for item in sorted_cos_sim[:n_egs]])
    return top_egs

def count_tokens(tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    return len(input_ids)


def evaluate_example(model, tokenizer, prompt, model_name, max_tokens):
    if model_name == 'gpt2':
        tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(device)
        if len(tokenized_prompt['input_ids'][0]) > MAX_LENGTH: #currently just checking if random prompt is too big or not
            return None 
        outputs =model.generate(**tokenized_prompt, pad_token_id=tokenizer.eos_token_id, max_length=1024)
        decoded_output = tokenizer.decode(outputs[0][len(tokenized_prompt['input_ids'][0]):], skip_special_tokens=True)
        return decoded_output
    elif model_name == 'mistral':
        # num_tokens = count_tokens(tokenizer, prompt)
        # print("Num tokens in prompt: ", num_tokens)
        # if num_tokens > 3400:
        #     return None
        model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
        print("Max number of tokens: ", max_tokens)
        outputs =model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, do_sample=True, max_new_tokens=max_tokens)
        decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True) #only get output
        return decoded_output
  

def evaluate_icl(train_dataset, test_dataset, model, tokenizer, num_egs, model_name, ds_name='ni', method='similarity', max_tokens_dict=None):
    reals = []
    preds = []
    counter = 0
    for example in test_dataset:
        # prompt = group_examples(train_dataset, num_egs) + f"### Question: {example['input']} \n ###Targets:"
        target_phrase = "\n ### Targets:"
        if ds_name == 'ni':
            curr_prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']}\n ### Targets:"
            real = f"{example['targets']}"
            max_tokens = max_tokens_dict[real] + 100
        elif ds_name == 'medmcq':
            curr_prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Targets:"
            real = f"{example['output']}"
            tokens = tokenizer(real, return_tensors='pt').to(device)
            max_tokens = len(tokens['input_ids'][0]) +  100
        elif ds_name == 'finance_sent':
            curr_prompt = f"### Text: {example['text']}\n ### Targets:"
            real = f"{example['label']}"
            tokens = tokenizer(real, return_tensors='pt').to(device)
            max_tokens = len(tokens['input_ids'][0])
        elif ds_name == 'medqa':
            curr_prompt = select_characters_before_target(example['text'], "### Response:")
            real = extract_response_content(example['text'], "### Response:")
            max_tokens = max_tokens_dict[real] + 100
        elif ds_name == 'lawqa':
            curr_prompt = f"### Question: {example['question']}\n ### Answer:"
            real = example['answer']
            max_tokens = max_tokens_dict[real] + 100
            target_phrase = "### Answer:"

        # tokens = tokenizer(real, return_tensors='pt').to(device)
        # max_tokens = len(tokens['input_ids'][0])
    
        if method == 'similarity':
            icl_prompt = group_by_similarity(curr_prompt,train_dataset, num_egs, 100, target_phrase) + curr_prompt
        elif method == 'random':
            icl_prompt = group_examples_random(train_dataset, num_egs) + curr_prompt

        # print("MAX TOKENS:\n", max_tokens)
        # print("\n ICL Prompt: ",icl_prompt)
        print("ICL prompt complete")
        pred = evaluate_example(model, tokenizer, icl_prompt, model_name, max_tokens)
        print("Prediction complete")

        if counter % 50 == 0:
            print("PROMPT:\n", icl_prompt)
            print("REAL ANSWER:\n", real)
            print("PREDICTION:\n", pred)
        if pred:
            reals.append(real.lower())
            preds.append(pred.lower())
        counter+=1

    return reals, preds

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
tokenizer_mist_8= AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model_mist_8 = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",  load_in_8bit=True, device_map='cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards:   0%|          | 0/2 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig




# model_plain =  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# tokenizer_plain = GPT2Tokenizer.from_pretrained("gpt2")
# print("models retrieved")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)



model_mist = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=bnb_config, device_map="auto")
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards: 100%|██████████| 2/2 [00:36<00:00, 18.19s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_gpt2=  GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")

In [7]:
def evaluate_example2(model, tokenizer, prompt):
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # print(prompt)
    # if len(tokenized_prompt['input_ids'][0]) > MAX_LENGTH: #currently just checking if random prompt is too big or not
    #     return None 
    outputs =model.generate(**model_inputs, pad_token_id= tokenizer.eos_token_id, do_sample=False, max_new_tokens = 5)
    decoded_output = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)
    # print("prediction: ",decoded_output)/
    return decoded_output


prompt = """ "featuring an oscar-worthy performance => positive\n"
    "completely messed up => negative\n"
    "masterpiece => positive\n"
    "the action is stilted => negative\n"
    "by far the worst movie of the year =>" """
pred = evaluate_example2(model_mist_8, tokenizer_mist_8, prompt) 
print(pred)



"


### Test on Natural Instructions Data

In [9]:
tokenizer = tokenizer_mist
model = model_mist

data = load_from_disk('data/1000_per_task')

# data = filter_icl(data, max_num_egs, tokenizer_plain)

max_num_egs =  3   #natural instructions are just too big

bleu = evaluate.load('bleu')

# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

train_dataset = data['train']
test_dataset = data['test'].select(range(100))

# def filter_example(example):
#     return count_tokens(tokenizer, example['targets']) <= 300

# train_dataset = train_dataset.filter(filter_example)

print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset)
print("Length of train set", len(train_list))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['targets']] = count_tokens(tokenizer, eg['targets'])
print(max_token_dict)

icl_method = 'similarity'
model_name = 'mistral'
ds_name = 'ni'

bert_scores = []
results_data = []
for i in range(max_num_egs):
    reals, preds = evaluate_icl(train_list, test_dataset, model, tokenizer, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print(order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order) #set order to mean of real values
    results_data.append({'num_samples' : len(preds), 'num_demonstrations':i, 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu']})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score') 
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_ni.png')



Length of test set:  100
Length of train set 575481
{'No.': 3, 'Yes.': 3}
ICL prompt complete
Max number of tokens:  103
Prediction complete
PROMPT:
 ### Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'.
 ### Inputs: Sentence: Jerry goes out to the pier and casts his favorite bait : cheese . 
Question: How much time did Jerry spend at the pier?
 ### Targets:
REAL ANSWER:
 No.
PREDICTION:
 yes

### Outputs:
#### Answer

yes

#### Rationale

Jerry's fishing trip is an explicit mention that answers, given the amount of context information, what we can only call a "fishing trip".
We consider a sentence explicit if it explicitly spans multiple lexical items (or words). We consider an answer implicit if it is not th

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
Max number of tokens:  103
Prediction complete
PROMPT:
 ### Task: You will be given a sentence. Check whether the sentence is grammatically correct and is meaningful. If the sentence is grammatically correct, then answer with '1', otherwise answer with '0'.
 ### Inputs: Bill was bitten the dog.
 ### Targets: 0
### Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'.
 ### Inputs: Sentence: Jerry goes out to the pier and casts his favorite bait : cheese . 
Question: How much time did Jerry spend at the pier?
 ### Targets:
REAL ANSWER:
 No.
PREDICTION:
 no
### Task: You will be given a sentence in the first field. You will be given a question after that. Answer if the answer is explicitly (not 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
ICL prompt complete
Max number of tokens:  103
Prediction complete
PROMPT:
 ### Task: Given a statement, generate a question such that the answer is contained in that statement.
 ### Inputs: nerve extensions receive electrical impulses
 ### Targets: What do nerve extensions receive?
### Task: In this task, you are given a sentence and question which can be answered using the sentence. Your task is to answer the question using the information from the sentence. The answer to the question is unique and it is a continuous text span from the sentence.
 ### Inputs: Sentence: He also began making super-8 films beginning in junior high , and showed these films to the scholarship committee of Brigham Young University in 1981 , earning a full scholarship in ` Theatre and Cinematic Arts ' after receiving a Sundance Institute ` Most Promising Filmmaker ' award for his film `` Night Meeting '' . 
 Question: who   made something?
 ### Targets: He
### Task: The answer will be 'yes' if the provided

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


In [None]:
# print(int(sum(len(s) for s in refs)/len(refs)))
# test = bleu.compute(predictions=['No', 'Yes', 'Yes', 'yes', 'Yes', 'yes', 'yes', 'No', 'yes', 'No'],
#                     references=['No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.'], max_order=int(sum(len(s) for s in refs)/len(refs)))
# test

### Test on Medical MCQ Dataset

In [None]:
bleu = evaluate.load('bleu')

max_num_egs = 5

data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medmcq')
print("Length of train set", len(train_list))

icl_method = 'random'
model_name = 'mistral'
ds_name = 'medmcq'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy})

# results_df = pd.DataFrame(results_data)
# results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

# plt.figure(figsize=(10, 2))
# plt.plot(range(max_num_egs), bert_scores) 
# plt.xlabel('Number of examples') 
# plt.ylabel('BERT F1 Score')
# plt.title('BERT F1 Score vs Number of Examples') 
# plt.xticks(range(max_num_egs))
# plt.savefig('BERT_scores_icl_medqa.png')


In [None]:
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)
results_df.head()

### Test on Transcript Sentiment Analysis

In [None]:

max_num_egs = 5

data = load_dataset('jlh-ibm/earnings_call', 'transcript-sentiment')['train']
bleu = evaluate.load('bleu')

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='finance_sent')
print("Length of train set", len(train_list))

icl_method = 'similarity'
model_name = 'mistral'
ds_name = 'finance_sent'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
    for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
    accuracy = accuracy/len(preds)
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu'], 'accuracy':accuracy})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

In [None]:
results_df.head()

### Testing on Medicine QA

In [11]:

max_num_egs = 3

data = load_dataset('Laurent1/MedQuad-MedicalQnADataset_128tokens_max')['train']
bleu = evaluate.load('bleu')

def filter_example(example):
    return count_tokens(tokenizer_mist, extract_response_content(example['text'], "### Response:")) <= 300

data = data.filter(filter_example)

print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


max_token_dict = {}
for example in test_dataset:
    real = extract_response_content(example['text'], "### Response:")
    max_token_dict[real] = count_tokens(tokenizer_mist, real)


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='medqa')
print("Length of train set", len(train_list))

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 


icl_method = 'similarity'
model_name = 'mistral'
ds_name = 'medqa'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    print("order: ", order)
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
 
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu']})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


15549
Length of test set:  100
Length of train set 12439
ICL prompt complete
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL ANSWER:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
 
    Anencephaly is not inherited. Anencephaly is a result of random, uncontrollable genetic mutations that occur after the fertilized egg begins to expand and divide into two cells , then four , then eight , and so on .
    ### Instruction:
    Is there a genetic component to anencephaly? Does a family history of anencephaly increase the risk of having a child born with anencephaly?
    ### Response:
    Anencephaly is considered to be a non-inheritable birth defect with a high r

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
ICL prompt complete
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Fryns syndrome inherited ?
    ### Response:
    How is Fryns syndrome inherited? Although the exact cause of Fryns syndrome is not currently known (and no disease-causing gene has yet been identified), it is thought to be genetic because it tends to "run in families" and has features common to other genetic disorders.
    
Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL ANSWER:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
 
    Anencephaly is not inherited or genetic. Even though anencephaly has been linked to other genetic conditions (such as t

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1
ICL prompt complete
Prediction complete
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Rett syndrome inherited ?
    ### Response:
    In more than 99 percent of people with Rett syndrome, there is no history of the disorder in their family. Many of these cases result from new mutations in the MECP2 gene.  A few families with more than one affected family member have been described. These cases helped researchers determine that classic Rett syndrome and variants caused by MECP2 gene mutations have an X-linked dominant pattern of inheritance.
    
Below is an instruction from Human. Write a response.
    ### Instruction:
    Is Chorea-acanthocytosis inherited ?
    ### Response:
    How do people inherit chorea-acanthocytosis? Chorea-acanthocytosis is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. The parents of an individual with an autosomal recessive condition each car

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


order:  1


Things to do:

Mistral 8B?
Law dataset
Figure out max tokens crap


### Law QA

In [13]:

max_num_egs = 3

data = load_dataset('dzunggg/legal-qa-v1')['train']
bleu = evaluate.load('bleu')

def filter_example(example):
    return count_tokens(tokenizer_mist, example['answer']) <= 100

data = data.filter(filter_example)


print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))
train_list = format_examples(train_dataset, ds_name='lawqa')
print("Length of train set", len(train_list))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['answer']] = count_tokens(tokenizer_mist, eg['answer'])

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 



print("Token Dict complete")

icl_method = 'random'
model_name = 'mistral'
ds_name = 'lawqa'

bert_scores = []
results_data = []


for i in range(max_num_egs):
    accuracy = 0
    reals, preds = evaluate_icl(train_list, test_dataset, model_mist, tokenizer_mist, i, model_name=model_name, ds_name=ds_name, method=icl_method, max_tokens_dict=max_token_dict)
    P, R, F1 = bert_score.score(preds, reals, lang="en")
    average_F1 = sum(F1) / len(F1)
    bert_scores.append(average_F1)
    refs = [[r] for r in reals]
    order = int(sum(len(s) for s in refs)/len(refs))
    bleu_score = bleu.compute(predictions=preds, references=refs, max_order=order)
 
    results_data.append({'num_samples' : len(preds), 'num_demonstrations' : i, 'bert_score' : float(average_F1), 'bleu_score' : bleu_score['bleu']})

results_df = pd.DataFrame(results_data)
results_df.to_csv(f'icl_results/icl_results_{ds_name}_{icl_method}_{model_name}.csv', index=False)

Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


1102
Length of test set:  100
Length of train set 881
Token Dict complete
ICL prompt complete
Max number of tokens:  159
Prediction complete
PROMPT:
 ### Question: Q: What should I do? My account is frozen cause I was given fraud checks and bank took my money when they decline the check. So I was given two checks for a side job I deposit it to my account and they decline next day I was called that the check’s were fraud but they bank never gave me the money. I gave them the emails and address of where I gotten them and they froze my account. When I login into my account I had 123.50 in the checking account I checked the next day it was 232 which was weird then I saw it was taken . I don’t know how if it’s frozen 
 ### Answer:
REAL ANSWER:
 A:If you never got the money and the checks were declined, the bank must suspect you for fraud. There is something not right but if the whole matter is over $110 there is little a lawyer can do. When the courts reopen, make a small claims suit.
PREDI

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ICL prompt complete
Max number of tokens:  159
Prediction complete
PROMPT:
 ### Question: Q: I recently financed a ‘13 Dodge Avenger from a used car dealership. It failed 2 emissions inspections.. The car has been at the mechanic longer than I have been able to use it and I’m making payments on it for 2 months now without being able to use the vehicle. The dealership refuses to void the contract and insists of wasting more time trying to fix it or selling me a different car with a higher monthly payment (which I would have initially financed if I could afford it). The car has had other several repairs done since first date of purchase such as caliper changers, brake pad repairs and tire replacements and still has other issues that were ignored by the mechanic such as a non functional horn and head light is out. I have reported a complaint to consumer fraud department and I’m waiting to hear back, but the car should have never been sold to me in this condition and I just want to have th

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ICL prompt complete
Max number of tokens:  159
Prediction complete
PROMPT:
 ### Question: Q: My nephew wants to build a house on land I own. He said my name has to come off the deeds. Is that true?. I have no problem with him building a house on the land but I want my name to remain on the deeds. 
 ### Answer: A:If you own the land and your nephew wants to build a house on it, it is not necessarily true that your name has to be removed from the deeds. The ownership of the land can be structured in a way that allows your nephew to build the house while still maintaining your ownership rights.
### Question: Q: What does GMAC, WVMF Funding, or RECAP have to do with this case?. Does it have something to do with the mortgage crash in 2008 through 2010? 
 ### Answer: A:A South Carolina attorney could best advise, but your post remains open for five weeks. It's possible something inadvertently got left off in uploading your post - a case is not mentioned. Not every question is picked up, but 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
