# Instruction Tuning on Datasets

In [None]:
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from encodeinstruction import encodeinstruction
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
import bert_score
import evaluate
import bleurt
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device being used:", device)


In [None]:
# Instruction-tuned GPT-2 weights and tokenizer, both read from local directories.
model_gpt2_it = GPT2LMHeadModel.from_pretrained("new_test/trained_model")
model_gpt2_it = model_gpt2_it.to(device)
tokenizer_gpt2_it = GPT2Tokenizer.from_pretrained("new_test/tokenizer")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantisation with double quantisation; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantised weights for the Mistral instruct checkpoint.
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model_mist = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
def evaluate_example(prompt, model, tokenizer, max_tokens):
    """Generate a continuation of ``prompt`` and return only the new text.

    Args:
        prompt: Text fed to the model.
        model: Causal LM exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``; its EOS id is used as the pad id.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation, with the prompt tokens sliced off and
        special tokens skipped.
    """
    # Place the inputs on the device of the model that was passed in, instead
    # of relying on a notebook-global device that may not match this model
    # (e.g. one loaded with device_map="auto").
    tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_length = len(tokenized_prompt['input_ids'][0])
    outputs = model.generate(
        **tokenized_prompt,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_tokens,
    )
    # The generated sequence starts with the prompt tokens; drop them.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def count_tokens(tokenizer, prompt):
    """Return how many token ids ``tokenizer.encode`` yields for ``prompt``.

    Special tokens are requested from the tokenizer (``add_special_tokens=True``).
    """
    return len(tokenizer.encode(prompt, add_special_tokens=True))


# tokenizer_mist.add_bos_token, tokenizer_mist.add_eos_token

# Smoke test: ask the instruction-tuned GPT-2 one question and print up to 100 new tokens.
print(evaluate_example("What colour is the sky?", model_gpt2_it, tokenizer_gpt2_it, max_tokens=100))

In [None]:

# Load the BLEURT metric through `evaluate`; later cells call `bleurt.compute(...)`.
# NOTE(review): this rebinds the name `bleurt`, shadowing the `bleurt` module
# imported at the top of the notebook.
bleurt = evaluate.load("bleurt", module_type="metric")

### Test on Natural Instructions Data

In [None]:
import pandas as pd

# Evaluate the selected model on the locally saved Natural Instructions data
# and write BERTScore / BLEU / BLEURT to a CSV.
data = load_from_disk('data/1000_per_task')

model = model_gpt2_it
tokenizer = tokenizer_gpt2_it

# data = filter_icl(data, max_num_egs, tokenizer_plain)

bleu = evaluate.load('bleu')

# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

model_name = 'gpt2_small_it'

train_dataset = data['train']
test_dataset = data['test']

# Keep at most 10 rows per task, then cap the evaluation set at the first 100 rows.
grouped_test_dataset = test_dataset.to_pandas().groupby('task_name').apply(lambda x: x.head(10)).reset_index(drop=True)

test_dataset = Dataset.from_pandas(grouped_test_dataset.head(100))

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        prompt = f"<s>[INST] {example['definition']} {example['inputs']} [/INST]"
        real = f"{example['targets']}"
    elif model_name == 'gpt2_small_it':
        prompt = f"### Task: {example['definition']}\n ### Inputs: {example['inputs']} \n ### Targets: "
        real = f"{example['targets']}"
    else:
        # Fail loudly: without this, an unknown name would reuse a stale
        # (or undefined) prompt/real from an earlier iteration or cell.
        raise ValueError(f"Unsupported model_name: {model_name!r}")
    max_tokens = count_tokens(tokenizer, real)#maximum number of tokens to generate
    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100) #add 100 to reduce bias
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
average_precision = sum(P)/len(P)
average_recall = sum(R)/len(R)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt_score' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_ni_{model_name}.csv', index=False)


### Test on Alpaca

In [None]:
# Evaluate the selected model on a 100-example held-out slice of Alpaca and
# write BERTScore / BLEU / BLEURT to a CSV.
import pandas as pd  # imported here so the cell does not depend on an earlier cell

model = model_gpt2_it
tokenizer = tokenizer_gpt2_it

data = load_dataset('tatsu-lab/alpaca')['train']

# data = filter_icl(data, max_num_egs, tokenizer_plain)
bleu = evaluate.load('bleu')

def filter_example(example):
    """Keep rows whose combined fields are at most 300 tokens under ``tokenizer``."""
    return count_tokens(tokenizer, f"### Instruction: {example['instruction']}\n ### Input: {example['input']}\n ### Text: {example['text']} \n ### Output: {example['output']}") <= 300

data = data.filter(filter_example)


train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

print("Length of test set: ", len(test_dataset))

model_name = 'gpt2_small_it'

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        # Use this dataset's own columns (instruction/input/output); the
        # previous definition/inputs/targets keys belong to the Natural
        # Instructions cell and are not read anywhere else for Alpaca rows.
        prompt = f"<s>[INST] {example['instruction']} {example['input']} [/INST]"
        real = f"{example['output']}"
    elif model_name == 'gpt2_small_it':
        prompt = f"### Task: {example['instruction']} \n ### Inputs: {example['input']} \n ### Targets: "
        real = f"{example['output']}"
    else:
        # Fail loudly instead of reusing a stale prompt/real.
        raise ValueError(f"Unsupported model_name: {model_name!r}")
    max_tokens = count_tokens(tokenizer, real)#maximum number of tokens to generate
    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100) #add 100 to reduce bias
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
average_precision = sum(P)/len(P)
average_recall = sum(R)/len(R)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt_score' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_alpaca_{model_name}.csv', index=False)




### Test on Medical MCQ

In [34]:
import pandas as pd

# data = filter_icl(data, max_num_egs, tokenizer_plain)

# Evaluate the selected model on 100 held-out medical multiple-choice questions
# and write BERTScore / BLEU / accuracy / BLEURT to a CSV.
bleu = evaluate.load('bleu')

model_name = 'gpt2_small_it'

model = model_gpt2_it
tokenizer = tokenizer_gpt2_it



data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

def filter_example(example):
    """Keep rows whose task/question/answer text is at most 300 tokens under ``tokenizer``."""
    return count_tokens(tokenizer, f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Answer: {example['output']}") <= 300 #for consistency, get the same samples as ICL

# Only the training split is filtered; the test slice above is used as-is.
train_dataset = train_dataset.filter(filter_example)

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        prompt = f"<s> {example['input']} [INST] {example['instruction']} [/INST] Answer: "
        real = f"{example['output']}"
    else:
        prompt = f"### Task: Write down the option in the brackets that is correct. \n ### Inputs: {example['input']} \n ### Targets: "
        real = f"{example['output']}"
    max_tokens = count_tokens(tokenizer, real)

    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100)
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
# Accuracy: a hit is a match on the first non-blank character of the
# reference and the prediction.
accuracy = 0
for r,p in zip(reals, preds):
    # [:1] yields '' for a blank string, so a blank reference counts as a
    # miss instead of raising IndexError; blank predictions are misses too.
    if p.strip() and r.strip()[:1] == p.strip()[:1]:
        accuracy+=1
accuracy = accuracy/len(preds)
bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'accuracy' : accuracy, 'bleurt' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_medmcq_{model_name}.csv', index=False)


PROMPT:
 ### Task: Write down the option in the brackets that is correct. 
 ### Inputs: Q:A 35-year-old woman comes to your office with a variety of complaints. As part of her evaluation, she undergoes laboratory testing which reveals the presence of anti-centromere antibodies. All of the following symptoms and signs would be expected to be present EXCEPT:? 
{'A': 'Pallor, cyanosis, and erythema of the hands', 'B': 'Calcium deposits on digits', 'C': 'Blanching vascular abnormalities', 'D': 'Hypercoagulable state', 'E': 'Heartburn and regurgitation'}, 
 ### Targets: 
REAL:
 D: Hypercoagulable state
PREDICTION:
  B
PROMPT:
 ### Task: Write down the option in the brackets that is correct. 
 ### Inputs: Q:An 8-year-old boy is brought to the pediatrician because his mother is concerned about recent behavioral changes. His mother states that she has started to notice that he is slurring his speech and seems to be falling more than normal. On exam, the pediatrician observes the boy has pes ca

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


### Test on Transcript Sentiment Analysis

In [None]:
import pandas as pd

# Evaluate the selected model on 100 held-out earnings-call sentiment examples
# and write BERTScore / BLEU / accuracy / BLEURT to a CSV.
data = load_dataset('jlh-ibm/earnings_call', 'transcript-sentiment')['train']
bleu = evaluate.load('bleu')

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

tokenizer = tokenizer_gpt2_it
model = model_gpt2_it

def filter_example(example):
    """Keep rows whose text/label pair is at most 300 tokens under ``tokenizer``."""
    return count_tokens(tokenizer, f"### Text: {example['text']}\n ### Targets: {example['label']}") <= 300

# Only the training split is filtered; the test slice above is used as-is.
train_dataset = train_dataset.filter(filter_example)
model_name = 'gpt2_small_it'

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        # NOTE(review): this branch reads 'input'/'instruction'/'output', while
        # the rest of this cell only reads 'text' and 'label' -- confirm these
        # columns exist in this dataset before enabling the branch.
        prompt = f"<s> {example['input']} [INST] {example['instruction']} [/INST] Answer: "
        real = f"{example['output']}"
    else:
        prompt = f"### Task: Predict whether the given text is positive or negative. Output 'positive' or 'negative'.\n ### Inputs: {example['text']}\n ### Targets: "
        real = f"{example['label']}"
    max_tokens = count_tokens(tokenizer, real)
    # Skip prompts of 900 tokens or more.
    if count_tokens(tokenizer, prompt) >= 900:
        continue
    print("PROMPT:\n", prompt)
    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100)
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
# Accuracy: a hit is a match on the first non-blank character of the
# reference and the prediction.
accuracy = 0
for r,p in zip(reals, preds):
    # [:1] yields '' for a blank string, so a blank reference counts as a
    # miss instead of raising IndexError; blank predictions are misses too.
    if p.strip() and r.strip()[:1] == p.strip()[:1]:
        accuracy+=1
accuracy = accuracy/len(preds)
bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'accuracy' : accuracy, 'bleurt' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_finance_sent_{model_name}.csv', index=False)


### Test on Medical QA

In [27]:
# Load the training split of the MedQuad medical Q&A dataset and the BLEU metric.
data = load_dataset('Laurent1/MedQuad-MedicalQnADataset_128tokens_max')['train']
bleu = evaluate.load('bleu')

def select_characters_before_target(string, target_phrase="\n ### Targets:"):
    """Cut ``string`` at the first ``target_phrase`` and re-append the targets header.

    Used to strip the actual target values from an example so that only the
    prompt part remains. Returns ``string`` unchanged when ``target_phrase``
    does not occur in it.
    """
    cut = string.find(target_phrase)
    if cut == -1:
        return string
    # The appended header is the fixed "\n ### Targets:" text.
    return string[:cut] + "\n ### Targets:"

def extract_response_content(string, target_phrase):
    """Return the stripped text that follows the first ``target_phrase`` in ``string``.

    Args:
        string: Full example text.
        target_phrase: Marker that precedes the response.

    Returns:
        Everything after the first occurrence of ``target_phrase``, with
        surrounding whitespace removed, or ``""`` when the marker is absent.
    """
    response_index = string.find(target_phrase)
    if response_index == -1:
        # Without this guard the -1 from find() would be used as an offset and
        # an arbitrary tail of the string would be returned as the "response".
        return ""
    return string[response_index + len(target_phrase):].strip()

def replace_instructions_and_responses(input_string):
    """Rewrite the dataset's section headers into the prompt format used here.

    "### Instruction" becomes "\\n ### Inputs", "### Response" becomes
    "\\n ### Targets", and the result is prefixed with "### Task: ".
    """
    return "### Task: " + (
        input_string
        .replace("### Instruction", "\n ### Inputs")
        .replace("### Response", "\n ### Targets")
    )

def filter_example(example):
    """Keep rows whose response text is at most 300 tokens under the notebook's ``tokenizer``."""
    response = extract_response_content(example['text'], "### Response:")
    return count_tokens(tokenizer, response) <= 300

# Evaluate the selected model on 100 held-out MedQuad questions and write
# BERTScore / BLEU / BLEURT to a CSV.
import pandas as pd  # imported here so the cell does not depend on an earlier cell

# Bind the model/tokenizer explicitly, as the other evaluation cells do,
# rather than relying on whatever an earlier cell left in these names.
model = model_gpt2_it
tokenizer = tokenizer_gpt2_it

data = data.filter(filter_example)

print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


# Token count of every reference answer, keyed by the answer text.
max_token_dict = {}
for example in test_dataset:
    real = extract_response_content(example['text'], "### Response:")
    max_token_dict[real] = count_tokens(tokenizer, real)


print("Length of test set: ", len(test_dataset))

model_name = 'gpt2_small_it'
ds_name = 'medqa'

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        # NOTE(review): this branch reads 'input'/'instruction'/'output', while
        # the rest of this cell only reads example['text'] -- confirm these
        # columns exist in this dataset before enabling the branch.
        prompt = f"<s> {example['input']} [INST] {example['instruction']} [/INST] Answer: "
        real = f"{example['output']}"
    else:
        # Convert the dataset's headers to the prompt format, then cut the
        # text off right after the targets header so the answer is not leaked.
        prompt = select_characters_before_target(replace_instructions_and_responses(example['text']))
        real = extract_response_content(example['text'], "### Response:")
    max_tokens = count_tokens(tokenizer, real)

    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100)
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
# No accuracy is computed for this free-text task, so none is reported; the
# previous 'accuracy' entry was a value left over from an earlier cell.
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_medqa_{model_name}.csv', index=False)

15549
Length of test set:  100
PROMPT:
 ### Task: Below is an instruction from Human. Write a response.
    
 ### Inputs:
    Is anencephaly inherited ?
    
 ### Targets:
REAL:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION:
  yes
PROMPT:
 ### Task: Below is an instruction from Human. Write a response.
    
 ### Inputs:
    What are the symptoms of Permanent neonatal diabetes mellitus ?
    
 ### Targets:
REAL:
 What are the signs and symptoms of Permanent neonatal diabetes mellitus? The Human Phenotype Ontology provides the following list of signs and symptoms for Permanent neonatal diabetes mellitus. If the information is available, the table below includes how often the symptom is seen in people with this condition.
PREDICTION:
  Yes
PROMPT:
 ### Task: Below is a

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


### Test on Law QA

In [29]:

# Evaluate the selected model on 100 held-out legal Q&A examples and write
# BERTScore / BLEU / BLEURT to a CSV.
import pandas as pd  # imported here so the cell does not depend on an earlier cell

max_num_egs = 3

data = load_dataset('dzunggg/legal-qa-v1')['train']
bleu = evaluate.load('bleu')

tokenizer = tokenizer_gpt2_it
model = model_gpt2_it

def filter_example(example):
    """Keep rows whose question/answer pair is at most 300 tokens under ``tokenizer``."""
    return count_tokens(tokenizer, f"### Question: {example['question']}\n ### Answer: {example['answer']}") <= 300 

data = data.filter(filter_example)

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))

# avg_tokens = 0
# for eg in train_list:
#     avg_tokens+= count_tokens(tokenizer_mist, eg)
# avg_tokens  = avg_tokens/len(train_list)
# print("AVG TOKENS: ",avg_tokens)
#avg_tokens = 



# print("Token Dict complete")
model_name = 'gpt2_small_it'
ds_name = 'lawqa'

reals = []
preds = []
results_data = []
for example in test_dataset:
    if model_name == 'mistral_it':
        # NOTE(review): this branch reads 'input'/'instruction'/'output', while
        # the rest of this cell only reads 'question' and 'answer' -- confirm
        # these columns exist in this dataset before enabling the branch.
        prompt = f"<s> {example['input']} [INST] {example['instruction']} [/INST] Answer: "
        real = f"{example['output']}"
    else:
        prompt = f"### Task: Write an answer to the following question starting with 'A:'.\n ### Inputs: {example['question']}\n ### Targets:"
        real = example['answer']
    max_tokens = count_tokens(tokenizer, real)

    pred = evaluate_example(prompt, model, tokenizer, max_tokens+100)
    # Metrics are computed on lower-cased text.
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
# BLEU takes a list of references per prediction; each prediction has one here.
refs = [[r] for r in reals]
# NOTE(review): every element of `refs` is a one-item list, so this evaluates
# to 1 and BLEU is computed with max_order=1 -- confirm that is intended.
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)

bleurt_score = bleurt.compute(predictions=preds, references=reals)
avg_bleurt = sum(bleurt_score['scores'])/len(bleurt_score['scores'])
# No accuracy is computed for this free-text task, so none is reported; the
# previous 'accuracy' entry was a value left over from an earlier cell.
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'bleurt' : avg_bleurt})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_lawqa_{model_name}.csv', index=False)

Length of test set:  100
PROMPT:
 ### Task: Write an answer to the following question starting with 'A:'.
 ### Inputs: Q: My son was jumped by 6 students and the school has done nothing. I want to sue for failure to provide a safe environment. The school has know about the situation for three weeks but have done nothing. They don’t believe my child was jumped but has also failed to provide the surveillance footage of the attack. They told me one child came forward and said he hit my child too hard. The school expects my child to continue his education in an environment where he doesn’t feel safe or protected. 
 ### Targets:
REAL:
 A:As a parent, it's distressing to hear that your child's safety is at risk. If the school is not addressing your concerns adequately, you might consider taking legal action. Schools have a duty to provide a safe environment for students. You can request the surveillance footage formally through a written request; if the school refuses, this may be something 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
