# Instruction Tuning on Datasets

In [1]:
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from encodeinstruction import encodeinstruction
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
import bert_score
import evaluate

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device being used:", device)


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(



Device being used: cuda


In [4]:
model_it =  GPT2LMHeadModel.from_pretrained( "new_test/trained_model").to(device) #instruction tuned gpt2 model
tokenizer_it = GPT2Tokenizer.from_pretrained("new_test/tokenizer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)



model_mist = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", quantization_config=bnb_config, device_map="auto")
tokenizer_mist = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

Loading checkpoint shards: 100%|██████████| 2/2 [00:38<00:00, 19.49s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def evaluate_example(prompt, model, tokenizer, max_tokens):
    tokenized_prompt = tokenizer(prompt, return_tensors='pt').to(device)
    outputs =model.generate(**tokenized_prompt, pad_token_id=tokenizer.eos_token_id, max_new_tokens=max_tokens)
    decoded_output = tokenizer.decode(outputs[0][len(tokenized_prompt['input_ids'][0]):], skip_special_tokens=True)
    return decoded_output

def count_tokens(tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, add_special_tokens=True)
    return len(input_ids)


# tokenizer_mist.add_bos_token, tokenizer_mist.add_eos_token

print(evaluate_example("What colour is the sky?", model_mist, tokenizer_mist, max_tokens=100))



It's blue. But not just any blue. It's a specific shade of blue that has been scientifically proven to be the most calming and soothing colour for the human eye. It's a colour that has been associated with tranquility, peace, and stability. It's a colour that has been used in art, architecture, and design to create a sense of harmony and balance. It's a colour that has been studied and analyzed by scientists and


### Test on Natural Instructions Data

In [7]:
import pandas as pd

data = load_from_disk('data/1000_per_task')

# data = filter_icl(data, max_num_egs, tokenizer_plain)

bleu = evaluate.load('bleu')

# train_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

model_name = 'mistral_it'

train_dataset = data['train']
test_dataset = data['test'].select(range(100))

reals = []
preds = []
results_data = []
for example in test_dataset:
    prompt = f"<s> {example['definition']} {example['inputs']} [INST]"
    real = f"{example['targets']}"
    max_tokens = count_tokens(tokenizer_mist, real) #maximum number of tokens to generate
    pred = evaluate_example(prompt, model_mist, tokenizer_mist, max_tokens+100) #add 100 to reduce bias
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION:\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
average_precision = sum(P)/len(P)
average_recall = sum(R)/len(R)
refs = [[r] for r in reals]
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu']})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_ni_{model_name}.csv', index=False)


PROMPT:
 <s> The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'. Sentence: Jerry goes out to the pier and casts his favorite bait : cheese . 
Question: How much time did Jerry spend at the pier? [INST]
REAL:
 No.
PREDICTION:
 No, the sentence does not provide any information about how much time Jerry spent at the pier.
PROMPT:
 <s> The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, the answer should be 'no'. Instances where the answer is implied from the sentence using "instinct" or "common sense" (as opposed to being written explicitly in the sentence) should be labeled as 'no'. Sentence: The previous numismatic record holder was an 1804 U.S. sil

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


### Test on Medical MCQ

In [9]:
import pandas as pd

# data = filter_icl(data, max_num_egs, tokenizer_plain)

bleu = evaluate.load('bleu')

model_name = 'gpt2_small_it'

data = load_dataset('medalpaca/medical_meadow_medqa')['train']
train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))

reals = []
preds = []
results_data = []
for example in test_dataset:
    prompt = f"### Task: {example['instruction']}\n ### Question: {example['input']}\n ### Targets:"
    real = f"{example['output']}"
    max_tokens = count_tokens(tokenizer_it, real)
    pred = evaluate_example(prompt, model_it, tokenizer_it, max_tokens+100)
    reals.append(real.lower())
    preds.append(pred.lower())

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
refs = [[r] for r in reals]
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
accuracy = 0
for r,p in zip(reals, preds):
        if len(p.strip()) != 0:
            if r.strip()[0] == p.strip()[0]:
                accuracy+=1
accuracy = accuracy/len(preds)
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu'], 'accuracy' : accuracy})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_medmcq_{model_name}.csv', index=False)



 D: Hypercoagulable state

  B

 B: GAA

  CGG

 C: Breakdown of endothelial tight junctions

  B

 E: Cor pulmonale

  A

 E: Begin cognitive behavioral therapy

  D

 D: IV acyclovir

  A

 B: Discoid lupus erythematosus (DLE)

  D

 D: Low urine sodium

  C

 C: Herpes simplex virus

  D

 D: CT scan of the head

  A

 D: Enterocele

  D

 C: Lysyl oxidase

  C

 C: Ventricular septal defect (VSD)

  D

 E: Replication of the attenuated vaccine strain

  B

 D: Pick bodies

  B

 B: Small cell lung carcinoma

  D

 A: 99mTc sestamibi scan with ultrasound of the neck

  B

 E: Terminal bronchioles

  D

 D: Carotid stenting

  D

 B: Rotator cuff tear

  D

 A: CT scan

  A

 B: Calcitriol

  B

 D: Hemangioblastoma

  B

 E: HLA-DR4

  D

 A: Autosomal dominant polycystic kidney disease (ADPKD)

  C

 A: Posterior midline of the anal canal, distal to the pectinate line

  D

 A: ↑ NADH/NAD+; AST:ALT ≥ 2:1; ß-oxidation ↓; ß-hydroxybutyrate ↑; lactic acid ↑

  C

 A: Decreased pH

  

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1


### Test on Medical QA

In [12]:
data = load_dataset('Laurent1/MedQuad-MedicalQnADataset_128tokens_max')['train']
bleu = evaluate.load('bleu')

def select_characters_before_target(string, target_phrase="\n ### Targets:"): #this is a function to remove the actual target values from the train example so that the matching can be improved
    target_index = string.find(target_phrase)
    if target_index != -1:  # If the phrase is found
        return string[:target_index] + target_phrase
    else:
        return string 

def extract_response_content(string, target_phrase):
    response_index = string.find(target_phrase)
    return string[response_index + len(target_phrase):].strip()

def filter_example(example):
    return count_tokens(tokenizer_it, extract_response_content(example['text'], "### Response:")) <= 300

data = data.filter(filter_example)

print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


max_token_dict = {}
for example in test_dataset:
    real = extract_response_content(example['text'], "### Response:")
    max_token_dict[real] = count_tokens(tokenizer_it, real)


print("Length of test set: ", len(test_dataset))

model_name = 'gpt2_small_it'
ds_name = 'medqa'

reals = []
preds = []
results_data = []
for example in test_dataset:
    prompt = select_characters_before_target(example['text'], "### Response:")
    real = extract_response_content(example['text'], "### Response:")
    max_tokens = max_token_dict[real] + 100
    pred = evaluate_example(prompt, model_it, tokenizer_it, max_tokens+100)
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
refs = [[r] for r in reals]
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
# accuracy = 0
# for r,p in zip(reals, preds):
#         if len(p.strip()) != 0:
#             if r.strip()[0] == p.strip()[0]:
#                 accuracy+=1
# accuracy = accuracy/len(preds)
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu']})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_medqa_{model_name}.csv', index=False) #interestingly, predictions include the ###Targets phrase that the model was trained on originally

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


15549
Length of test set:  100
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    Is anencephaly inherited ?
    ### Response:
REAL:
 Most cases of anencephaly are sporadic, which means they occur in people with no history of the disorder in their family. A small percentage of cases have been reported to run in families; however, the condition does not have a clear pattern of inheritance.
PREDICTION
 
                                                                                                                                                                                                                                                           
PROMPT:
 Below is an instruction from Human. Write a response.
    ### Instruction:
    What are the symptoms of Permanent neonatal diabetes mellitus ?
    ### Response:
REAL:
 What are the signs and symptoms of Permanent neonatal diabetes mellitus? The Human Phenotype Ontology provides the following list

KeyboardInterrupt: 

### Test on Law QA

In [13]:

max_num_egs = 3

data = load_dataset('dzunggg/legal-qa-v1')['train']
bleu = evaluate.load('bleu')

def filter_example(example):
    return count_tokens(tokenizer_it, example['answer']) <= 100

data = data.filter(filter_example)

print(len(data))

train_test_split = data.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test'].select(range(100))


print("Length of test set: ", len(test_dataset))

max_token_dict = {}
for eg in test_dataset:
    max_token_dict[eg['answer']] = count_tokens(tokenizer_it, eg['answer'])

model_name = 'gpt2_small_it'

reals = []
preds = []
results_data = []
for example in test_dataset:
    prompt = f"### Question: {example['question']}\n ### Answer:"
    real = example['answer']
    max_tokens = max_token_dict[real] + 100
    pred = evaluate_example(prompt, model_it, tokenizer_it, max_tokens+100)
    reals.append(real.lower())
    preds.append(pred.lower())
    print("PROMPT:\n", prompt)
    print("REAL:\n", real)
    print("PREDICTION\n", pred)

P, R, F1 = bert_score.score(preds, reals, lang="en")
average_F1 = sum(F1) / len(F1)
refs = [[r] for r in reals]
order = int(sum(len(s) for s in refs)/len(refs))
print(order)
bleu_score = bleu.compute(predictions=preds, references=refs, max_order= order)
# accuracy = 0
# for r,p in zip(reals, preds):
#         if len(p.strip()) != 0:
#             if r.strip()[0] == p.strip()[0]:
#                 accuracy+=1
# accuracy = accuracy/len(preds)
results_data.append({'num_samples' : len(preds), 'bert_score' : float(average_F1), 'bleu_score': bleu_score['bleu']})
results_df = pd.DataFrame(results_data)
results_df.to_csv(f'it_results/it_results_lawqa_{model_name}.csv', index=False) #interestingly, predictions

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Filter:  80%|████████  | 3000/3742 [00:01<00:00, 1590.61 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1054 > 1024). Running this sequence through the model will result in indexing errors
Filter: 100%|██████████| 3742/3742 [00:03<00:00, 1029.87 examples/s]


1206
Length of test set:  100
PROMPT:
 ### Question: Q: HOW TO FILE A NEGLIGENCE COMPLAINT AGAINST A DR., SPARROW, AND ANURSING HOME/ SOCIAL WORKER?. THIS HAS TO DO WITH MY SISTER WHO HAS A MENTAL CONDITION FOR YRS .LAST YR SHE FELL AND WAS TAKEN TO CARSON HOSPITAL FOR THAT . THAT IS WHERE THE NIGHTMARE BEGAN. SHE HAD BEE TAKING CLOIRIL FOR YRS AND I RECOGNIZED IT WAS AFFECTING HER SPEECH AND HER ARMS BECAME JERKY AND BECAUSE OF BEING AROUND MENTAL ILLNESS KNEW IT WAS THE MEDICATION RIGHT AWAY HER DR. SAID IT WAS PARKENSON'S DESEASE WITHOUT FURTHER DIAGNOTICS HE GAVE HER PILLS FOR IT AND HAS THE SAME INGREDIENTS AS THE OTHE 1ST HOSPITAL PUT HER IN BRIEFS INSTEAD OF GETTING HER TO THE BATHROOM. RIGHT THEN I TOLD THEM ,HER DR., AND THIS SOCIAL WORKER AND NURSE TO EITHER CHANGE THE PILL OR LOWER IT SOCIAL I THOUGHT SHE WAS GOING TO HAVE THAT DONE DID NOT HAPPEN SUPPOSE TO GET CARE SHE WAS IN THESE PLACES FOR ABOUT 2 YRS.NOTHING DONE FAST FORWARD SHE WAS RUSHED TO SPARROW JAN. 22 AND HAD T

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
