resources:
- https://huggingface.co/docs/transformers/training
- Universal Language Model Fine-Tuning for Text Classification: https://arxiv.org/abs/1801.06146
- https://learn.deeplearning.ai/finetuning-large-language-models

### Imports

In [128]:
import datasets
import tempfile
import logging
import random
import config # need to also pip install python-configuration if created in a new env
import os
import yaml
import logging
import difflib
import pandas as pd
from datetime import datetime
from pprint import pprint

import transformers
import datasets
import torch

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments

from utilities import *

logger = logging.getLogger(__name__)
global_config = None

### Load Dataset

In [90]:
# training on https://huggingface.co/datasets/medmcqa?row=0 dataset
dataset_path = "medmcqa"
use_hf = True

In [91]:
# load dataset
if use_hf:
    dataset = datasets.load_dataset(dataset_path)
else:
    dataset = load_dataset(dataset_path, tokenizer)

In [14]:
# manual inference on dataset
# might be worth only evaluating answers with explanations
print(val_dataset.to_pandas()['cop'].unique())
val_dataset.to_pandas().head(10)

[0 2 1 3]


Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,45258d3d-b974-44dd-a161-c3fccbdadd88,Which of the following is not true for myelina...,Impulse through myelinated fibers is slower th...,Membrane currents are generated at nodes of Ra...,Saltatory conduction of impulses is seen,Local anesthesia is effective only when the ne...,0,multi,,Physiology,
1,b944ada9-d776-4c2a-9180-3ae5f393f72d,Which of the following is not true about glome...,The oncotic pressure of the fluid leaving the ...,Glucose concentration in the capillaries is th...,Constriction of afferent aeriole decreases the...,Hematocrit of the fluid leaving the capillarie...,0,multi,Ans-a. The oncotic pressure of the fluid leavi...,Physiology,
2,b64a9cd7-d076-4c55-8be1-f9c44fece6cc,A 29 yrs old woman with a pregnancy of 17 week...,No test is required now as her age is below 35...,Ultra sound at this point of time will definit...,Amniotic fluid samples plus chromosomal analys...,blood screening at this point of time will cle...,2,single,,Medicine,
3,c6365cce-507c-40f6-90a2-46b867f47b6e,Axonal transport is:,Antegrade,Retrograde,Antegrade and retrograde,,2,multi,Fast anterograde (400 mm/day) transport occurs...,Physiology,
4,72c1c5e0-b64f-4eef-bf22-ecfb60c5c19c,Low insulin to glucagon ratio is seen in all o...,Glycogen synthesis,Glycogen breakdown,Gluconeogenesis,Ketogenesis,0,multi,Answer- A. Glycogen synthesisLow insulin to gl...,Biochemistry,
5,17360c6c-2c98-4fe2-aa85-487dcf4678df,Concentration of tropicamide:,0.01,0.02,0.03,0.04,0,single,Answer- A. 0.01Tropicamide is the shoest actin...,Ophthalmology,
6,62fa6f78-1964-4249-974b-6fcbbd7fc9ba,Which of the following statements is true rega...,Pregnant woman with sore throat can be staed i...,People on long-term steroids cannot receive Os...,Category B concerns with low risk cases,Category B patients have to undergo immediate ...,0,multi,Ans: A. Pregnant woman with sore throat can be...,Medicine,
7,ce49098b-cc48-4168-859e-936e3e0c7459,Which of the following are not a branch of ext...,Sphenopalatine aery,Anterior ethmoidal aery,Greater palatine aery,Septal branch of superior labial aery,1,single,*Kiesselbach's plexus: Antero superior pa is s...,Anatomy,AIIMS 2017
8,18d5c4a1-cb81-41a8-9bfc-b6f7dec431d2,Diagnosis of the following ECG-,Ventricular bigeminy,Electrical alternans,P pulmonale,Left ventricular failure,1,single,Option A- Broad QRS complex with normal sinus ...,Medicine,AIIMS 2017
9,de09d388-bd4e-42a9-ac6b-ee2d95f822e2,A blue new born presents with cyanosis. The X–...,Ebstein's anomaly,Pulmonary atresia,Transposition of great arteries,Tetralogy of fallot,1,multi,The findings in this newborn are\nCyanosis at ...,Pediatrics,


### Instruction Fine Tuning - Dataset Prep

In [15]:
# instruction and input templates
# for other tasks, can format instruction template. contant for this dataset
instruction_template = "Answer the following multiple-choice question."
input_template = """
### Question: {question} 
### Answers: 
# {a}
# {b} 
# {c} 
# {d}
"""

In [32]:
# prompt templates

prompt_template_with_input = """Below is an instruction that describes a task, paired with inputs that provide further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

prompt_template_without_input = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

prompt_template_without_instruction = """

### Input:
{input}

### Response:
"""

In [61]:
# hydrating dataset with instruction prompts
# row based

def hydrate(row):
    input = input_template.format(
        question = row['question'],
        a = row['opa'][0],
        b = row['opb'][0],
        c = row['opc'][0],
        d = row['opd'][0]
    )
    # prompt hydration
    processed_prompt = prompt_template_without_instruction.format(input=input)
    
    if row['cop'][0] == -1 or not row['exp'][0]: # test dataset
        output = "N/A"
    else: # training and val datasets
        # processed_prompt = prompt_template_with_input.format(instruction=instruction_template, input=input)
        output = str(row['cop'][0]) + ': ' + row['exp'][0]
    
    return {"input": processed_prompt, "output": output}

### Instruction Fine Tuning - Tokenization

In [62]:
# base llama model
model_name = "EleutherAI/pythia-410m"

In [63]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [64]:
def tokenize_function(row):
    qa_dict = hydrate(row)
    text = qa_dict['input'] + qa_dict["output"]

    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [65]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Map:   0%|          | 0/182822 [00:00<?, ? examples/s]

Map:   0%|          | 0/6150 [00:00<?, ? examples/s]

Map:   0%|          | 0/4183 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask'],
        num_rows: 4183
    })
})


In [75]:
# add labels for trainer param
for split in tokenized_dataset.keys():
    tokenized_dataset[split] = tokenized_dataset[split].add_column("labels", tokenized_dataset[split]["input_ids"])

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4183
    })
})

In [77]:
# save dataset
dataset_path = "medmcqa_tokenized.hf"
tokenized_dataset.save_to_disk(dataset_path)

Saving the dataset (0/2 shards):   0%|          | 0/182822 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6150 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4183 [00:00<?, ? examples/s]

### Set Up Training Config and Updated Dataset

In [92]:
# use saved dataframe created during dataset processing
use_hf = False

In [79]:
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

In [108]:
# load dataset from local

# TODO - loading?
# dataset = datasets.load_dataset(dataset_path)

dataset = tokenized_dataset

train_dataset = dataset["train"]
test_dataset = dataset["test"]
val_dataset = dataset["validation"] if "validation" in dataset else None

In [116]:
# ensure dataset has valid features

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4183
    })
})

### Load Model

In [112]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

### Inference

In [113]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 100):
    # Tokenize
    input_ids = tokenizer.encode(
        text, 
        return_tensors = "pt",
        truncation = True,
        max_length = max_input_tokens
    )
    # Generate
    device = model.device
    generated_output_with_prompt = model.generate(
        input_ids = input_ids.to(device),
        max_length = max_output_tokens
    )
    # Decode
    generated_text_with_prompt = tokenizer.batch_decode( # batch decode
        generated_output_with_prompt, 
        skip_special_tokens = True
    )
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    
    return generated_text_answer

In [114]:
# base model test
val_text = val_dataset[0]['question']
print("Question input (test):", val_text)
print(f"Correct answer: {val_dataset[0]['cop']}, {val_dataset[0]['exp']}")
print("Model's answer: ")
print(inference(val_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Which of the following is not true for myelinated nerve fibers:
Correct answer: 0, None
Model's answer: 


1.  The myelinated nerve fibers are not myelinated.

2.  The myelinated nerve fibers are not myelinated.

3.  The myelinated nerve fibers are not myelinated.

4.  The myelinated nerve fibers are not myelinated.

5.  The myelinated nerve fibers are not myelinated.

6.  The myelinated nerve fibers are not


### Set Up Training

In [117]:
max_steps = 5 # experiment w/ val

In [118]:
trained_model_name = f"medmcqa_{max_steps}_steps_{datetime.now()}"
output_dir = "models"

In [127]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False,

)

In [124]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [125]:
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

### Training

In [129]:
training_output = trainer.train()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

### Appendix

In [17]:
# hydrating dataset with instruction prompts
# dataset based

def hydrate(dataset):
    processed_data = []
    
    for row in dataset:
        # input hydration
        input = input_template.format(
            question = row['question'],
            a = row['opa'],
            b = row['opb'],
            c = row['opc'],
            d = row['opd']
        )
        # prompt hydration
        if row['cop'] == -1 or not row['exp']: # test dataset
            continue
        else: # training and val datasets
            processed_prompt = prompt_template_with_input.format(instruction=instruction_template, input=input)
        
        processed_data.append({"input": processed_prompt, "output": str(row['cop']) + ': ' + row['exp']})

    return processed_data


processed_data = hydrate(train_dataset)