## 1. Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install evaluate
!pip install sacrebleu
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [

In [3]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import os
import torch
from tqdm import tqdm
# from datasets import metric
import evaluate

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
MAX_LENGTH=500
BATCH_SIZE=16

## 2. Load Dataset & Prepocess the Data
- Load Train and Test dataset
- Truncate the answers to MAX_LENGTH.
- Concat Question and Answer using special token, and add start and end token.
-  Repeate the above two steps for Train and Test


In [6]:
train_test_dict={"train":{},"test":{}}

In [7]:
tokenizer=AutoTokenizer.from_pretrained('distilgpt2', add_special_tokens=True,additional_special_tokens=['[response]'], pad_token="[Pad]",padding_side='left')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
bos_token=tokenizer.decode(tokenizer.bos_token_id)
eos_token=tokenizer.decode(tokenizer.eos_token_id)

In [9]:
def truncate_to_n_words(s, n):
    return ' '.join(s.split()[:n])

def sequence_len(s):
    return len(s.split())

In [10]:
for key in train_test_dict:
    dataset_pd=pd.read_csv(f'/content/drive/MyDrive/NLP_Project/{key}_datasets/MedQuAD_{key}.csv')
    dataset_pd=dataset_pd.astype('string')
    dataset_pd['length'] = dataset_pd['Answer'] .apply(lambda x: sequence_len(x))
    dataset_pd['Truncated_Answer'] = dataset_pd['Answer'].apply(lambda x: truncate_to_n_words(x, MAX_LENGTH))
    dataset_pd['Question']=bos_token + dataset_pd['Question'] + ' [response] '
    dataset_pd['QA_pairs'] = dataset_pd['Question']  + dataset_pd['Answer'] + eos_token
    dataset_list=list(dataset_pd['QA_pairs'])
    questions_list=list(dataset_pd['Question'])
    train_test_dict[key]['dataset']=dataset_pd
    train_test_dict[key]['question_list']=questions_list
    train_test_dict[key]['QA_list']=dataset_list



## 3. Initialize pytorch dataset and dataloaders

In [11]:
class MedicalDataset(Dataset):
    def __init__(self, data,  tokenizer, questions):
        self.tokenizer = tokenizer
        self.data=self.tokenizer(data, padding=True, truncation=True ,return_tensors='pt')
        self.non_tokenized_data= data
        self.tokenized_questions=self.tokenizer(questions, padding=True, truncation=False, return_tensors='pt')
    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return self.data['input_ids'][idx], self.data['attention_mask'][idx], self.non_tokenized_data[idx],self.tokenized_questions['input_ids'][idx], self.tokenized_questions['attention_mask'][idx]



In [12]:
train_dataset = MedicalDataset(train_test_dict['train']['QA_list'], tokenizer, train_test_dict['train']['question_list'])
test_dataset = MedicalDataset(train_test_dict['test']['QA_list'], tokenizer, train_test_dict['test']['question_list'])


In [13]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

## 4. Define the model
- Resize the total number of model's embeddings  as we added a special token ('['response']') for sperating the question from the answer.
- Set the model's hyperparams, optimizer and the learning rate scheduler.

In [None]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model=model.to(device)

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler=get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps= epochs*(len(train_loader)))
grad_accumulatio_steps=2

## 6. Model Training

In [None]:
from datetime import datetime

date_string = 'fine_tune_model_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
train_loss_values=[]
test_loss_values=[]
ctr=1
for epoch in range(epochs): # number of epochs
    model.train()
    train_loss=0
    model.train()
    for input_ids, attention_mask,raw_data, toknized_questions_id,toknized_questions_mask in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        train_loss+=loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        # if (grad_accumulatio_steps != 0) and  ((grad_accumulatio_steps %grad_accumulatio_steps) ==0):
        #     optimizer.step()
        #     scheduler.step()
        #     ctr+=1
    model.eval()
    val_loss=0
    with torch.no_grad():
      for input_ids, attention_mask, raw_data, toknized_questions_id,toknized_questions_mask in tqdm(test_loader):
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        val_loss += outputs.loss.item()
    avg_val_loss = val_loss / len(test_loader)
    test_loss_values.append(avg_val_loss)


    avg_train_loss = train_loss / len(train_loader)
    train_loss_values.append(avg_train_loss)

    print(f"Epoch {epoch}, train_loss: {avg_train_loss}, val_loss: {avg_val_loss}")


100%|██████████| 1488/1488 [17:32<00:00,  1.41it/s]
100%|██████████| 372/372 [01:27<00:00,  4.23it/s]


Epoch 0, train_loss: 0.600176092818059, val_loss: 0.47433350347383046


100%|██████████| 1488/1488 [17:31<00:00,  1.42it/s]
100%|██████████| 372/372 [01:27<00:00,  4.24it/s]


Epoch 1, train_loss: 0.4685447421425613, val_loss: 0.45927348760988124


100%|██████████| 1488/1488 [17:31<00:00,  1.42it/s]
100%|██████████| 372/372 [01:27<00:00,  4.23it/s]

Epoch 2, train_loss: 0.4428419076427016, val_loss: 0.4515089931427151





In [None]:
model.save_pretrained('/content/drive/MyDrive/NLP_Project/models/fineTune_models/MedQuad_left_pad_Latest_fineTuned_gpt2_3epoch'+date_string)

## 7. Model Evaluation
- Generating the sacreBleu and Rouge scores on test datasset.



In [14]:
model=GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/NLP_Project/models/fineTune_models/MedQuad_left_pad_Latest_fineTuned_gpt2_3epochfine_tune_model_2024_04_06_14_18_29/').to(device)


In [15]:
bleu={}
rouge={}


In [16]:
bleu['score']=6.27
rouge['rouge1']=0.117
rouge['rougeL']=0.103

In [17]:
model.eval()
bleu=evaluate.load('sacrebleu')
rouge=evaluate.load('rouge')

for _, _, raw_data, toknized_questions_id,toknized_questions_mask in tqdm(test_loader):
    input_ids=toknized_questions_id.to(device)
    attention_mask=toknized_questions_mask.to(device)
    answers=[qa.split('[response]')[-1] for qa in raw_data]
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,  pad_token_id=tokenizer.pad_token_id,max_new_tokens=MAX_LENGTH)
    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    for pred, ref in zip(answers, decoded_outputs):
        bleu.add(references = ref, predictions=pred)
        rouge.add(references = ref, predictions=pred)

bleu=bleu.compute()
rouge=rouge.compute()




print(f"Bleu: {bleu['score']},  Rouge:{rouge['rouge1']}, Rouge:{rouge['rougeL']}")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 205/205 [10:39<00:00,  3.12s/it]


Bleu: 6.27,  Rouge:0.117, Rouge:0.103


### 7.3) Chekcing the reponse for any one question.

In [20]:
output_1391=model.generate(input_ids=torch.tensor(tokenizer(questions_list[11])['input_ids']).reshape(1,-1).to(device),attention_mask=torch.tensor(tokenizer(questions_list[11])['attention_mask']).reshape(1,-1).to(device), pad_token_id=tokenizer.pad_token_id,max_new_tokens=MAX_LENGTH)

In [21]:
tokenizer.decode(tokenizer(bos_token+questions_list[11])['input_ids'][1:],skip_special_tokens=False)

'<|endoftext|>What to do for Gastritis? [response] '

In [22]:
tokenizer.decode(output_1391[0], skip_special_tokens=False)

'<|endoftext|>What to do for Gastritis? [response]  Gastritis is a disease in which the body does not make enough of a protein called a substance called a protein.  The body does not make enough of a protein called a substance called a protein. The body does not make enough of a protein called a substance called a protein. Gastritis is a disease in which the body does not make enough of a protein called a substance called a protein. The body does not make enough of a protein called a substance called a protein. Gastritis is a disease in which the body does not make enough of a protein called a substance. The body does not make enough of a protein called a substance. Gastritis is a disease in which the body does not make enough of a protein called a substance. Gastritis is a disease in which the body does not make enough of a protein called a substance. Gastritis is a disease in which the body does not make enough of a protein called a substance. Gastritis is a disease in which the body