# Finetuning Base Models

In [1]:
%%bash
# install necessary libraries
pip install -r requirements.txt

[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: pip install --upgrade pip
Collecting langchain-experimental
  Downloading langchain_experimental-0.0.53-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain<0.2.0,>=0.1.8 (from langchain-experimental)
  Downloading langchain-0.1.11-py3-none-any.whl.metadata (13 kB)
Collecting langchain-core<0.2.0,>=0.1.27 (from langchain-experimental)
  Downloading langchain_core-0.1.30-py3-none-any.whl.metadata (6.0 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain<0.2.0,>=0.1.8->langchain-experimental)
  Downloading SQLAlchemy-2.0.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain<0.2.0,>=0.1.8->langchain-experimental)
  Downloading aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x8

In [2]:
# import necessary libraries
import json
import pandas as pd
import os
import torch
from datasets import load_dataset,DatasetDict, Dataset
from transformers import pipeline,DataCollatorForSeq2Seq,AutoModelForSeq2SeqLM, AutoTokenizer,TrainingArguments, Trainer

In [3]:
# Number GPUs in PCI bus order and make only the device at index 1 visible to CUDA
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [18]:
# Path of the local training file, read line by line as JSON records by load_data.
# Download it from https://drive.google.com/file/d/1tuiAVkfy_EVM0zavMcWPvqeuEGDdpbnG/view?usp=sharing
INPUT_DATA_PATH = "./finetune_data.txt"
# Model id passed to from_pretrained for both the tokenizer and the seq2seq model
MODEL_CHECKPOINT = "facebook/bart-base"

## Load Pubmed Dataset
Load the data into a DataFrame indexed by `article_id`, with `article_text` and `abstract_text` as columns.

In [19]:
def _strip_sentence_tags(sentence: str) -> str:
    """Remove one leading ``<S>`` and one trailing ``</S>`` marker, plus surrounding whitespace."""
    text = sentence.strip()
    if text.startswith("<S>"):
        text = text[len("<S>"):]
    if text.endswith("</S>"):
        text = text[:-len("</S>")]
    return text.strip()


def load_data(path:str=INPUT_DATA_PATH)->pd.DataFrame:
    """
    Load Pubmed dataset to dataframe

    Each non-blank line of the file is parsed as one JSON object that must
    provide ``article_id``, ``article_text`` (list of strings) and
    ``abstract_text`` (list of strings, optionally wrapped in ``<S>``/``</S>``).

    Args:
        path: location of the JSON-lines file to read.

    Returns:
        DataFrame indexed by article id with the columns ``article_text`` and
        ``abstract_text``; the sentences of each field are joined by a single
        space. If an article id occurs more than once, the last record wins.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected fields.
    """
    pubmed_data={}
    print("############# Started Data Loading ##############")
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            record = json.loads(line)  # parse each line once
            article_text = " ".join(record['article_text'])
            # str.strip("<S> </S>") would treat its argument as a set of characters
            # and also eat genuine leading/trailing "S" or "/" characters, so the
            # markers are removed explicitly; sentences are space-separated so
            # that they do not run into each other.
            abstract_text = " ".join(_strip_sentence_tags(sentence) for sentence in record['abstract_text'])
            pubmed_data[record['article_id']]=[article_text,abstract_text]
    df=pd.DataFrame.from_dict(pubmed_data,orient='index',columns=['article_text','abstract_text'])
    print("############# Finished Data Loading ##############")
    return df

## Preprocess Data to convert it from Dataframe to DatasetDict

In [20]:
class preprocess_data():
    """
    Turn the article/abstract dataframe into tokenized train/validation/test splits.
    """
    # columns exposed as torch tensors once the splits are formatted
    columns = ['input_ids', 'labels', 'attention_mask']
    # exclusive end row of the train, validation and test slices respectively
    TRAIN_RANGE = 1000
    VAL_RANGE = 1125
    TEST_RANGE = 1250
    # loaded once, when the class body is executed
    TOKENIZER=AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    def get_feature(self,batch:DatasetDict)->dict:
        """
        Tokenize one batch: articles become the inputs, abstracts the labels.

        ``batch`` is indexed with the keys 'article_text' and 'abstract_text';
        both sides are truncated to at most 512 tokens.
        """
        tokenized = self.TOKENIZER(
            batch['article_text'],
            text_target=batch['abstract_text'],
            max_length=512,
            truncation=True,
        )
        return {key: tokenized[key] for key in ('input_ids', 'attention_mask', 'labels')}

    def map_data(self,pubmed_df:pd.DataFrame)->DatasetDict:
        """
        Slice the dataframe rows into splits, tokenize them and set torch format.
        """
        articles = pubmed_df['article_text'].to_list()
        abstracts = pubmed_df['abstract_text'].to_list()
        # row window of every split, in the order the splits are created
        row_windows = {
            "train": slice(None, self.TRAIN_RANGE),
            "validation": slice(self.TRAIN_RANGE, self.VAL_RANGE),
            "test": slice(self.VAL_RANGE, self.TEST_RANGE),
        }
        raw_splits = DatasetDict({
            split_name: Dataset.from_dict({
                "article_text": articles[rows],
                "abstract_text": abstracts[rows],
            })
            for split_name, rows in row_windows.items()
        })
        tokenized_splits = raw_splits.map(self.get_feature, batched=True)
        tokenized_splits.set_format(type='torch', columns=self.columns)
        return tokenized_splits

## Trainer Class

In [29]:
class trainer_class:
    """
    Fine-tune the MODEL_CHECKPOINT seq2seq model on tokenized dataset splits.
    """
    def __init__(self,pubmed_pt,output_dir=None,**training_overrides):
        """
        initialize arguments for training

        Args:
            pubmed_pt: mapping of tokenized splits; the 'train' and 'validation'
                entries are read by train_model.
            output_dir: directory handed to TrainingArguments. Defaults to
                MODEL_CHECKPOINT + '_checkpoints'.
            **training_overrides: extra TrainingArguments keyword arguments
                that replace the defaults listed below.
        """
        self.MODEL=AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
        self.TOKENIZER=AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
        self.DATA_COLLATOR = DataCollatorForSeq2Seq(self.TOKENIZER, model=self.MODEL)
        if output_dir is None:
            # Do not write to a directory named exactly like the model id: once a
            # local 'facebook/bart-base' folder exists, from_pretrained resolves
            # the id to that folder instead of the Hub and a re-run fails.
            output_dir = MODEL_CHECKPOINT + '_checkpoints'
        # NOTE(review): with the 1000-row train split set by
        # preprocess_data.TRAIN_RANGE, batch size 4 and 16 accumulation steps
        # give about 15 optimizer steps per epoch on a single device (~150 in
        # 10 epochs), so warmup_steps=500 and eval_steps=500 are presumably
        # never reached - confirm and override them if evaluation is wanted.
        training_config = dict(
            output_dir = output_dir,
            num_train_epochs=10,
            warmup_steps = 500,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            weight_decay = 0.01,
            logging_steps = 10,
            evaluation_strategy = 'steps',
            eval_steps=500,
            # integer step count; large enough that no intermediate checkpoint is expected
            save_steps=1_000_000,
            gradient_accumulation_steps=16
            )
        training_config.update(training_overrides)
        self.TRAINING_ARGS = TrainingArguments(**training_config)
        self.PUBMED_PT=pubmed_pt

    def train_model(self):
        """
        train the model and save it locally

        Returns:
            The value returned by Trainer.train(), so that a notebook cell can
            display the training summary.
        """
        trainer = Trainer(
            model=self.MODEL,
            args=self.TRAINING_ARGS,
            tokenizer=self.TOKENIZER,
            data_collator=self.DATA_COLLATOR,
            train_dataset = self.PUBMED_PT['train'],
            eval_dataset = self.PUBMED_PT['validation'],
        )
        train_output = trainer.train()
        trainer.save_model(MODEL_CHECKPOINT+'_model')
        return train_output

## Start finetuning

In [22]:
%%time
# Parse the file at INPUT_DATA_PATH (the default path of load_data) into a DataFrame.
pubmed_df=load_data()

############# Started Data Loading ##############
############# Finished Data Loading ##############


In [23]:
%%time
# Slice the DataFrame into train/validation/test splits and tokenize each of them.
pubmed_pt = preprocess_data().map_data(pubmed_df)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [36]:
%%time
# Fine-tune on the 'train' split (evaluating on 'validation') and save the model
# under MODEL_CHECKPOINT + '_model'.
trainer_class(pubmed_pt).train_model()

Step,Training Loss,Validation Loss


TrainOutput(global_step=12, training_loss=4.360424836476644, metrics={'train_runtime': 1235.9057, 'train_samples_per_second': 0.647, 'train_steps_per_second': 0.01, 'total_flos': 234138799964160.0, 'train_loss': 4.360424836476644, 'epoch': 0.96})