## Importing Libraries

In [1]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel,GPT2Tokenizer,TrainingArguments,Trainer,TextDataset,DataCollatorForLanguageModeling
import warnings 
warnings.filterwarnings('ignore')

## Loading the Pretrained Model and Tokenizer

In [20]:
# Base checkpoint name; the tokenizer and the LM-head model are both
# loaded from this same pretrained identifier.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [15]:
# Read the question/answer table that the exploration cells below inspect.
data = pd.read_csv('final_data.csv')

In [16]:
# (rows, columns) of the loaded frame.
data.shape

(20855, 3)

In [17]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0.1,Unnamed: 0,Questions,Answers
0,0,Hi,Hello there. Tell me how are you feeling today?
1,1,Hey,Hi there. What brings you here today?
2,2,Is anyone there?,Hi there. How are you feeling today?
3,3,Hi there,Great to see you. How do you feel currently?
4,4,Hello,Hello there. Glad to see you. What is going on...


In [18]:
# Preview the last rows of the frame.
data.tail()

Unnamed: 0.1,Unnamed: 0,Questions,Answers
20850,20850,"Okay,Thanks a lot",Thank you for visiting. Have a nice day!
20851,20851,"Got it,Thank you",Thank you for visiting. Have a nice day!
20852,20852,"Got it,Thank you so much",Thank you for visiting. Have a nice day!
20853,20853,"Got it,Thanks",Thank you for visiting. Have a nice day!
20854,20854,"Got it,Thanks a lot",Thank you for visiting. Have a nice day!


In [19]:
# Column dtypes and non-null counts. In the recorded output, `Answers` has
# 20852 non-null values out of 20855 rows, i.e. 3 rows have no answer.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20855 entries, 0 to 20854
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20855 non-null  int64 
 1   Questions   20855 non-null  object
 2   Answers     20852 non-null  object
dtypes: int64(1), object(2)
memory usage: 488.9+ KB


In [20]:
# Count repeated question/answer pairs.
# The check is restricted to the text columns on purpose: the frame also
# carries an `Unnamed: 0` column that appears to be a per-row counter, so
# comparing whole rows would report 0 duplicates no matter what the text
# columns contain.
data.duplicated(subset=['Questions','Answers']).sum()

0

In [35]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by the other cells shown here.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [36]:
def load_dataset(file_path, tokenizer, block_size=1024):
    """Wrap the text file at ``file_path`` in a ``TextDataset``.

    Args:
        file_path: Path of the training text file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (defaults to 1024).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

In [37]:
def load_data_collator(tokenizer, mlm=False):
    """Create the language-modeling data collator for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded to the collator as its ``mlm`` flag
            (defaults to ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [38]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          train_batch_size, epochs, learning_rate=0.001, logging_steps=50,
          save_steps=150, save_total_limit=2):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file with ``Trainer``.

    Args:
        train_file_path: Path of the training text file; passed to
            ``load_dataset``.
        model_name: Pretrained checkpoint name or path used to load both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer checkpoints.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        train_batch_size: Forwarded as ``per_device_train_batch_size``.
        epochs: Forwarded as ``num_train_epochs``.
        learning_rate: Initial learning rate. The default of 0.001 is the
            value this function previously hard-coded; it is well above the
            ``TrainingArguments`` default of 5e-5, so pass a smaller value
            if training is unstable.
        logging_steps: Number of steps between log entries (default 50).
        save_steps: Number of steps between checkpoints (default 150).
        save_total_limit: Maximum number of checkpoints kept (default 2).

    Returns:
        A ``(trainer, train_output)`` tuple: the ``Trainer`` instance and the
        value returned by ``trainer.train()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_data = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Persist the tokenizer and the not-yet-trained base model up front, so
    # output_dir is populated before training starts.
    tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=train_batch_size,
        logging_dir='./log',
        logging_steps=logging_steps,
        save_steps=save_steps,
        logging_first_step=True,
        save_total_limit=save_total_limit,
        learning_rate=learning_rate,
    )

    trainer = Trainer(
        model=model,
        args=training_arguments,
        data_collator=data_collator,
        train_dataset=train_data,
    )
    train_output = trainer.train()
    # Save the model once more after training has finished.
    trainer.save_model()
    return trainer, train_output

In [39]:
# Settings for the fine-tuning run.
# NOTE: this is a machine-specific absolute path; point it at your own copy
# of the training file before running elsewhere.
train_file_path = 'C:\\Users\\DELL\\Documents\\CWB_HACK\\500_.csv'
# `model_name` is reused as defined in the model-loading cell; the former
# `model_name=model_name` self-assignment did nothing and was removed.
output_dir = './model'
overwrite_output_dir = True
train_batch_size = 3
epochs = 10

In [40]:
# `train` returns a (trainer, train_output) pair, so `hist` holds that whole
# tuple: hist[0] is the Trainer and hist[1] is what trainer.train() returned.
hist = train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    train_batch_size=train_batch_size,
    epochs=epochs,
)


  0%|          | 0/490 [00:00<?, ?it/s]

{'loss': 3.4527, 'grad_norm': 5.69218111038208, 'learning_rate': 0.0009979591836734693, 'epoch': 0.02}
{'loss': 3.5727, 'grad_norm': 1.2075151205062866, 'learning_rate': 0.0008979591836734694, 'epoch': 1.02}
{'loss': 2.6217, 'grad_norm': 1.04498291015625, 'learning_rate': 0.0007959183673469387, 'epoch': 2.04}
{'loss': 1.9433, 'grad_norm': 1.0610477924346924, 'learning_rate': 0.0006938775510204082, 'epoch': 3.06}
{'loss': 1.3459, 'grad_norm': 1.0818220376968384, 'learning_rate': 0.0005918367346938776, 'epoch': 4.08}


KeyboardInterrupt: 

In [None]:
# hist is the (trainer, train_output) tuple returned by `train`; the summary
# metrics live on the second element.
train_metrics = hist[1].metrics
print('Run-time: ', train_metrics['train_runtime'])
print('Loss: ', train_metrics['train_loss'])

Run-time:  255.8535
Loss:  0.6654784452915191
