In [2]:
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from IPython.display import FileLink, FileLinks
import sklearn
import os
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

In [3]:
# !pip3 install datasets --upgrade            #resolving the error i.e keyerror: length when loading dataset from huggingface_hub

Specifying the model and tokenizer

In [4]:
model_id = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

tokenization and adding pad token (eos_token) especially for the text generational model

In [6]:
# tokenizer.padding_side = "left"
# tokenizer.pad_token = tokenizer.eos_token

# def tokenize_dataset(example):
#     return tokenizer(example['text'], padding = 'max_length', truncation = True, max_length= 128, return_tensors = "pt")
# tokenized_dataset = dataset.map(tokenize_dataset)
# tokenized_dataset = tokenized_dataset.with_format('torch')

Pushing to hugging face for future purposes

In [None]:
#go into the paperspace terminal and type huggingface-cli login and enter the token and comeback here

# from huggingface_hub import notebook_login
# notebook_login()
# tokenized_dataset.push_to_hub('Yunij/daig2-tokenized-dataset')

Loading the updated tokenized dataset (padding to the left scenario after adding the eos token) to the huggingface hub

In [7]:
tokenized_dataset = load_dataset('Yunij/daig2-tokenized-dataset')

Downloading readme:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44868 [00:00<?, ? examples/s]

In [8]:

tokenized_dataset = tokenized_dataset['train'].train_test_split(test_size=0.2, seed = 42) 
tokenized_dataset = tokenized_dataset.with_format('torch')

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'cleaned_text', 'input_ids', 'attention_mask'],
        num_rows: 35894
    })
    test: Dataset({
        features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'cleaned_text', 'input_ids', 'attention_mask'],
        num_rows: 8974
    })
})

Define your model (finetuning on top of gpt2)

In [10]:

class gpt2essayprediction(nn.Module):

    def __init__(self, num_classes: int, hidden_size: int, max_sequence_length: int, model_id: str):
        super().__init__()  
        self.gpt2model = GPT2LMHeadModel.from_pretrained(model_id)
        self.linear_projection = nn.Linear(hidden_size * max_sequence_length, num_classes)   #input should be matched

    def forward(self, input_id, attention_mask):
        # input_dict = {'input_id': input_id,'attention_mask': attention_mask}

        logits,_ = self.gpt2model(input_ids = input_id, attention_mask = attention_mask, return_dict = False)
        batch_size = logits.shape[0]
        output = self.linear_projection(logits.view(batch_size,-1))   #converting (16,columns) to (16,2) where 2 is the num_classes
        return output

GPU utilization

In [11]:
#global values

hidden_size = 50257
max_sequence_length = 128
learning_rate = 3e-4

device = "cuda" if torch.cuda.is_available() else "cpu"
model = gpt2essayprediction(num_classes=2, hidden_size=hidden_size, max_sequence_length=max_sequence_length, model_id="gpt2").to(device)


Loss function and creating a pytorch optimizer

In [12]:
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

Creating pytorch training and testing Dataloader

In [13]:
training_data = tokenized_dataset["train"]
testing_data = tokenized_dataset["test"]
train_dataloader = DataLoader(
    training_data, batch_size=16, shuffle=True, drop_last=True
)
test_dataloader = DataLoader(testing_data, batch_size=16, shuffle=True, drop_last = True)

Training_loop

In [14]:
epochs = 3
for epoch in range(epochs):
    total_train_loss = 0
    total_val_loss = 0
    total_acc_train = 0
    total_acc_val = 0

    model.train()

    for batch_index, train_input in enumerate(tqdm(train_dataloader)):
        label = train_input["label"].to(device)

        input_ids = train_input["input_ids"].squeeze(1).to(device)
        attention_mask = (
            train_input["attention_mask"].squeeze(1).to(device)
        )  # removing the unnecessary dimension by squeezing it out
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, label)
        total_train_loss += loss.item()
        acc = (logits.argmax(dim=1) == label).sum().item()
        total_acc_train += acc

        # backpropagation

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True

    with torch.no_grad():
        for batch, test_input in enumerate(tqdm(test_dataloader)):
            label = test_input["label"].to(device)
            input_ids = test_input["input_ids"].squeeze(1).to(device)
            attention_mask = (
                test_input["attention_mask"].squeeze(1).to(device)
            )  # removing the unnecessary dimension by squeezing it out
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, label)
            total_val_loss += loss.item()

            # accuracy_score
            acc = (logits.argmax(dim=1) == label).sum().item()
            total_acc_val += acc

            # roc_score

            predictions = torch.argmax(logits.cpu(), dim=1)
            try:
                roc_score = roc_auc_score(
                    predictions, label.cpu()
                )  # catching the exception like ValueError
            except ValueError:
                pass

    print(
        f"Epochs: {epoch + 1} | Train Loss: {total_train_loss/len(training_data): .5f} \
        | Val Loss: {total_val_loss / len(testing_data): .5f} \
        | train_accuracy: {total_acc_train/len(training_data): .5f} \
        | validation_accuracy: {total_acc_val/len(testing_data): .5f} \
        | Roc_score: {roc_score: .5f}"
    )

100%|██████████| 2243/2243 [05:18<00:00,  7.03it/s]
100%|██████████| 560/560 [00:30<00:00, 18.52it/s]


Epochs: 1 | Train Loss:  21.92564         | Val Loss:  1.33507         | train_accuracy:  0.90494         | validation_accuracy:  0.96958         | Roc_score:  1.00000


100%|██████████| 2243/2243 [05:16<00:00,  7.10it/s]
100%|██████████| 560/560 [00:30<00:00, 18.50it/s]


Epochs: 2 | Train Loss:  0.73277         | Val Loss:  0.69925         | train_accuracy:  0.97286         | validation_accuracy:  0.98039         | Roc_score:  0.95000


 10%|█         | 226/2243 [00:31<04:42,  7.15it/s]

Saving the model

In [61]:

print("The state dict keys: \n\n", model.state_dict().keys())

The state dict keys: 

 odict_keys(['gpt2model.transformer.wte.weight', 'gpt2model.transformer.wpe.weight', 'gpt2model.transformer.h.0.ln_1.weight', 'gpt2model.transformer.h.0.ln_1.bias', 'gpt2model.transformer.h.0.attn.bias', 'gpt2model.transformer.h.0.attn.masked_bias', 'gpt2model.transformer.h.0.attn.c_attn.weight', 'gpt2model.transformer.h.0.attn.c_attn.bias', 'gpt2model.transformer.h.0.attn.c_proj.weight', 'gpt2model.transformer.h.0.attn.c_proj.bias', 'gpt2model.transformer.h.0.ln_2.weight', 'gpt2model.transformer.h.0.ln_2.bias', 'gpt2model.transformer.h.0.mlp.c_fc.weight', 'gpt2model.transformer.h.0.mlp.c_fc.bias', 'gpt2model.transformer.h.0.mlp.c_proj.weight', 'gpt2model.transformer.h.0.mlp.c_proj.bias', 'gpt2model.transformer.h.1.ln_1.weight', 'gpt2model.transformer.h.1.ln_1.bias', 'gpt2model.transformer.h.1.attn.bias', 'gpt2model.transformer.h.1.attn.masked_bias', 'gpt2model.transformer.h.1.attn.c_attn.weight', 'gpt2model.transformer.h.1.attn.c_attn.bias', 'gpt2model.transform

We will need to reconstruct the model exactly as it was when trained at loading time, So we need to store information about the model architecture in the checkpoint, along with the state dict.

If you are planning to continue training of the model you'll need to store the optimizer state too.

To do this, you build a dictionary with all the information you need to compeletely rebuild the model.

In [64]:
checkpoint = {
    'model': gpt2essayprediction(num_classes = 2, hidden_size=50257,max_sequence_length=128, model_id = 'gpt2'),
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()}

torch.save(checkpoint, '/notebooks/checkpoint.pth')


Loading the saved checkpoint

In [72]:
filepath = '/notebooks/checkpoint.pth'
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False

    model.eval()

    return model


In [12]:
# batch_size = 16
# test_dataset = tokenized_dataset['test']
# total = len(test_dataset)

# for i in range(0,total,batch_size):

#     input_ids = test_dataset['input_ids'].squeeze(1)
#     attention_mask = test_dataset['attention_mask'].squeeze(1)
#     batch_input_ids = input_ids[i:i + batch_size]
#     batch_attention_mask = attention_mask[i: i + batch_size]
#     # input_dict = {'input_ids': batch_input_ids,
#     #               'attention_mask': batch_attention_mask
#     # }
#     with torch.no_grad():
#         logits, _= model(input_ids = batch_input_ids, attention_mask = batch_attention_mask, return_dict = False) #input_format....return_dict = False for returning logits as a torch.tensor types.
#     break 
