<a href="https://colab.research.google.com/github/alexali04/gpt2_finetune/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Google Colab will detail how I fine-tuned GPT2 on movie reviews in order to improve its auto-regressive capacity in that domain.

In [None]:
## make sure to use T4 GPU

# Notebook shell magic: installs the Hugging Face transformers package into the runtime.
!pip install transformers

import os

# Set before torch is imported so the setting is in place when CUDA is initialised.
# CUDA_LAUNCH_BLOCKING=1 is the CUDA switch for synchronous kernel launches -
# presumably enabled here to get usable stack traces while debugging.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer and language-model head that later cells fine-tune.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")



# Pre-Processing

In [None]:
import pandas as pd
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [None]:
## much faster to load the CSVs from drive than into Colab directly - name them whatever you like but be sure the path / filenames match

from google.colab import drive
drive.mount('/content/drive')
summaries = pd.read_csv('/content/drive/MyDrive/summaries.csv')
## ignore bad lines - customize as you like
## (on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in
## pandas 1.3 and removed in pandas 2.0)
reviews = pd.read_csv('/content/drive/MyDrive/reviews.csv', engine='python', on_bad_lines='skip')

print(summaries.shape) ## gut check
print(reviews.shape)

In [None]:
# Reduce both sources to the same two columns ('uid', 'text') so they can be
# stacked into one frame. The id column keeps the name 'uid' in both frames:
# the filtering cell below drops rows whose 'uid' is null, so renaming it on
# the summaries side would silently discard every summary after the concat.
summaries_relevant = summaries[['uid', 'synopsis']].rename(columns = {'synopsis': 'text'})

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `reviews` (avoids pandas' chained-assignment warning).
reviews_relevant = reviews[['uid', 'text']].copy()
# Strip the "more pics" boilerplate from the review text, case-insensitively.
reviews_relevant['text'] = reviews_relevant['text'].str.replace("more pics", "", case=False, regex=True)

# Draw an equal-sized random sample from each source.
reviews_relevant = reviews_relevant.sample(n = 18000) # number subject to change
animes_relevant = summaries_relevant.sample(n = 18000)

# Stack the two *sampled* frames (previously the unsampled summaries were
# concatenated and `animes_relevant` was never used).
df = pd.concat([animes_relevant, reviews_relevant], ignore_index=True)

In [None]:
# Drop rows that are missing either the text or the id ...
df = df.dropna(subset=['text', 'uid'])
# ... then rows whose text is empty once surrounding whitespace is removed.
has_content = df['text'].str.strip() != ""
df = df[has_content]
print(df.shape)

(35090, 2)

In [None]:
# Trial-run subset: 2000 random rows, drawn before `df` itself is down-sampled.
small_dataset = df.sample(n=2000)
# Working set for the full run (number subject to change).
df = df.sample(n=24000)


In [None]:
# Hold out five random rows for evaluation; everything else stays in df.
test_set = df.sample(n=5)
held_out = df.index.isin(test_set.index)
df = df.loc[~held_out]
print(df.shape)

# reset indices
test_set = test_set.reset_index()
df = df.reset_index()

# Split each held-out text on whitespace once, then store the last 20 words in
# 'true_end' and leave everything before them in 'text'.
words = test_set['text'].str.split()
test_set['true_end'] = words.str[-20:].apply(' '.join)
test_set['text'] = words.str[:-20].apply(' '.join)
#test_set.to_csv("/content/test.csv")

(23995, 2)


In [None]:
test_set.head() ## gut check - displays the first rows of the held-out test set

In [None]:
class Reviews(Dataset):
  """Tokenized texts, each prefixed with a ``<|control_code|>`` tag.

  Every text is encoded with the GPT-2 tokenizer, cut to at most
  ``max_length`` tokens and stored as a 1-D tensor of token ids.
  """

  def __init__(self, control_code, truncate = False, gpt2_type = "gpt2", max_length = 1024,
               texts = None, max_reviews = 20000):
    """Tokenize all texts up front.

    Args:
      control_code: label placed inside ``<|...|>`` in front of every text.
      truncate: when True, keep at most ``max_reviews`` texts.
      gpt2_type: name handed to ``GPT2Tokenizer.from_pretrained``.
      max_length: maximum number of tokens kept per text.
      texts: iterable of strings to tokenize. When omitted, the notebook-level
        ``df['text']`` column is used, which is what this class always read
        before the parameter existed.
      max_reviews: cap on the number of texts when ``truncate`` is True.
    """
    ## tokenizer will represent 'subwords' as integers to pass into neural network
    ## breaks up sentences into tokens so the model can interpret them / learn relations between them
    ## subword tokenization > word / character tokenization - handles OOV well and isn't as long as character tokenization
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

    if texts is None:
      texts = df['text']

    self.reviews = []

    ## convert each text into token ids and keep them as a tensor the model can consume
    for i, row in enumerate(texts):
      # Check the cap *before* tokenizing so exactly max_reviews items are kept
      # (checking after the append kept one extra).
      if truncate and i >= max_reviews:
        break
      tokens = self.tokenizer.encode(f"<|{control_code}|>{row}")
      self.reviews.append(torch.tensor(tokens[:max_length]))
      if i % 1000 == 0:
        print(i)  # progress indicator

    self.reviews_count = len(self.reviews)

  def __len__(self):
    """Number of tokenized texts held."""
    return self.reviews_count

  def __getitem__(self, item):
    """Return the token-id tensor stored at position ``item``."""
    return self.reviews[item]

# The texts are passed through `texts=`; the first argument is only the tag
# string. Previously the Series itself was passed as control_code, so its repr
# was embedded in every prompt and the global df was tokenized instead.
# NOTE(review): "review" is a placeholder tag - use the same tag when prompting
# the fine-tuned model.
#dataset = Reviews("review", texts = df['text'], truncate = False, gpt2_type = "gpt2")
dataset = Reviews("review", texts = small_dataset['text'], truncate = False, gpt2_type = "gpt2")


In [None]:
## move in the direction anti-parallel to the average of the computed gradients of the loss function

def pack_tensor(new_tensor, packed_tensor, max_seq_len):
  """Try to add ``new_tensor`` to the sequence being packed.

  Both tensors are 2-D and are joined along dim 1.

  Args:
    new_tensor: the next entry to pack.
    packed_tensor: what has been packed so far, or None to start a new pack.
    max_seq_len: largest allowed combined size along dim 1.

  Returns:
    A tuple ``(tensor, carry_on, remainder)``:
      * nothing packed yet  -> ``(new_tensor, True, None)``
      * entry fits          -> ``(combined, True, None)``
      * entry does not fit  -> ``(packed_tensor, False, new_tensor)``; the
        pack is complete and ``new_tensor`` is handed back unused.
  """
  if packed_tensor is None:
    return new_tensor, True, None

  if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
    # Return the pack built so far. Returning new_tensor here (as before)
    # threw away every entry accumulated in packed_tensor.
    return packed_tensor, False, new_tensor

  # The first position of the existing pack is dropped when joining.
  packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim = 1)
  return packed_tensor, True, None

In [None]:
## NOTE - this will be the longest part of the whole process - ensure everything works FIRST before training

def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with packed sequences and gradient accumulation.

    Entries are drawn one at a time in shuffled order and packed together with
    ``pack_tensor``; each completed pack is one forward/backward pass with the
    inputs also used as labels. Gradients are accumulated over ``batch_size``
    passes before each optimizer step.

    Args:
        dataset: dataset whose items are token-id tensors.
        model: model to fine-tune; called as ``model(ids, labels=ids)``.
        tokenizer: accepted for interface compatibility; not used here.
        batch_size: number of backward passes accumulated per optimizer step.
        epochs: number of passes over the dataset.
        lr: AdamW learning rate.
        max_seq_len: accepted but not used (see NOTE below).
        warmup_steps: optimizer steps over which the learning rate warms up.
        gpt2_type: accepted for interface compatibility; not used here.
        output_dir: directory for per-epoch checkpoints.
        output_prefix: filename prefix for per-epoch checkpoints.
        test_mode: accepted for interface compatibility; not used here.
        save_model_on_epoch: when True, save the state_dict after every epoch.

    Returns:
        The trained model (on the training device).
    """
    # NOTE(review): the packing limit has always been this fixed value; the
    # max_seq_len argument is ignored. Left as-is so existing runs behave the
    # same - decide whether the parameter should drive this instead.
    pack_limit = 768

    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    num_entries = len(train_dataloader)

    # Upper bound on optimizer steps: packing can only reduce the number of
    # backward passes below epochs * num_entries. The previous value of -1 made
    # the linear schedule's multiplier 0 as soon as warm-up ended, which
    # stopped all learning after `warmup_steps` optimizer steps.
    total_steps = max(1, (epochs * num_entries + batch_size - 1) // batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss) ## labels are the tokenized end of the sentence - this is still SL

        # tqdm wraps the dataloader (not enumerate) so it can show the total.
        for idx, entry in enumerate(tqdm(train_dataloader)):
            # NOTE(review): the third value is an entry that did not fit in the
            # pack; it is not carried into the next pack (unchanged behaviour).
            input_tensor, carry_on, _remainder = pack_tensor(entry, input_tensor, pack_limit)

            # Keep packing until the pack is full or the data runs out.
            if carry_on and idx != num_entries - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            # Count this pass first, so the optimizer steps after every
            # `batch_size` passes rather than right after the very first one.
            accumulating_batch_count += 1
            if accumulating_batch_count % batch_size == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            input_tensor = None  # start a fresh pack

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

The AdamW optimizer tends to generalize better than the Adam optimizer because of how it handles L2 regularization (loss function = loss + sum of squares of weights). Adam does not combine well with L2 regularization because its update divides by the square root of the second moment of the gradients - this is an estimate of the uncentered variance of the gradients. That division is what gives Adam an adaptive learning rate (variable gradient descent step size): large gradients in the past reduce the step size (lots of change / very mountainous region), while small past gradients increase it (flat regions, where we're less likely to converge on a minimum). However, because the gradient of the L2 penalty goes through the same update rule, Adam (unintentionally) regularizes weights with large past gradients less than weights with small past gradients. AdamW fixes this by decoupling the weight decay from the gradient-based update, so every weight is decayed at the same rate.

In [None]:
model = train(dataset, model, tokenizer)

torch.save(model.state_dict(), "/content/drive/MyDrive/model_large.pth")