# Finetuning GPT2 with Netflix Descriptions
Taken ref from: https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook

Which took ref from: https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

## Setup

In [1]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

In [2]:
%%capture
if IN_COLAB:
    
    #Remove not needed python versions to free space
    !rm -rf "/usr/local/lib/python2.7"
    !rm -rf "/usr/lib/python2.7"

    # Clone the repo.
    # !git clone ""

    # Change the working directory to the repo root.
    # %cd

    # Add the repo root to the Python path.
    # import sys, os
    # sys.path.append(os.getcwd())
    
    #Install packages not native to colab
    !pip install python-dotenv
    !pip install transformers
    !pip install transformers[onnx]
    !pip install evaluate
    !pip install wandb --upgrade

    # !pip install pandas-profiling --upgrade

    #Mount GDrive to access .env file
    from google.colab import drive
    drive.mount('/content/gdrive')

    #Load env file
    #NOTE: gdrive wont allow you to mount dotfiles
    from dotenv import load_dotenv
    load_dotenv("./gdrive/MyDrive/my_env_file")
else:
    pass

In [3]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
!nvidia-smi

Tue Oct 11 05:19:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    20W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   74C    P8    22W /  70W |      0MiB / 15360MiB |      0%      Default |
|       

In [5]:
import torch
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f8e1084ba90>

## Model Training

In [6]:
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [7]:
#These settings are set for 4 x T4s
_model_conf = {
    "dataset_artifact": "netflix-shows",
    "dataset_path": "data/netflix_titles.csv",
    "dataset_version": "latest",
    "text_column": "description",
    "base_gpt_model": "gpt2-medium",
    "bos_token": '<|startoftext|>',
    "eos_token": '<|endoftext|>',
    "pad_token": '<|pad|>',
    "train_split": 0.99,
    "num_train_epochs": 10,
    "logging_steps": 1,
    "save_steps": 500,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "warmup_steps": 10,
    "weight_decay": 0.05,
    "evaluation_strategy": "steps",
    "eval_steps": 10,
    "evaluation_metrics": [
        "bleu", 
        "google_bleu", 
        # "mauve"
                           ]
}

In [8]:
project_name = "gpt2-netflix"
run_name = "finetune_gpt2"
run_type = "train"

In [9]:
run = wandb.init(
        project=project_name, job_type=run_type, name=run_name, config = _model_conf)

In [10]:
model_conf = run.config

In [11]:
dataset_artifact = model_conf["dataset_artifact"]
dataset_version = model_conf["dataset_version"]
dataset_path = model_conf["dataset_path"]
text_column = model_conf["text_column"]

base_gpt_model = model_conf["base_gpt_model"]
bos_token = model_conf["bos_token"]
eos_token = model_conf["eos_token"]
pad_token = model_conf["pad_token"]

train_split = model_conf["train_split"]

num_train_epochs = model_conf["num_train_epochs"]
logging_steps = model_conf["logging_steps"]
save_steps = model_conf["save_steps"]
per_device_train_batch_size = model_conf["per_device_train_batch_size"]
per_device_eval_batch_size = model_conf["per_device_eval_batch_size"]
warmup_steps = model_conf["warmup_steps"]
weight_decay = model_conf["weight_decay"]

evaluation_strategy = model_conf["evaluation_strategy"]
eval_steps = model_conf["eval_steps"]
evaluation_metrics = model_conf["evaluation_metrics"]

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained(base_gpt_model, bos_token=bos_token,
                                          eos_token=eos_token, pad_token=pad_token)
model = GPT2LMHeadModel.from_pretrained(base_gpt_model).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

In [13]:
netflix_dataset_art = run.use_artifact(f"{dataset_artifact}:{dataset_version}").get_path(dataset_path)
netflix_dataset_path = netflix_dataset_art.download()

In [14]:
import pandas as pd

In [15]:
netflix_descriptions = pd.read_csv(netflix_dataset_path)[text_column]

In [16]:
max_length = max([len(tokenizer.encode(description)) for description in netflix_descriptions])

In [17]:
run.config.update({"max_length": max_length})

In [18]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, bos_token, eos_token, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer(bos_token + txt + eos_token, truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [19]:
dataset = NetflixDataset(netflix_descriptions, tokenizer, bos_token, eos_token, max_length=max_length)

In [20]:
train_size = int(train_split * len(dataset))

In [21]:
run.config.update({"train_size": train_size})

In [22]:
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [23]:
import evaluate

In [24]:
# bleu = evaluate.load("bleu")
# google_bleu = evaluate.load("google_bleu")
# mauve = load('mauve')

In [25]:
text_gen_metrics = evaluate.combine(evaluation_metrics)

In [26]:
import numpy as np

In [27]:
def compute_metrics(eval_pred):
    prediction_logits, true_encodings = eval_pred
    prediction_encodings = np.argmax(prediction_logits, axis=-1)
    
    true_texts = tokenizer.batch_decode(true_encodings, skip_special_tokens=True)
    prediction_texts = tokenizer.batch_decode(prediction_encodings, skip_special_tokens=True)
    
    del prediction_logits
    del true_encodings
    del prediction_encodings
    gc.collect()
    
    return text_gen_metrics.compute(predictions=prediction_texts, references=true_texts)

In [28]:
import gc
gc.collect()

1048

In [29]:
torch.cuda.empty_cache()

In [30]:
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=num_train_epochs,
                                  logging_steps=logging_steps,
                                  save_steps=save_steps,
                                  per_device_train_batch_size=per_device_train_batch_size,
                                  per_device_eval_batch_size=per_device_eval_batch_size,
                                  evaluation_strategy=evaluation_strategy,
                                  eval_steps=eval_steps,
                                  warmup_steps=warmup_steps,
                                  weight_decay=weight_decay,
                                  logging_dir='./logs',
                                  report_to = 'wandb') #🪄🐝

In [31]:
model_trainer = Trainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset, 
                        eval_dataset=val_dataset,
                        compute_metrics = compute_metrics,
                        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})

In [32]:
model_trainer.train()

***** Running training *****
  Num examples = 8718
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1370
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length,Google Bleu
10,5.019,3.976054,0.021213,"[0.34360189573459715, 0.05987135081642751, 0.009316770186335404, 0.0016277807921866521]",0.897577,0.902481,2110,2338,0.097701
20,2.2496,2.373108,0.029503,"[0.3555651797314855, 0.08333333333333333, 0.018301267010793053, 0.0014691478942213516]",0.987519,0.987596,2309,2338,0.117568
30,1.923,1.789639,0.055343,"[0.3752166377816291, 0.106804867057233, 0.03145539906103286, 0.007839294463498285]",0.987086,0.987169,2308,2338,0.133709
40,1.8277,1.74204,0.055561,"[0.3697442566103164, 0.1109107303877367, 0.03334899013621419, 0.007352941176470588]",0.986653,0.986741,2307,2338,0.133717
50,1.9088,1.716047,0.060227,"[0.38109756097560976, 0.11508835523334844, 0.03446647780925401, 0.009364218827008379]",0.981874,0.982036,2296,2338,0.137779
60,1.7954,1.700844,0.056887,"[0.38464893153074575, 0.11524500907441017, 0.03451536643026005, 0.007403751233958539]",0.980566,0.980753,2293,2338,0.13818
70,1.7671,1.689114,0.050813,"[0.38280226975120035, 0.11307901907356949, 0.03076194983435873, 0.005434782608695652]",0.979694,0.979897,2291,2338,0.135697
80,1.7181,1.679162,0.055227,"[0.3854529616724739, 0.11599456275487087, 0.03493862134088763, 0.006407097092163627]",0.981874,0.982036,2296,2338,0.138632
90,1.7232,1.673787,0.054735,"[0.38374619730551934, 0.11482820976491863, 0.03155911446066886, 0.00688298918387414]",0.984049,0.984175,2301,2338,0.137317
100,1.7302,1.669682,0.058173,"[0.3874619068350022, 0.1213768115942029, 0.03539405379896177, 0.007389162561576354]",0.982309,0.982464,2297,2338,0.140953


***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  Num examples = 89
  Batch size = 64
***** Running Evaluation *****
  N

TrainOutput(global_step=1370, training_loss=1.5358056969016138, metrics={'train_runtime': 2469.9737, 'train_samples_per_second': 35.296, 'train_steps_per_second': 0.555, 'total_flos': 9804249699287040.0, 'train_loss': 1.5358056969016138, 'epoch': 10.0})

In [33]:
model.save_pretrained("gpt2-netflix-model")
tokenizer.save_pretrained("gpt2-netflix-model")

Configuration saved in gpt2-netflix-model/config.json
Model weights saved in gpt2-netflix-model/pytorch_model.bin
tokenizer config file saved in gpt2-netflix-model/tokenizer_config.json
Special tokens file saved in gpt2-netflix-model/special_tokens_map.json
added tokens file saved in gpt2-netflix-model/added_tokens.json


('gpt2-netflix-model/tokenizer_config.json',
 'gpt2-netflix-model/special_tokens_map.json',
 'gpt2-netflix-model/vocab.json',
 'gpt2-netflix-model/merges.txt',
 'gpt2-netflix-model/added_tokens.json')

In [34]:
artifact_name = "gpt2-netflix-hf"
artifact_type = "model"
artifact_description = "GPT2 model finetuned as per this article: https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook"

In [35]:
model_artifact = wandb.Artifact(name=artifact_name, type=artifact_type, description=artifact_description)

In [36]:
model_artifact.add_dir(f"gpt2-netflix-model")

[34m[1mwandb[0m: Adding directory to artifact (./gpt2-netflix-model)... Done. 9.4s


In [37]:
run.log_artifact(model_artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f8c701b5a50>

In [None]:
run.finish()