# Finetuning GPT2 with Netflix Descriptions
Adapted from: https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook

That notebook was in turn adapted from: https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

## Setup

In [1]:
# Detect whether the notebook runs on Google Colab by probing for the
# `google.colab` package.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    # Only a failed import means "not on Colab"; a bare `except:` would also
    # hide unrelated errors such as KeyboardInterrupt.
    IN_COLAB = False
else:
    IN_COLAB = True

In [2]:
%%capture
if IN_COLAB:
    
    #Remove not needed python versions to free space
    !rm -rf "/usr/local/lib/python2.7"
    !rm -rf "/usr/lib/python2.7"

    # Clone the repo.
    # !git clone ""

    # Change the working directory to the repo root.
    # %cd

    # Add the repo root to the Python path.
    # import sys, os
    # sys.path.append(os.getcwd())
    
    #Install packages not native to colab
    !pip install python-dotenv
    !pip install transformers
    !pip install transformers[onnx]
    !pip install evaluate
    !pip install wandb --upgrade

    # !pip install pandas-profiling --upgrade

    #Mount GDrive to access .env file
    from google.colab import drive
    drive.mount('/content/gdrive')

    #Load env file
    #NOTE: gdrive wont allow you to mount dotfiles
    from dotenv import load_dotenv
    load_dotenv("./gdrive/MyDrive/my_env_file")

In [3]:
# Authenticate with Weights & Biases before the run is created below.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Show which GPU (if any) is attached to this runtime and its memory usage.
!nvidia-smi

Mon Oct 10 17:47:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import torch
# Seed PyTorch's global random number generator so that seeded operations are
# repeatable between runs.
torch.manual_seed(42)

<torch._C.Generator at 0x7f3a37e8ffd0>

## Model Training

In [6]:
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [7]:
_model_conf = {
    "dataset_artifact": "netflix-shows",
    "dataset_path": "data/netflix_titles.csv",
    "dataset_version": "latest",
    "text_column": "description",
    "base_gpt_model": "gpt2-medium",
    "bos_token": '<|startoftext|>',
    "eos_token": '<|endoftext|>',
    "pad_token": '<|pad|>',
    "train_split": 0.99,
    "num_train_epochs": 1,
    "logging_steps": 100,
    "save_steps": 5000,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "warmup_steps": 10,
    "weight_decay": 0.05,
    "evaluation_strategy": "steps",
    "eval_steps": 250,
    "evaluation_metrics": [
        "bleu", 
        "google_bleu", 
        # "mauve"
                           ]
}

In [8]:
# W&B bookkeeping for this run: project, display name and job type.
project_name, run_name, run_type = (
    "gpt2-netflix",
    "finetune_gpt2",
    "train",
)

In [9]:
# Open the W&B run and attach the hyper-parameter dict as its config.
run = wandb.init(
    project=project_name,
    name=run_name,
    job_type=run_type,
    config=_model_conf,
)

In [None]:
# Read the hyper-parameters back from the run's config instead of the local
# dict (presumably so that values overridden by W&B are honored - confirm).
model_conf = run.config

In [None]:
# Unpack the run config into module-level names, one group per concern.

# Dataset: W&B artifact coordinates and the CSV column to train on.
dataset_artifact, dataset_version, dataset_path, text_column = (
    model_conf[key]
    for key in ("dataset_artifact", "dataset_version", "dataset_path", "text_column")
)

# Base checkpoint and the special tokens given to the tokenizer.
base_gpt_model, bos_token, eos_token, pad_token = (
    model_conf[key]
    for key in ("base_gpt_model", "bos_token", "eos_token", "pad_token")
)

# Fraction of the examples that goes into the training split.
train_split = model_conf["train_split"]

# Trainer schedule and optimisation settings.
(
    num_train_epochs,
    logging_steps,
    save_steps,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    warmup_steps,
    weight_decay,
) = (
    model_conf[key]
    for key in (
        "num_train_epochs",
        "logging_steps",
        "save_steps",
        "per_device_train_batch_size",
        "per_device_eval_batch_size",
        "warmup_steps",
        "weight_decay",
    )
)

# Evaluation cadence and the metrics to compute.
evaluation_strategy, eval_steps, evaluation_metrics = (
    model_conf[key]
    for key in ("evaluation_strategy", "eval_steps", "evaluation_metrics")
)

In [12]:
# Load the base tokenizer and register the configured special tokens on it.
tokenizer = GPT2Tokenizer.from_pretrained(
    base_gpt_model,
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
)

# Put the model on the GPU when one is present. An unconditional `.cuda()`
# raises on a CPU-only runtime, so the device is chosen at runtime instead.
model = GPT2LMHeadModel.from_pretrained(base_gpt_model).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Resize the embedding matrix to one row per tokenizer entry, so the special
# tokens added above have embeddings.
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

In [None]:
# Declare the dataset artifact as an input of this run, pick the CSV entry
# inside it, and download that entry to local disk.
dataset_reference = f"{dataset_artifact}:{dataset_version}"
netflix_dataset_art = run.use_artifact(dataset_reference).get_path(dataset_path)
netflix_dataset_path = netflix_dataset_art.download()

In [None]:
import pandas as pd

In [None]:
# Load the downloaded CSV and keep only the configured text column.
netflix_descriptions = pd.read_csv(netflix_dataset_path)[text_column]

In [None]:
# Token length of the longest training example. The length is measured on the
# same string that NetflixDataset encodes (bos_token + description +
# eos_token). Measuring the bare description would undercount by the special
# tokens, so truncation to `max_length` would cut the eos token off the
# longest examples.
max_length = max(
    len(tokenizer.encode(bos_token + description + eos_token))
    for description in netflix_descriptions
)

In [None]:
# Record the derived sequence length in the run config alongside the other settings.
run.config.update({"max_length": max_length})

In [None]:
class NetflixDataset(Dataset):
    """Pre-tokenized text examples for language-model fine-tuning.

    Every text is wrapped as ``bos_token + text + eos_token`` and passed to
    ``tokenizer`` with truncation and padding to ``max_length``. Indexing
    returns an ``(input_ids, attention_mask)`` pair of tensors.
    """

    def __init__(self, txt_list, tokenizer, bos_token, eos_token, max_length):
        """Tokenize all texts in ``txt_list`` eagerly and keep the tensors."""
        self.input_ids = []
        self.attn_masks = []
        # Declared but never populated by this class.
        self.labels = []

        for text in txt_list:
            wrapped = bos_token + text + eos_token
            encoded = tokenizer(
                wrapped,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            ids = encoded["input_ids"]
            mask = encoded["attention_mask"]
            self.input_ids.append(torch.tensor(ids))
            self.attn_masks.append(torch.tensor(mask))

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of example ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

In [None]:
# Tokenize every description up front; each example is truncated/padded to max_length.
dataset = NetflixDataset(netflix_descriptions, tokenizer, bos_token, eos_token, max_length=max_length)

In [20]:
# Number of examples in the training split; int() drops any fractional part.
train_size = int(train_split * len(dataset))

In [None]:
# Record the resulting training-set size in the run config.
run.config.update({"train_size": train_size})

In [None]:
# Randomly partition the examples: train_size for training, the remainder for validation.
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
import gc
# Force a garbage-collection pass to release unreachable Python objects before training.
gc.collect()

In [None]:
# Release GPU memory that PyTorch has cached but is not currently using.
torch.cuda.empty_cache()

In [None]:
import evaluate

In [None]:
# bleu = evaluate.load("bleu")
# google_bleu = evaluate.load("google_bleu")
# mauve = load('mauve')

In [None]:
# Bundle the metrics named in the run config into one object that computes them together.
text_gen_metrics = evaluate.combine(evaluation_metrics)

In [28]:
def compute_metrics(eval_pred):
    """Score the model's teacher-forced predictions with the text metrics.

    The BLEU-style metrics in ``text_gen_metrics`` compare strings, so the
    logits are reduced to token ids and both sides are decoded with the
    module-level ``tokenizer`` before scoring. Passing raw logits and label
    ids straight to the metrics does not match their expected input.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair as supplied by
            ``Trainer``. ``predictions`` is assumed to be the logits array of
            shape (batch, seq_len, vocab), or a tuple whose first element is
            that array; ``label_ids`` is assumed to be (batch, seq_len).

    Returns:
        The dict produced by ``text_gen_metrics.compute``.
    """
    logits, label_ids = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    predicted_ids = logits.argmax(axis=-1)

    # A causal LM's output at position i is its guess for token i + 1, so drop
    # the last prediction and the first label to line the two up.
    predicted_ids = predicted_ids[:, :-1]
    label_ids = label_ids[:, 1:].copy()  # copy: do not mutate the caller's array

    # Blank out padding and loss-ignored (-100) positions on both sides; the
    # pad token is then dropped by `skip_special_tokens` when decoding.
    pad_id = tokenizer.pad_token_id
    ignored = (label_ids == -100) | (label_ids == pad_id)
    predicted_ids[ignored] = pad_id
    label_ids[ignored] = pad_id

    predictions = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return text_gen_metrics.compute(
        predictions=predictions,
        references=[[reference] for reference in references],
    )

In [None]:
# Trainer settings, filled in from the values unpacked from the run config.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # Logging, checkpointing and evaluation cadence.
    logging_steps=logging_steps,
    save_steps=save_steps,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    # Report training metrics to Weights & Biases.
    report_to="wandb",
)

In [None]:
def collate_batch(examples):
    """Stack ``(input_ids, attention_mask)`` pairs into one model input batch.

    Labels are a copy of ``input_ids`` with every padded position (attention
    mask 0) set to -100, the index the language-model loss ignores. Without
    this masking the loss is also computed on the padding that fills each
    example up to ``max_length``.

    Args:
        examples: sequence of ``(input_ids, attention_mask)`` tensor pairs, as
            returned by ``NetflixDataset.__getitem__``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    input_ids = torch.stack([example[0] for example in examples])
    attention_mask = torch.stack([example[1] for example in examples])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# Wire model, data splits, metrics and collator into the Hugging Face Trainer.
model_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_batch,
)

In [None]:
# Run fine-tuning with the settings in training_args.
model_trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer into the same local directory.
model.save_pretrained("gpt2-netflix-model")
tokenizer.save_pretrained("gpt2-netflix-model")

In [None]:
# Metadata for the W&B artifact that will hold the saved model directory.
artifact_name = "gpt2-netflix-hf"
artifact_type = "model"
artifact_description = "GPT2 model finetuned as per this article: https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook"

In [None]:
# Create the (still empty) model artifact from the metadata above.
model_artifact = wandb.Artifact(name=artifact_name, type=artifact_type, description=artifact_description)

In [None]:
# Add the saved model directory to the artifact. The path is a plain literal;
# the former f-string prefix had no placeholders to format.
model_artifact.add_dir("gpt2-netflix-model")

In [None]:
# Log the model artifact as an output of this run.
run.log_artifact(model_artifact)

In [None]:
# Close the W&B run.
run.finish()