# Train Models


Based on https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb#scrollTo=wvRHDkCIS91f and https://colab.research.google.com/drive/1d4xNsZbDSZ5ZqXgZjy7HyTVRLBJBVsh6#scrollTo=SDVQ04fGRb1v

## Set-up environment

Let's first install the required libraries:
* HuggingFace Transformers (for the CodeT5 model)
* SentencePiece (tokenizer backend needed by the T5 tokenizer)
* PyTorch Lightning (for training)
* TensorBoard (for logging training metrics)
* Project code from a GitHub repo (for loading the dataset + preprocessing it)

In [None]:
!pip install -q transformers sentencepiece pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
from datetime import datetime
from pathlib import Path
import sys
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from tqdm.auto import tqdm

from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)

from lib.data.dataset import ComplexUtteranceCodeDataset
from lib.data.utils import (
    get_dataset_args,
    load_data,
    split_dataset_train_test,
    
)

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Google colab config

In [None]:
%%bash
# Start from a clean checkout. -f keeps the first run (when no clone exists yet)
# from printing a "No such file or directory" error.
rm -rf /content/complex-utterance-to-code
git clone https://github.com/asafam/complex-utterance-to-code.git /content/complex-utterance-to-code
ls /content/

complex-utterance-to-code
sample_data


rm: cannot remove '/content/complex-utterance-to-code': No such file or directory
Cloning into '/content/complex-utterance-to-code'...


In [None]:
# Directories of the cloned repository that must be importable from this notebook.
paths = [
    '/content/complex-utterance-to-code',
    '/content/complex-utterance-to-code/notebooks',
    '/content/complex-utterance-to-code/src',
    '/content/complex-utterance-to-code/src/api/v6',
]

# Append each directory once; entries are compared in normalised form so that
# re-running the cell does not add duplicates.
for path in map(os.path.normcase, paths):
    already_present = path in (os.path.normcase(entry) for entry in sys.path)
    if not already_present:
        sys.path.append(path)

In [None]:
from google.colab import drive

# Mount point for Google Drive and the project folder that lives inside it.
WORK_DRIVE = '/content/drive'
WORK_AREA = f'{WORK_DRIVE}/MyDrive/university/masters/complex_utterances_semantic_parsing/notebooks'

# Mount Drive, then make the project folder the working directory so that the
# relative paths used by later cells (data/, experiments/, checkpoints) resolve there.
drive.mount(WORK_DRIVE)
os.chdir(WORK_AREA)

Mounted at /content/drive


In [None]:
# Long listing of the data directory (including hidden entries), newest first.
!ls -lta data

total 46747
-rw------- 1 root root 14055743 May 19 10:19 train_complex_utterance_to_code_with_intermediate_40k.csv.gz
-rw------- 1 root root    26899 May 19 01:35 eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz
-rw------- 1 root root 11850462 May 13 10:16 train_3domains_complex_utterance_to_code_with_intermediate_40k.csv.gz
-rw------- 1 root root    23698 May 12 00:22 eval_complex_utterance_to_code_with_intermediate_82_20230511.csv.gz
-rw------- 1 root root    22366 May 11 19:27 eval_complex_utterance_to_code_with_intermediate_78_20230511.csv.gz
-rw------- 1 root root    21239 May 10 14:13 eval_complex_utterance_to_code_with_intermediate_61_20230509.csv.gz
-rw------- 1 root root 10481850 May  4 12:49 train_complex_utterance_to_code_with_intermediate_30k.csv.gz
-rw------- 1 root root  1484529 Apr 16 07:42 train_complex_utterance_to_code_with_intermediate_10k_20230414.csv.gz
-rw------- 1 root root    17183 Apr 15 21:31 eval_complex_utterance_to_code_with_intermediate_

## Model configuration code

In [None]:
def load_tokenizer(pretrained_model_name_or_path):
    """Load the tokenizer that matches a pretrained checkpoint.

    CodeT5 / CodeT5+ checkpoints use a RoBERTa-style BPE vocabulary, whereas the
    original T5 checkpoints (e.g. ``t5-base``) use a SentencePiece vocabulary.
    The tokenizer class is therefore picked from the checkpoint name: names that
    contain ``t5`` but not ``codet5`` get ``T5Tokenizer``; everything else keeps
    the previous behaviour and gets ``RobertaTokenizer``.

    Args:
        pretrained_model_name_or_path: Hub model id or local directory.

    Returns:
        The tokenizer instance returned by ``from_pretrained``.
    """
    print(f"Loading tokenizer from {pretrained_model_name_or_path}")
    # Case-insensitive match so that ids such as "Salesforce/CodeT5-base" are
    # still recognised as CodeT5.
    name = str(pretrained_model_name_or_path).lower()
    is_plain_t5 = "t5" in name and "codet5" not in name
    tokenizer_cls = T5Tokenizer if is_plain_t5 else RobertaTokenizer
    tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name_or_path)
    return tokenizer


def load_model(pretrained_model_name_or_path):
    """Instantiate a ``T5ForConditionalGeneration`` from a hub id or local path.

    Prints the source being loaded and returns the model produced by
    ``from_pretrained``.
    """
    print(f"Loading model from {pretrained_model_name_or_path}")
    return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)

In [None]:
from enum import Enum

class ModelFlavour(Enum):
    """Input-to-target task variants; each member keys an entry in ``model_flavour_params``."""
    # Naming is <input>2<target>. "Rep" presumably stands for the intermediate
    # representation stored alongside text and code — TODO confirm.
    Text2Code = "text2code"
    Text2Rep = "text2rep"
    Rep2Code = "rep2code"
    Rep2Rep = "rep2rep"
    TextRep2Rep = "textrep2rep"
    TextRep2Code = "textrep2code"


class Model(Enum):
    """Supported base checkpoints; the value is the short id used in run and directory names."""
    # Each member is mapped to its full pretrained name in `pretrained_model_names_mapping`.
    T5Base = "t5-base"
    CodeT5Small = "codet5-small"
    CodeT5Base = "codet5-base"
    CodeT5P220m = "codet5p-220m"

# Per-flavour settings, keyed by ModelFlavour:
#   slug          - short tag embedded in experiment / log names
#   input_prefix  - prefix string for the model input (presumably prepended to
#                   every source sequence — confirm against get_dataset_args)
#   input_label   - name of the source field (presumably a dataset column — TODO confirm)
#   target_label  - name of the target field (presumably a dataset column — TODO confirm)
# NOTE(review): the TextRep2Rep slug is "text_rep2rep", unlike its enum value
# "textrep2rep" and the sibling slug "textrep2code". It is left untouched here
# because the slug feeds run and directory names.
model_flavour_params = {
    ModelFlavour.Text2Code: {
        "slug": "text2code",
        "input_prefix": "text to code: ",
        "input_label": "text",
        "target_label": "code",
    },
    ModelFlavour.Text2Rep: {
        "slug": "text2rep",
        "input_prefix": "text to rep: ",
        "input_label": "text",
        "target_label": "code_rep",
    },
    ModelFlavour.Rep2Code: {
        "slug": "rep2code",
        "input_prefix": "rep to code: ",
        "input_label": "lang_rep",
        "target_label": "code",
    },
    ModelFlavour.Rep2Rep: {
        "slug": "rep2rep",
        "input_prefix": "rep to rep: ",
        "input_label": "lang_rep",
        "target_label": "code_rep",
    },
    ModelFlavour.TextRep2Rep: {
        "slug": "text_rep2rep",
        "input_prefix": "text and rep to rep: ",
        "input_label": "text_lang_rep",
        "target_label": "code_rep",
    },
    ModelFlavour.TextRep2Code: {
        "slug": "textrep2code",
        "input_prefix": "text and rep to code: ",
        "input_label": "text_lang_rep",
        "target_label": "code",
    },
}


# Pretrained model name passed to `from_pretrained` for each supported `Model`.
pretrained_model_names_mapping = {
    Model.T5Base: "t5-base",
    Model.CodeT5Small: "Salesforce/codet5-small",
    Model.CodeT5Base: "Salesforce/codet5-base",
    Model.CodeT5P220m: "Salesforce/codet5p-220m",
}

## Training

### Model flavour selection

In [None]:
# --- Knobs to edit per experiment -------------------------------------------
MODEL = Model.CodeT5P220m
SELECTED_MODEL_FLAVOUR = ModelFlavour.Rep2Rep

# --- Values derived from the selection above --------------------------------
model_id = MODEL.value
pretrained_model_name_or_path = pretrained_model_names_mapping[MODEL]
selected_model_flavour_params = model_flavour_params[SELECTED_MODEL_FLAVOUR]
slug = selected_model_flavour_params.get("slug")
# One timestamp per kernel session; it is reused in the log name and output directory.
timestamp_str = f"{datetime.now():%Y-%m-%d_%H%M%S}"

### Loading the dataset

In [None]:
# Read the full training set, then hold out 20% of the rows for validation.
df = load_data(file_path='data/train_complex_utterance_to_code_with_intermediate_40k.csv.gz')
val_size = int(df.shape[0]*0.2)
train_df, val_df = split_dataset_train_test(df, test_size=val_size, random_state=42)

# Report the shape of each split.
for split_name, split_df in (("train_df", train_df), ("val_df", val_df)):
    print(split_name, split_df.shape)

In [None]:
# Tokenizer for the selected checkpoint; it is passed to get_dataset_args for both datasets below.
tokenizer = load_tokenizer(pretrained_model_name_or_path)

In [None]:
# Dataset construction arguments shared by the train and validation datasets.
dataset_args = get_dataset_args(tokenizer=tokenizer, selected_model_flavour_params=selected_model_flavour_params)

train_dataset = ComplexUtteranceCodeDataset(data=train_df, **dataset_args)
val_dataset = ComplexUtteranceCodeDataset(data=val_df, **dataset_args)

# Never request more worker processes than the runtime has CPUs. Typical Colab
# machines have far fewer than 12 cores, and oversubscribing them makes
# PyTorch warn and slows data loading down.
num_workers = min(12, os.cpu_count() or 1)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=4, num_workers=num_workers)

### Refitting the model

In [None]:
tensorboard_dir = f"./experiments/logs"
Path(tensorboard_dir).mkdir(parents=True, exist_ok=True)
print("Logging to ", tensorboard_dir)

# Load the TensorBoard notebook extension
%reload_ext tensorboard
%tensorboard --logdir ./experiments/logs

name = f"{model_id}-{slug}-tr-{train_df.shape[0]}k-{timestamp_str}"
logger = TensorBoardLogger(tensorboard_dir, name=name)
print(f"Now {tensorboard_dir} is the name of the saving directory and this logging will have the name as {name}")

In [None]:
# NOTE(review): load_model returns a bare T5ForConditionalGeneration, yet the
# later cells hand `model` to trainer.fit (which requires a LightningModule)
# and look for the wrapped network under `model.model`. A LightningModule
# wrapper appears to be missing here — TODO confirm and restore it.
model = load_model(pretrained_model_name_or_path)

In [None]:
# Early stopping reference: https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
# Stop training once `validation_loss` has not improved for 3 checks in a row.
# strict=False: do not crash if the monitored metric is missing.
early_stop_callback = EarlyStopping(
    monitor='validation_loss', mode='min', patience=3, strict=False, verbose=False
)

# Track the learning rate at every step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Keep only the single best checkpoint (lowest `validation_loss`) under ./checkpoints.
checkpoint_callback = ModelCheckpoint(
    monitor="validation_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="best-checkpoint",
    verbose=True,
)

# Trainer configuration, kept in a dict so it can be inspected before use.
trainer_args = {
    "devices": 1,
    "accelerator": "auto",
    "default_root_dir": f"./experiments/{model_id}-{slug}-{timestamp_str}",
    "logger": logger,
    "callbacks": [checkpoint_callback, early_stop_callback, lr_monitor],
    "max_epochs": 20,
}

trainer = Trainer(**trainer_args)

In [None]:
# Trade a little float32 matmul precision for speed ('medium' or 'high').
torch.set_float32_matmul_precision('medium') # 'medium|high'
# Allocator setting for PyTorch's CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

# Pass the dataloaders built earlier explicitly; without them the trainer can
# only train if the module itself provides its dataloaders.
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

In [None]:
# Save under this run's experiment directory (relative to the working directory); change as needed.
save_directory = f"./experiments/{model_id}-{slug}-{timestamp_str}/"
Path(save_directory).mkdir(parents=True, exist_ok=True)

# `model` may be a wrapper exposing the Hugging Face network as `.model`, or
# the Hugging Face model itself; unwrap only when the attribute exists so that
# saving works in both cases.
hf_model = getattr(model, "model", model)
hf_model.save_pretrained(save_directory)

print(f"Saving model to {save_directory}")