In [None]:
%load_ext autoreload
%autoreload 2

from transformers import AutoTokenizer
import pytorch_lightning as pl
import torch.nn.functional as F
import pandas as pd
import torch
from collections import Counter
from sklearn.metrics import classification_report
import numpy as np
from dont_patronize_me import DontPatronizeMe
from sklearn.model_selection import KFold
from models.tuner import *
from models.model_pool import *
import gc

from torch.utils.data import Dataset, DataLoader, Subset
from models.utils import *

## Hyperparameter tuning

In [3]:
def cross_validate_once(
    pretrained_model_name,
    classifier_dim,
    lr,
    categorical,
    tokenizer_max_len,
    gradient_clip_val,
    target_label,
    batch_size,
    dropout,
    num_classes,
    epochs=None,
    patience=None,
    n_splits=3,
    random_state=42,
):
    """Score one hyperparameter configuration with K-fold cross-validation.

    For every fold a fresh ``TransformerModel`` is built, trained through
    ``ModelTuner.train_once`` on the downsampled training split, and the
    ``"f1"`` entry of the returned metrics is recorded.

    Args:
        pretrained_model_name: Name passed to ``AutoTokenizer.from_pretrained``
            and to ``TransformerModel``.
        classifier_dim: Forwarded to ``TransformerModel(classifier_dim=...)``.
        lr: Forwarded to ``TransformerModel(lr=...)``.
        categorical: Forwarded to ``TransformerModel(add_categorical=...)``.
        tokenizer_max_len: Forwarded to ``train_once(tokenizer_max_len=...)``.
        gradient_clip_val: Forwarded to ``train_once(gradient_clip_val=...)``.
        target_label: Passed to the ``ModelTuner`` constructor.
        batch_size: Forwarded to ``train_once(batch_size=...)``.
        dropout: Forwarded to ``TransformerModel(dropout=...)``.
        num_classes: Forwarded to ``TransformerModel(num_classes=...)``.
        epochs: Forwarded to ``train_once(epochs=...)``. When ``None`` the
            notebook-level ``epochs`` variable is used (the original behaviour).
        patience: Forwarded to ``train_once(patience=...)``. When ``None`` the
            notebook-level ``patience`` variable is used (the original behaviour).
        n_splits: Number of cross-validation folds.
        random_state: Seed for the fold shuffle. A fixed default means every
            configuration is scored on the same folds, so scores are
            comparable between runs. Pass ``None`` for unseeded shuffling.

    Returns:
        list: ``metrics["f1"]`` for each fold, in fold order.

    Raises:
        NameError: If ``epochs`` / ``patience`` is not given and no
            notebook-level variable of that name exists.
    """
    # Resolve the training-length settings up front so a missing notebook
    # variable fails before any model is downloaded or trained.
    notebook_scope = globals()
    if epochs is None:
        if "epochs" not in notebook_scope:
            raise NameError(
                "pass `epochs` or define a notebook-level `epochs` variable"
            )
        epochs = notebook_scope["epochs"]
    if patience is None:
        if "patience" not in notebook_scope:
            raise NameError(
                "pass `patience` or define a notebook-level `patience` variable"
            )
        patience = notebook_scope["patience"]

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    tuner = ModelTuner(target_label)
    # The second value returned by load_data() is not used during cross-validation.
    full_df, _ = tuner.load_data()

    score_ls = []

    splitter = KFold(n_splits, shuffle=True, random_state=random_state)
    for train_idx, val_idx in splitter.split(full_df):
        # Only the training split is downsampled; the validation split is
        # used as loaded.
        train_df = tuner.downsample_data(full_df.iloc[train_idx])
        val_df = full_df.iloc[val_idx]
        model = TransformerModel(
            pretrained_model_name,
            add_categorical=categorical,
            num_classes=num_classes,
            lr=lr,
            classifier_dim=classifier_dim,
            dropout=dropout,
        )
        trainer, metrics = tuner.train_once(
            train_df,
            epochs=epochs,
            patience=patience,
            tokenizer=tokenizer,
            model=model,
            batch_size=batch_size,
            gradient_clip_val=gradient_clip_val,
            tokenizer_max_len=tokenizer_max_len,
            val_df=val_df,
        )
        score_ls.append(metrics["f1"])

        # Drop the references, collect cycles, then release cached CUDA
        # blocks: empty_cache() can only return memory that is already free.
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()

    return score_ls

In [2]:
# Training-length settings shared by every search run below.
epochs, patience = 10, 2

# Full candidate grid; keys are ordered by performance impact.
hparams = dict(
    model_arch=[
        ["distilbert-base-uncased", 768],
        ["distilbert-base-cased", 768],
        ["roberta-base", 768],
    ],
    tokenizer_max_len=[16, 32, 64, 128, 256],
    lr=[1e-6, 1e-5, 1e-4, 1e-3],
    batch_size=[32, 16, 8],
    target_label=[["label", 2], ["orig_label", 5]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

### Search for best architecture

best: ['distilbert-base-uncased', 768]

In [None]:
%%capture
for pretrained_model_name, classifier_dim in hparams["model_arch"]:
    # pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best tokenizer max length

best: 128

In [None]:
# Grid for the tokenizer-length sweep: the backbone is pinned to
# 'distilbert-base-uncased'; all other entries still list every candidate.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[16, 32, 64, 128, 256],
    lr=[1e-6, 1e-5, 1e-4, 1e-3],
    batch_size=[32, 16, 8],
    target_label=[["label", 2], ["orig_label", 5]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for tokenizer_max_len in hparams["tokenizer_max_len"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": tokenizer_max_len,
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best learning rate

best: 1e-4

In [None]:
# Grid for the learning-rate sweep: backbone and tokenizer length (128) are
# pinned; the remaining entries still list every candidate.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-6, 1e-5, 1e-4, 1e-3],
    batch_size=[32, 16, 8],
    target_label=[["label", 2], ["orig_label", 5]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for lr in hparams["lr"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": lr,
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best batch size

best: 32

In [None]:
# Grid for the batch-size sweep: backbone, tokenizer length and learning rate
# (1e-4) are pinned; the remaining entries still list every candidate.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32, 16, 8],
    target_label=[["label", 2], ["orig_label", 5]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for batch_size in hparams["batch_size"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": batch_size,
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best label strategy

best: use binary labels

In [None]:
# Grid for the label-strategy sweep: backbone, tokenizer length, learning rate
# and batch size (32) are pinned; the remaining entries list every candidate.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32],
    target_label=[["label", 2], ["orig_label", 5]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for target_label, num_classes in hparams["target_label"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    # target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search whether adding categorical features is useful

best: do not use categorical features

In [None]:
# Grid for the categorical-feature sweep: the label strategy is now pinned to
# ['label', 2] as well; the remaining entries still list every candidate.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32],
    target_label=[["label", 2]],
    categorical=[True, False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for categorical in hparams["categorical"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": categorical,
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best dropout

best: 0.3

In [None]:
# Grid for the dropout sweep: categorical features are now pinned to False;
# only dropout and gradient clipping still list several candidates.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32],
    target_label=[["label", 2]],
    categorical=[False],
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for dropout in hparams["dropout"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": hparams['gradient_clip_val'][0],
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": dropout,
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

### Search for best gradient clip value

best: 100

In [None]:
# Grid for the gradient-clipping sweep: dropout is now pinned to 0.3, so
# gradient_clip_val is the only entry with several candidates.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32],
    target_label=[["label", 2]],
    categorical=[False],
    dropout=[0.3],
    gradient_clip_val=[0.1, 1, 10, 100, 0.01],
)

In [None]:
%%capture
for gradient_clip_val in hparams["gradient_clip_val"]:
    pretrained_model_name, classifier_dim = hparams["model_arch"][0]
    target_label, num_classes = hparams['target_label'][0]
    config = {
        "pretrained_model_name": pretrained_model_name,
        "classifier_dim": classifier_dim,
        "lr": hparams['lr'][0],
        "categorical": hparams['categorical'][0],
        "tokenizer_max_len": hparams['tokenizer_max_len'][0],
        "gradient_clip_val": gradient_clip_val,
        "target_label": target_label,
        "batch_size": hparams['batch_size'][0],
        "dropout": hparams['dropout'][0],
        "num_classes": num_classes,
    }
    score_ls = cross_validate_once(**config)
    with open('parameter_search.txt', 'a') as f:
        f.write(f'{np.mean(score_ls)} {str(score_ls)} {str(config)}\n')

In [None]:
# Final configuration: every entry holds a single value, with
# gradient_clip_val now pinned to 100.
hparams = dict(
    model_arch=[["distilbert-base-uncased", 768]],
    tokenizer_max_len=[128],
    lr=[1e-4],
    batch_size=[32],
    target_label=[["label", 2]],
    categorical=[False],
    dropout=[0.3],
    gradient_clip_val=[100],
)