In [1]:
from utils import *
from utils_training import *
from utils_dataset import *
from utils_metrics import *

from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Shared training configuration, read by every experiment cell below.
#
# Conventions:
# - metrics_names[0] is the metric used to decide whether the model is saved
# - num_eval is the number of evaluations per epoch
# - the TRAIN_PATH dataset is split into a train and a validation set (ratio train_frac)
# - subsample_train_size / subsample_test_size are integers (None keeps every sample)

hyperparameters = dict(
    # --- hardware / bookkeeping ---
    device='cuda' if torch.cuda.is_available() else 'cpu',  # device for training (should be a GPU)
    metrics_names=['Accuracy', 'MatthewCorr', 'F1 Score'],  # metrics computed at each evaluation
    num_eval=10,                                            # evaluations per epoch
    model_name='bert-base-cased',                           # model to train ('bert-base-cased' by default)
    seed=torch.randint(10000, size=(1, 1)).item(),          # random default seed in [0, 10000); the sweeps overwrite it
    # --- data / optimisation ---
    dataset_name='RTE',                                     # one of 'CoLA', 'RTE', 'QNLI', 'SST-2'
    batch_size=8,                                           # training batch size
    lr=5e-4,                                                # learning rate
    TRAIN_PATH='data/RTE/train.tsv',                        # path to the training set
    TEST_PATH='data/RTE/dev.tsv',                           # path to the validation set (used as our test set)
    num_epochs=20,                                          # number of epochs
    # --- fine-tuning strategy ---
    Finetuning='Init&BitFit',                               # 'Full', 'BitFit', 'LayerNorm', 'Random', 'Init&BitFit' or 'InitBias&BitFit'
    subsample_train_size=None,                              # number of training samples (None: all of them)
    subsample_test_size=None,                               # number of test samples (None: all of them)
    max_length=512,                                         # maximum sample length (model dependent, 512 by default)
    train_frac=0.8,                                         # train/validation split ratio
    grad_masks=None,                                        # gradient masks, only for the random fine-tuning
    ratio_params=None,                                      # ratio of trained parameters, only for the random fine-tuning
)

# Full Fine-Tuning

In [None]:
# Full fine-tuning: one training run per seed, all at lr = 3e-5.
# The shared `hyperparameters` dict is updated in place before each run.
hyperparameters.update({'Finetuning': 'Full', 'lr': 3e-5})
seeds = [7504, 8224, 3266, 2691, 4717, 2674, 3866, 8820, 1288, 5237]

for seed in seeds:
    hyperparameters.update(seed=seed)
    training(hyperparameters)

# BitFit Fine-Tuning

In [None]:
# BitFit fine-tuning: sweep three learning rates, ten seeds per learning rate.
# The shared `hyperparameters` dict is updated in place before each run.
hyperparameters['Finetuning'] = 'BitFit'
lrs = [1e-4, 5e-4, 1e-3]
seeds = [7504, 8224, 3266, 2691, 4717, 2674, 3866, 8820, 1288, 5237]

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)

# LayerNorm Fine-Tuning

In [None]:
# LayerNorm fine-tuning: sweep three learning rates, ten seeds per learning rate.
# The shared `hyperparameters` dict is updated in place before each run.
hyperparameters['Finetuning'] = 'LayerNorm'
lrs = [1e-4, 5e-4, 1e-3]
seeds = [7504, 8224, 3266, 2691, 4717, 2674, 3866, 8820, 1288, 5237]

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)

# Random Fine-Tuning

In [10]:
# Random fine-tuning: every run gets a fresh gradient mask from get_grad_mask,
# built from the pretrained model and the ratio of trained parameters (0.001).
#
# Each run is handed its own copy of the shared configuration instead of
# mutating `hyperparameters` in place. Previously 'ratio_params' and the last
# 'grad_masks' stayed in the shared dict after this cell, so every cell executed
# afterwards silently inherited a stale random mask.

random_ratio_params = 0.001

seeds1 = [4500, 6731, 1433, 4120,  943, 9217, 8121, 2816,  633, 5185]
seeds2 = [7504, 8224, 3266, 2691, 4717, 2674, 3866, 8820, 1288, 5237]
lrs = [5e-4, 1e-3, 5e-3]

# Reference model handed to get_grad_mask; loaded once and reused for every run.
model_base = BertForSequenceClassification.from_pretrained(hyperparameters['model_name'])

for lr in lrs:
    # lr = 5e-4 uses seeds2 (the seed list of the other sweeps); every other lr uses seeds1.
    seeds = seeds2 if lr == 5e-4 else seeds1

    for seed in seeds:
        # NOTE(review): the mask is drawn before `seed` is handed to training();
        # if get_grad_mask relies on a global RNG, the mask is not reproducible
        # from `seed` alone - confirm and seed explicitly if needed.
        grad_masks = get_grad_mask(model_base, random_ratio_params)

        run_config = dict(
            hyperparameters,
            Finetuning='Random',
            ratio_params=random_ratio_params,
            lr=lr,
            seed=seed,
            grad_masks=grad_masks,
        )
        training(run_config)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b



Random0.001_bert-base-cased_RTE_seed7504_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed7504_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed8224_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed8224_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed3266_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed3266_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed2691_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed2691_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed4717_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed4717_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed2674_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed2674_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_RTE_seed3866_lr0.0005_epochs20:


Random0.001_bert-base-cased_RTE_seed3866_lr0.0005_epochs20 already trained




Random0.001

# InitBias + BitFit

In [None]:
# InitBias&BitFit fine-tuning for RTE: four learning rates, ten seeds each.
#
# Each run is built from a copy of the shared configuration plus the overrides
# below, so this cell does not depend on what earlier cells left in
# `hyperparameters`. In notebook order it runs right after the Random cell,
# which stores 'grad_masks' / 'ratio_params' in the shared dict; both are
# documented as random-fine-tuning-only settings and are pinned back to None.

seeds = [5271, 6789, 5892, 5078, 3581, 3706, 8757, 5910,  137, 7205]
lrs = [1e-4, 5e-4, 1e-3, 5e-3]

run_overrides = {
    'Finetuning': 'InitBias&BitFit',
    'dataset_name': 'RTE',
    'TRAIN_PATH': 'data/RTE/train.tsv',
    'TEST_PATH': 'data/RTE/dev.tsv',
    'metrics_names': ['Accuracy', 'MatthewCorr', 'F1 Score'],
    'grad_masks': None,     # random fine-tuning only
    'ratio_params': None,   # random fine-tuning only
}

for lr in lrs:
    for seed in seeds:
        run_config = {**hyperparameters, **run_overrides, 'lr': lr, 'seed': seed}
        training(run_config)

# Low Data Regime

In [None]:
# Low-data regime, Full fine-tuning: for every seed, train on progressively
# smaller training subsets at lr = 3e-5.
#
# Each run gets its own copy of the shared configuration. Previously
# 'subsample_train_size' was written into `hyperparameters` and never reset,
# so re-running any earlier cell after this one silently trained on the last
# subset size (100 samples). 'grad_masks' / 'ratio_params' (random fine-tuning
# only) are pinned to None so a previously executed Random cell cannot leak in.
#
# NOTE(review): the largest subset (2000) may exceed what is left of the RTE
# training data after the train_frac split - confirm how training() handles it.

seeds = [85, 1733, 601, 1212, 5283, 2314, 6521, 7304, 2914, 4783]
train_sizes = [2000, 1000, 500, 200, 100]
lrs = [3e-5]

for seed in seeds:
    for lr in lrs:
        for train_size in train_sizes:
            run_config = dict(
                hyperparameters,
                Finetuning='Full',
                seed=seed,
                lr=lr,
                subsample_train_size=train_size,
                grad_masks=None,
                ratio_params=None,
            )
            training(run_config)

In [None]:
# Low-data regime, BitFit fine-tuning: for every seed and learning rate, train
# on progressively smaller training subsets.
#
# Each run gets its own copy of the shared configuration. Previously
# 'subsample_train_size' was written into `hyperparameters` and never reset,
# so re-running any earlier cell after this one silently trained on the last
# subset size (100 samples). 'grad_masks' / 'ratio_params' (random fine-tuning
# only) are pinned to None so a previously executed Random cell cannot leak in.

seeds = [85, 1733, 601, 1212, 5283, 2314, 6521, 7304, 2914, 4783]
train_sizes = [2000, 1000, 500, 200, 100]
lrs = [1e-4, 5e-4, 1e-3]

for seed in seeds:
    for lr in lrs:
        for train_size in train_sizes:
            run_config = dict(
                hyperparameters,
                Finetuning='BitFit',
                seed=seed,
                lr=lr,
                subsample_train_size=train_size,
                grad_masks=None,
                ratio_params=None,
            )
            training(run_config)

In [14]:
# Low-data regime, LayerNorm fine-tuning: for every seed and learning rate,
# train on progressively smaller training subsets.
#
# Each run gets its own copy of the shared configuration. Previously
# 'subsample_train_size' was written into `hyperparameters` and never reset,
# so re-running any earlier cell after this one silently trained on the last
# subset size (100 samples). 'grad_masks' / 'ratio_params' (random fine-tuning
# only) are pinned to None so a previously executed Random cell cannot leak in.

seeds = [85, 1733, 601, 1212, 5283, 2314, 6521, 7304, 2914, 4783]
train_sizes = [2000, 1000, 500, 200, 100]
lrs = [1e-4, 5e-4, 1e-3]

for seed in seeds:
    for lr in lrs:
        for train_size in train_sizes:
            run_config = dict(
                hyperparameters,
                Finetuning='LayerNorm',
                seed=seed,
                lr=lr,
                subsample_train_size=train_size,
                grad_masks=None,
                ratio_params=None,
            )
            training(run_config)



LayerNorm_Size2000_bert-base-cased_RTE_seed85_lr0.0001_epochs20:


LayerNorm_Size2000_bert-base-cased_RTE_seed85_lr0.0001_epochs20 already trained




LayerNorm_Size1000_bert-base-cased_RTE_seed85_lr0.0001_epochs20:


LayerNorm_Size1000_bert-base-cased_RTE_seed85_lr0.0001_epochs20 already trained




LayerNorm_Size500_bert-base-cased_RTE_seed85_lr0.0001_epochs20:


LayerNorm_Size500_bert-base-cased_RTE_seed85_lr0.0001_epochs20 already trained




LayerNorm_Size200_bert-base-cased_RTE_seed85_lr0.0001_epochs20:


LayerNorm_Size200_bert-base-cased_RTE_seed85_lr0.0001_epochs20 already trained




LayerNorm_Size100_bert-base-cased_RTE_seed85_lr0.0001_epochs20:


LayerNorm_Size100_bert-base-cased_RTE_seed85_lr0.0001_epochs20 already trained




LayerNorm_Size2000_bert-base-cased_RTE_seed85_lr0.0005_epochs20:


LayerNorm_Size2000_bert-base-cased_RTE_seed85_lr0.0005_epochs20 already trained




LayerNorm_Size1000_bert-base-cased_RTE_seed85_lr0.0005_epochs20:


LayerNorm_Size1000_bert-base-ca