In [1]:
from utils import *
from utils_training import *
from utils_dataset import *
from utils_metrics import *

In [2]:
# Training configuration: every run below is driven by this dictionary.
#
# Conventions:
# - The first entry of `metrics_names` is the metric used to decide whether to save the model.
# - `num_eval` is the number of evaluations per epoch.
# - The TRAIN_PATH dataset is divided into a train and a validation set (split ratio `train_frac`).
# - `subsample_train_size` / `subsample_test_size` are integers (None means "use every sample").

hyperparameters = dict(
    # Device for training (should be a GPU).
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Metrics on which the evaluations are done.
    metrics_names=['Accuracy', 'MatthewCorr', 'F1 Score'],
    # Number of evaluations per epoch.
    num_eval=10,
    # Model to train (by default should be 'bert-base-cased').
    model_name='bert-base-cased',
    # Seed for training: a random integer in [0, 10000) unless a cell overrides it.
    seed=torch.randint(10000, size=(1,)).item(),
    # Name of the dataset ('CoLA', 'RTE', 'QNLI', 'SST-2').
    dataset_name='SST-2',
    # Training batch size.
    batch_size=8,
    # Learning rate.
    lr=5e-4,
    # Path to the training set.
    TRAIN_PATH='data/SST-2/train.tsv',
    # Path to the validation set (used as our test set).
    TEST_PATH='data/SST-2/dev.tsv',
    # Number of epochs.
    num_epochs=20,
    # Type of fine-tuning ('Full', 'BitFit', 'LayerNorm', 'Random', 'Init&BitFit', 'InitBias&BitFit').
    Finetuning='Full',
    # Number of training samples; if None all the samples are used.
    subsample_train_size=None,
    # Number of test samples; if None all the samples are used.
    subsample_test_size=None,
    # Maximum length of a sample (should depend on the model, 512 by default).
    max_length=512,
    # Train/validation split ratio.
    train_frac=0.8,
    # Gradient masks for the random fine-tuning.
    grad_masks=None,
    # Ratio of parameters trained for the random fine-tuning.
    ratio_params=None,
)

# Full Fine-Tuning

In [9]:
# Full fine-tuning on SST-2: one run per seed at a fixed learning rate.

seeds = [2178, 3514, 770, 4179, 3813]

# Set every method-specific key explicitly so this cell does not depend on
# which cells were executed before it. `grad_masks` and `ratio_params` are
# described in the config cell as Random-only settings, and the Random cell
# leaves them populated in the shared dict; reset them to their defaults here.
hyperparameters.update(
    Finetuning='Full',
    lr=3e-5,
    grad_masks=None,
    ratio_params=None,
)

for seed in seeds:
    hyperparameters['seed'] = seed
    training(hyperparameters)



Full_bert-base-cased_SST-2_seed2178_lr3e-05_epochs20:


Full_bert-base-cased_SST-2_seed2178_lr3e-05_epochs20 already trained




Full_bert-base-cased_SST-2_seed3514_lr3e-05_epochs20:


Full_bert-base-cased_SST-2_seed3514_lr3e-05_epochs20 already trained




Full_bert-base-cased_SST-2_seed770_lr3e-05_epochs20:


Full_bert-base-cased_SST-2_seed770_lr3e-05_epochs20 already trained




Full_bert-base-cased_SST-2_seed4179_lr3e-05_epochs20:


Full_bert-base-cased_SST-2_seed4179_lr3e-05_epochs20 already trained




Full_bert-base-cased_SST-2_seed3813_lr3e-05_epochs20:


Full_bert-base-cased_SST-2_seed3813_lr3e-05_epochs20 already trained




# BitFit Fine-Tuning

In [10]:
# BitFit fine-tuning on SST-2: learning-rate sweep, one run per (lr, seed) pair.

seeds = [2178, 3514, 770, 4179, 3813]
lrs = [1e-4, 5e-4, 1e-3]

# Set every method-specific key explicitly so this cell does not depend on
# which cells were executed before it. `grad_masks` and `ratio_params` are
# described in the config cell as Random-only settings, and the Random cell
# leaves them populated in the shared dict; reset them to their defaults here.
hyperparameters.update(
    Finetuning='BitFit',
    grad_masks=None,
    ratio_params=None,
)

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)



BitFit_bert-base-cased_SST-2_seed2178_lr0.0001_epochs20:


BitFit_bert-base-cased_SST-2_seed2178_lr0.0001_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed3514_lr0.0001_epochs20:


BitFit_bert-base-cased_SST-2_seed3514_lr0.0001_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed770_lr0.0001_epochs20:


BitFit_bert-base-cased_SST-2_seed770_lr0.0001_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed4179_lr0.0001_epochs20:


BitFit_bert-base-cased_SST-2_seed4179_lr0.0001_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed3813_lr0.0001_epochs20:


BitFit_bert-base-cased_SST-2_seed3813_lr0.0001_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed2178_lr0.0005_epochs20:


BitFit_bert-base-cased_SST-2_seed2178_lr0.0005_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed3514_lr0.0005_epochs20:


BitFit_bert-base-cased_SST-2_seed3514_lr0.0005_epochs20 already trained




BitFit_bert-base-cased_SST-2_seed770_lr0.0005_epochs20:

# LayerNorm Fine-Tuning

In [11]:
# LayerNorm fine-tuning on SST-2: learning-rate sweep, one run per (lr, seed) pair.

seeds = [2178, 3514, 770, 4179, 3813]
lrs = [1e-4, 5e-4, 1e-3]

# Set every method-specific key explicitly so this cell does not depend on
# which cells were executed before it. `grad_masks` and `ratio_params` are
# described in the config cell as Random-only settings, and the Random cell
# leaves them populated in the shared dict; reset them to their defaults here.
hyperparameters.update(
    Finetuning='LayerNorm',
    grad_masks=None,
    ratio_params=None,
)

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)



LayerNorm_bert-base-cased_SST-2_seed2178_lr0.0001_epochs20:


LayerNorm_bert-base-cased_SST-2_seed2178_lr0.0001_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed3514_lr0.0001_epochs20:


LayerNorm_bert-base-cased_SST-2_seed3514_lr0.0001_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed770_lr0.0001_epochs20:


LayerNorm_bert-base-cased_SST-2_seed770_lr0.0001_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed4179_lr0.0001_epochs20:


LayerNorm_bert-base-cased_SST-2_seed4179_lr0.0001_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed3813_lr0.0001_epochs20:


LayerNorm_bert-base-cased_SST-2_seed3813_lr0.0001_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed2178_lr0.0005_epochs20:


LayerNorm_bert-base-cased_SST-2_seed2178_lr0.0005_epochs20 already trained




LayerNorm_bert-base-cased_SST-2_seed3514_lr0.0005_epochs20:


LayerNorm_bert-base-cased_SST-2_seed3514_lr0.0005_epochs20 already trained




LayerNorm_ber

# Random Fine-Tuning

In [7]:
# Random fine-tuning on SST-2: learning-rate sweep, one run per (lr, seed) pair,
# with the trained-parameter ratio fixed to 0.001.
hyperparameters.update(Finetuning='Random', ratio_params=0.001)

seeds = [2957, 741, 7633, 8251, 1100]
lrs = [5e-4, 1e-3, 5e-3]

# Reference model handed to get_grad_mask; loaded once and shared by all runs.
model_base = BertForSequenceClassification.from_pretrained(hyperparameters['model_name'])

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(lr=lr, seed=seed)

        # The mask is rebuilt for every (lr, seed) pair.
        # NOTE(review): it is built before training() receives the seed, and
        # whether get_grad_mask seeds its own RNG is not visible from this
        # notebook -- confirm the masks are reproducible across sessions.
        grad_masks = get_grad_mask(model_base, hyperparameters['ratio_params'])
        hyperparameters['grad_masks'] = grad_masks

        training(hyperparameters)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b



Random0.001_bert-base-cased_SST-2_seed2957_lr0.0005_epochs20:


Random0.001_bert-base-cased_SST-2_seed2957_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed741_lr0.0005_epochs20:


Random0.001_bert-base-cased_SST-2_seed741_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed7633_lr0.0005_epochs20:


Random0.001_bert-base-cased_SST-2_seed7633_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed8251_lr0.0005_epochs20:


Random0.001_bert-base-cased_SST-2_seed8251_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed1100_lr0.0005_epochs20:


Random0.001_bert-base-cased_SST-2_seed1100_lr0.0005_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed2957_lr0.001_epochs20:


Random0.001_bert-base-cased_SST-2_seed2957_lr0.001_epochs20 already trained




Random0.001_bert-base-cased_SST-2_seed741_lr0.001_epochs20:


Random0.001_bert-base-cased_SST-2_seed741_lr0.001_epochs20 already tra

# InitBias+BitFit Fine-Tuning

In [12]:
# InitBias&BitFit fine-tuning on SST-2 (the dataset configured in
# `hyperparameters`): learning-rate sweep, one run per (lr, seed) pair.

seeds = [5271, 6789, 5892, 5078, 3581]
lrs = [1e-4, 5e-4, 1e-3, 5e-3]

# Set every method-specific key explicitly so this cell does not depend on
# which cells were executed before it. `grad_masks` and `ratio_params` are
# described in the config cell as Random-only settings, and the Random cell
# leaves them populated in the shared dict; reset them to their defaults here.
hyperparameters.update(
    Finetuning='InitBias&BitFit',
    grad_masks=None,
    ratio_params=None,
)

for lr in lrs:
    for seed in seeds:
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)



InitBias_BitFit_bert-base-cased_SST-2_seed5271_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed5271_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed6789_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed6789_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed5892_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed5892_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed5078_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed5078_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed3581_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed3581_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed5271_lr0.0005_epochs20:


InitBias_BitFit_bert-base-cased_SST-2_seed5271_lr0.0005_epochs20 already trained




InitBias_BitFit_bert-base-cased_SST-2_seed6789_lr0.0005_epochs20:


InitBias_Bit