In [1]:
from utils import *
from utils_training import *
from utils_dataset import *
from utils_metrics import *

In [2]:
# Training configuration. Every run below is driven by this dictionary.
#
# Conventions:
# - The first entry of 'metrics_names' is the metric used to decide whether to save the model
# - 'num_eval' is the number of evaluations per epoch
# - The dataset at 'TRAIN_PATH' is divided into a train and a validation set (split ratio 'train_frac')
# - 'subsample_train_size' / 'subsample_test_size' are integers (None means: use all the samples)

hyperparameters = dict(
    # device for training (should be GPU)
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Metrics on which to do the evaluations
    metrics_names=['MatthewCorr', 'Accuracy', 'F1 Score'],
    # Number of evaluations per epoch
    num_eval=10,
    # Model to train (by default should be 'bert-base-cased')
    model_name='bert-base-cased',
    # Seed for training (random default, overwritten by each experiment cell)
    seed=torch.randint(10000, size = (1,1)).squeeze(0).item(),
    # Name of the dataset ('CoLA', 'RTE', 'QNLI', 'SST-2')
    dataset_name='CoLA',
    # Training batch size
    batch_size=8,
    # Learning rate
    lr=5e-4,
    # Path to training set
    TRAIN_PATH='data/CoLA/train.tsv',
    # Path to validation set (will be our test set)
    TEST_PATH='data/CoLA/dev.tsv',
    # Number of epochs
    num_epochs=20,
    # Type of Fine-Tuning ('Full', 'BitFit', 'LayerNorm', 'Random', 'Init&BitFit', 'InitBias&BitFit')
    Finetuning='Init&BitFit',
    # Number of training samples, if None all the samples are used
    subsample_train_size=None,
    # Number of test samples, if None all the samples are used
    subsample_test_size=None,
    # Maximum length of a sample (should depend on the model, 512 by default)
    max_length=512,
    # train/validation split ratio
    train_frac=0.8,
    # gradient masks for the random Fine-Tuning
    grad_masks=None,
    # Ratio of parameters trained for random fine-tuning
    ratio_params=None,
)

# Full Fine-Tuning

In [None]:
# Full fine-tuning for CoLA: one learning rate, repeated over ten seeds
seeds = [5542, 3568, 6396, 5225, 3583, 6066, 6112, 8083, 4472, 6081]

# Both settings are the same for every seed, so they are set once up front
hyperparameters.update({'Finetuning': 'Full', 'lr': 3e-5})

for seed in seeds:
    hyperparameters['seed'] = seed
    training(hyperparameters)

# BitFit Fine-Tuning

In [None]:
# BitFit fine-tuning: every learning rate is run with every seed
seeds = [5542, 3568, 6396, 5225, 3583, 6066, 6112, 8083, 4472, 6081]
lrs = [1e-4, 5e-4, 1e-3]
hyperparameters['Finetuning'] = 'BitFit'

for lr in lrs:
    # The learning rate only changes with the outer loop
    hyperparameters['lr'] = lr
    for seed in seeds:
        hyperparameters['seed'] = seed
        training(hyperparameters)

# LayerNorm Fine-Tuning

In [None]:
# LayerNorm fine-tuning: every learning rate is run with every seed
seeds = [5542, 3568, 6396, 5225, 3583, 6066, 6112, 8083, 4472, 6081]
lrs = [1e-4, 5e-4, 1e-3]
hyperparameters['Finetuning'] = 'LayerNorm'

for lr in lrs:
    for seed in seeds:
        # Write both per-run settings in one step before launching the run
        hyperparameters.update(seed=seed, lr=lr)
        training(hyperparameters)

# Low-Data Regime Fine-Tuning

In [None]:
# Full fine-tuning in the low-data regime: for every seed and learning rate,
# train on progressively smaller subsets of the training set.
seeds = [1854, 1717, 3509, 4761, 233, 4561, 5447, 8593, 5259, 5138]
train_sizes = [5000, 2500, 1000, 500, 100]  # number of training samples per run
lrs = [3e-5]
hyperparameters['Finetuning'] = 'Full'

try:
    for seed in seeds:
        for lr in lrs:
            for train_size in train_sizes:
                hyperparameters['seed'] = seed
                hyperparameters['lr'] = lr
                hyperparameters['subsample_train_size'] = train_size
                training(hyperparameters)
finally:
    # `hyperparameters` is shared by every cell of the notebook. Put the
    # subsampling back to "use all the samples" even if the sweep is
    # interrupted or fails, so that no later cell silently trains on the
    # last subset size used here.
    hyperparameters['subsample_train_size'] = None

In [None]:
# BitFit fine-tuning in the low-data regime: for every seed and learning rate,
# train on progressively smaller subsets of the training set.
seeds = [1854, 1717, 3509, 4761, 233, 4561, 5447, 8593, 5259, 5138]
train_sizes = [5000, 2500, 1000, 500, 100]  # number of training samples per run
lrs = [1e-4, 5e-4, 1e-3]
hyperparameters['Finetuning'] = 'BitFit'

try:
    for seed in seeds:
        for lr in lrs:
            for train_size in train_sizes:
                hyperparameters['seed'] = seed
                hyperparameters['lr'] = lr
                hyperparameters['subsample_train_size'] = train_size
                training(hyperparameters)
finally:
    # `hyperparameters` is shared by every cell of the notebook. Put the
    # subsampling back to "use all the samples" even if the sweep is
    # interrupted or fails, so that no later cell silently trains on the
    # last subset size used here.
    hyperparameters['subsample_train_size'] = None

In [None]:
# LayerNorm fine-tuning in the low-data regime: for every seed and learning
# rate, train on progressively smaller subsets of the training set.
seeds = [1854, 1717, 3509, 4761, 233, 4561, 5447, 8593, 5259, 5138]
train_sizes = [5000, 2500, 1000, 500, 100]  # number of training samples per run
lrs = [1e-4, 5e-4, 1e-3]
hyperparameters['Finetuning'] = 'LayerNorm'

try:
    for seed in seeds:
        for lr in lrs:
            for train_size in train_sizes:
                hyperparameters['seed'] = seed
                hyperparameters['lr'] = lr
                hyperparameters['subsample_train_size'] = train_size
                training(hyperparameters)
finally:
    # The reset used to sit after the loops, so it was skipped whenever the
    # sweep was interrupted or raised. In `finally` it always runs, and the
    # shared `hyperparameters` go back to "use all the samples" before any
    # later cell reads them.
    hyperparameters['subsample_train_size'] = None

# Random Fine-Tuning

In [None]:
# Random fine-tuning: only a fraction `ratio_params` of the parameters is
# trained, selected through gradient masks. Every learning rate is run with
# every seed, and a new mask is requested for each run.
seeds = [4500, 6731, 1433, 4120,  943, 9217, 8121, 2816,  633, 5185]
lrs = [5e-4, 1e-3, 5e-3]

hyperparameters['Finetuning'] = 'Random'
# Reference model handed to get_grad_mask; loaded once and reused for all runs
model_base = BertForSequenceClassification.from_pretrained(hyperparameters['model_name'])

try:
    hyperparameters['ratio_params'] = 0.001

    for lr in lrs:
        hyperparameters['lr'] = lr
        for seed in seeds:
            hyperparameters['seed'] = seed
            # NOTE(review): the mask is built before `training` receives the
            # seed. If get_grad_mask draws random numbers and the RNG is only
            # seeded inside `training`, the mask is not reproducible from
            # `seed` alone -- confirm against utils.
            grad_masks = get_grad_mask(model_base, hyperparameters['ratio_params'])
            hyperparameters['grad_masks'] = grad_masks
            training(hyperparameters)
finally:
    # `hyperparameters` is shared by every cell. Restore the two
    # Random-specific entries to their configuration defaults so that the
    # cells that follow do not inherit the last mask and ratio, including
    # when this sweep is interrupted.
    hyperparameters['grad_masks'] = None
    hyperparameters['ratio_params'] = None

# Init+BitFit Fine-Tuning

In [22]:
# Reinitialize all parameters then BitFit
#
# The (seed, learning rate) pairs to run are recovered from the names of the
# Init_BitFit entries already stored in the results directory, e.g.
#   Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.0005_epochs20

import os
import re

# Captures the seed and the learning rate from the '_seed<int>_lr<value>' part
# of a run name. Matching on the labels instead of on a fixed '_' position
# keeps the parsing valid when another field of the name contains '_'.
init_bitfit_run_pattern = re.compile(r'_seed(\d+)_lr([^_]+)')

seeds = []
lrs = []
# os.listdir returns entries in arbitrary order; sorting makes the run order
# the same on every machine and every execution.
for name in sorted(os.listdir('ResultsTest/Models/CoLA')):
    if 'Init_BitFit' not in name:
        continue
    found = init_bitfit_run_pattern.search(name)
    if found is None:
        print(f'Skipping {name}: no seed/lr found in the name')
        continue
    try:
        parsed_lr = float(found.group(2))
    except ValueError:
        print(f'Skipping {name}: cannot read the learning rate')
        continue
    seeds.append(int(found.group(1)))
    lrs.append(parsed_lr)

# Same fine-tuning type for every run, so it is set once
hyperparameters['Finetuning'] = 'Init&BitFit'

for lr, seed in zip(lrs, seeds):

    # A learning rate of exactly 1 is passed as the integer 1; presumably so
    # that the run name built by `training` reads 'lr1' and not 'lr1.0' --
    # TODO confirm against utils_training.
    if lr == 1.:
        lr = int(lr)

    hyperparameters['lr'] = lr
    hyperparameters['seed'] = seed

    training(hyperparameters)



Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.0005_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.0005_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.001_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.001_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.005_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.005_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.01_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3568_lr0.01_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed3583_lr0.001_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3583_lr0.001_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed3583_lr0.005_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed3583_lr0.005_epochs20 already trained




Init_BitFit_bert-base-cased_CoLA_seed4472_lr0.001_epochs20:


Init_BitFit_bert-base-cased_CoLA_seed4472_lr0.001_epochs20 already trained




Init_BitFit

# InitBias+BitFit Fine-Tuning

In [5]:
# InitBias&BitFit fine-tuning for CoLA: every learning rate is run with every seed
seeds = [5271, 6789, 5892, 5078, 3581, 3706, 8757, 5910,  137, 7205]
lrs = [1e-4, 5e-4, 1e-3, 5e-3]
hyperparameters['Finetuning'] = 'InitBias&BitFit'

for lr in lrs:
    # The learning rate only changes with the outer loop
    hyperparameters['lr'] = lr
    for seed in seeds:
        hyperparameters['seed'] = seed
        training(hyperparameters)



InitBias_BitFit_bert-base-cased_CoLA_seed5271_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed5271_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed6789_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed6789_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed5892_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed5892_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed5078_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed5078_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed3581_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed3581_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed3706_lr0.0001_epochs20:


InitBias_BitFit_bert-base-cased_CoLA_seed3706_lr0.0001_epochs20 already trained




InitBias_BitFit_bert-base-cased_CoLA_seed8757_lr0.0001_epochs20:


InitBias_BitFit_bert-base