In [1]:
import os, sys
# 'utils_metrics.py' is a relative name, so realpath() resolves it against the
# current working directory; the folder that contains it is `currentdir`.
currentdir = os.path.split(os.path.realpath('utils_metrics.py'))[0]
# Expose the directory one level above on the import path
# (presumably where the shared utils_* modules live -- TODO confirm).
parentdir = os.path.split(currentdir)[0]
sys.path.append(parentdir)

In [2]:
from utils_metrics import *
from utils_dataset import *
from utils_training import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [3]:
def create_df_models(models_dir="Models"):
    """Collect hyperparameters and results of every saved run into one DataFrame.

    Every sub-directory of ``models_dir`` is scanned and each file inside it is
    loaded with ``torch.load``; one row is produced per file. Entries of
    ``models_dir`` whose name ends with ``.pt`` or that are not directories
    are skipped.

    Each loaded checkpoint is indexed as a mapping with the keys
    ``'hyperparameters'`` (a dict), ``'metrics'`` (an object exposing the
    attributes ``steps``, ``loss_train``, ``loss_val`` and ``metrics``),
    ``'test_loss'`` and ``'test_metrics'``.

    Parameters
    ----------
    models_dir : str, optional
        Root directory holding the run sub-directories. Defaults to
        ``"Models"`` (relative to the working directory).

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with one row per checkpoint. Columns are the
        hyperparameters (``None`` when a run did not record one), the
        training/validation/test results, and ``eval_metric``: the value in
        ``test_metrics`` of the first name listed in ``metrics_names``.
        Missing ``subsample_*_size`` values are shown as ``'All'`` and missing
        ``ratio_params`` / ``grad_masks`` as ``'Unused'``.

    Raises
    ------
    TypeError
        If a checkpoint has no ``metrics_names`` hyperparameter (it is then
        ``None`` and cannot be indexed).
    """
    params_cols = [
        'metrics_names', 'num_eval', 'model_name', 'dataset_name', 'batch_size', 'lr',
        'num_epochs', 'Finetuning', 'subsample_train_size', 'subsample_test_size',
        'max_length', 'train_frac', 'ratio_params', 'grad_masks'
    ]
    results_cols = ['steps', 'train_loss', 'validation_loss', 'validation_metrics', 'test_loss', 'test_metrics']
    all_cols = params_cols + results_cols + ['eval_metric']

    # Attribute of the checkpoint's 'metrics' object that feeds each result column.
    metrics_attrs = {
        'steps': 'steps',
        'train_loss': 'loss_train',
        'validation_loss': 'loss_val',
        'validation_metrics': 'metrics',
    }

    # Human-readable label used when an optional hyperparameter was not set.
    none_labels = {
        'subsample_train_size': 'All',
        'subsample_test_size': 'All',
        'ratio_params': 'Unused',
        'grad_masks': 'Unused',
    }

    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    # sorted() makes the row order reproducible (os.listdir order is arbitrary).
    for group_name in sorted(os.listdir(models_dir)):
        group_path = os.path.join(models_dir, group_name)

        # Skip loose '.pt' files as before, and any other stray non-directory
        # entry that would make the inner listdir fail.
        if group_name.endswith('.pt') or not os.path.isdir(group_path):
            continue

        for file_name in sorted(os.listdir(group_path)):
            # NOTE(security): torch.load unpickles arbitrary objects -- only
            # load checkpoints from a trusted source. NOTE(review): recent
            # PyTorch releases default to weights_only=True, which may reject
            # the custom 'metrics' object stored here -- confirm against the
            # installed version.
            checkpoint = torch.load(os.path.join(group_path, file_name))

            hyperparameters = checkpoint['hyperparameters']
            record = {col: hyperparameters.get(col) for col in params_cols}

            for col in results_cols:
                if col.startswith('test'):
                    record[col] = checkpoint[col]
                else:
                    record[col] = getattr(checkpoint['metrics'], metrics_attrs[col])

            # The first listed metric is the one used to compare runs.
            eval_metric = record['metrics_names'][0]
            record['eval_metric'] = record['test_metrics'][eval_metric]

            # `is None` (not `== None`): the value may be array-like, where
            # `==` is element-wise and has no single truth value.
            for col, label in none_labels.items():
                if record[col] is None:
                    record[col] = label

            records.append(record)

    # dtype=object keeps every cell exactly as stored (no None -> NaN coercion).
    return pd.DataFrame(records, columns=all_cols, dtype=object)

In [4]:
def find_best_lr(df):
    """Return the learning rate whose runs reach the highest median ``eval_metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``'lr'`` and ``'eval_metric'``.
        Other columns (including non-numeric ones) are ignored.

    Returns
    -------
    The ``'lr'`` value with the largest per-group median of ``eval_metric``;
    on ties the first such group (in sorted ``lr`` order) wins.

    Raises
    ------
    ValueError
        If ``df`` has no rows, or ``eval_metric`` holds non-numeric values.
    """
    # Select the metric column *before* aggregating: taking the median of the
    # whole frame fails on string/list columns in pandas >= 2.0.
    # to_numeric also copes with an object-dtype column of plain floats.
    scores = pd.to_numeric(df['eval_metric'])
    median_by_lr = scores.groupby(df['lr']).median()

    return median_by_lr.idxmax()

In [5]:
# Gather all saved runs into one table and write it to 'results_df.csv'
# (row index omitted) so the following cells can reload it from disk.
df = create_df_models()
df.to_csv(path_or_buf='results_df.csv', encoding='utf-8', index=False)

In [6]:
# Get Results for Full vs BitFit vs LayerNorm vs Random
# For every (fine-tuning method, dataset, learning rate) combination trained on
# the full training set ('All'), print mean / median / std of `eval_metric`
# and store a "mean(±std)" entry in a summary table written to
# 'Tables/Table_CompareMethods.csv'.
df = pd.read_csv('results_df.csv', encoding = 'utf-8')

# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the table on every
# call. The saved row index is now 0..n-1 (previously every row had index 0).
table_rows = []

for FT in ['Full', 'BitFit', 'LayerNorm', 'Random', 'InitBias&BitFit']:
    print(f"\n{FT} Finetuning:\n")
    
    for dataset in ['CoLA', 'RTE', 'SST-2', 'QNLI']:
        print(f"{dataset}:")
        df_data = df[(df['dataset_name']==dataset) & (df['Finetuning']==FT) & (df['subsample_train_size']=='All')]
        
        # No run for this (dataset, method) pair: nothing to report.
        if len(df_data) == 0:
            continue
            
        for lr in df_data['lr'].unique():
            
            df_lr = df_data[df_data['lr'] == lr]
            scores = df_lr['eval_metric']
            
            mean = scores.mean()
            median = scores.median()
            # Sample standard deviation; NaN when only one run is available.
            std = scores.std()
            
            print(f"\tlr = {lr}: Mean={mean}, Median={median}, Std={std}")
                        
            entry = "{:.3f}(\u00B1{:.3f})".format(mean, std)
            table_rows.append([dataset, FT, lr, entry])
            
table_df = pd.DataFrame(table_rows, columns = ['dataset', 'FT', 'lr', 'entry'])

# Make sure the output folder exists so a fresh checkout can run this cell.
os.makedirs('Tables', exist_ok=True)
table_df.to_csv('Tables/Table_CompareMethods.csv', encoding = 'utf-8')


Full Finetuning:

CoLA:
	lr = 3e-05: Mean=0.5085524897925381, Median=0.508158687603464, Std=0.017428417443508794
RTE:
	lr = 3e-05: Mean=0.6559566787003609, Median=0.6606498194945849, Std=0.03224715756005855
SST-2:
	lr = 3e-05: Mean=0.8970183486238532, Median=0.8956422018348624, Std=0.014077973430664142
QNLI:
	lr = 3e-05: Mean=0.8904942965779468, Median=0.8893536121673004, Std=0.005397364854212353

BitFit Finetuning:

CoLA:
	lr = 0.0001: Mean=0.5262868712370707, Median=0.5247574843088701, Std=0.014079393296255958
	lr = 0.0005: Mean=0.5390492116379437, Median=0.5381491859818925, Std=0.01598510325445911
	lr = 0.001: Mean=0.5359924526446337, Median=0.5355703887281442, Std=0.017790763597585916
RTE:
	lr = 0.0001: Mean=0.6530685920577618, Median=0.6552346570397112, Std=0.020311649833811805
	lr = 0.0005: Mean=0.6703971119133574, Median=0.6750902527075813, Std=0.01560209194547576
	lr = 0.001: Mean=0.6563176895306858, Median=0.6678700361010831, Std=0.03857545204360593
SST-2:
	lr = 0.0001: Mean=

In [7]:
# Get Results for Full vs BitFit vs LayerNorm with different sizes
# Uses the `df` results frame loaded in the previous cell. For every
# (fine-tuning method, dataset, training-set size, learning rate) combination,
# print mean / median / std of `eval_metric` and store a "mean(±std)" entry in
# a summary table written to 'Tables/Table_CompareMethodsDiffSize.csv'.

# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the table on every
# call. The saved row index is now 0..n-1 (previously every row had index 0).
table_rows = []

for FT in ['Full', 'BitFit', 'LayerNorm']:
    print(f"\n{FT} Finetuning:\n")
    for dataset in ['CoLA', 'RTE']:
        print(f"{dataset}:")
        df_data = df[(df['dataset_name']==dataset) & (df['Finetuning']==FT)]
        # Skip pairs with no runs, or with a single training-set size
        # (there is nothing to compare across sizes).
        if (len(df_data) == 0) or (len(df_data['subsample_train_size'].unique())==1):
            continue
        
        for train_size in df_data['subsample_train_size'].unique():
            
            df_size = df_data[df_data['subsample_train_size']==train_size]
            
            print(f"Size = {train_size}:")
            
            for lr in df_size['lr'].unique():
                df_lr = df_size[df_size['lr'] == lr]
                scores = df_lr['eval_metric']
                
                mean = scores.mean()
                median = scores.median()
                # Sample standard deviation; NaN when only one run is available.
                std = scores.std()
                print(f"\tlr = {lr}: Mean={mean}, Median={median}, Std={std}")
                
                entry = "{:.3f}(\u00B1{:.3f})".format(mean, std)
                
                table_rows.append([dataset, FT, lr, train_size, entry])
                
table_df = pd.DataFrame(table_rows, columns = ['dataset', 'FT', 'lr', 'size', 'entry'])

# Make sure the output folder exists so a fresh checkout can run this cell.
os.makedirs('Tables', exist_ok=True)
table_df.to_csv('Tables/Table_CompareMethodsDiffSize.csv', encoding = 'utf-8')


Full Finetuning:

CoLA:
Size = All:
	lr = 3e-05: Mean=0.5085524897925381, Median=0.508158687603464, Std=0.017428417443508794
Size = 1000:
	lr = 3e-05: Mean=0.37619534323899595, Median=0.38516579035576326, Std=0.046094026471819594
Size = 100:
	lr = 3e-05: Mean=0.10172404661954085, Median=0.05877012763685075, Std=0.09279516528743464
Size = 2500:
	lr = 3e-05: Mean=0.40530721413227006, Median=0.4264365907436337, Std=0.04006358381308652
Size = 5000:
	lr = 3e-05: Mean=0.4706719521617142, Median=0.47077323631621976, Std=0.03281955494505523
Size = 500:
	lr = 3e-05: Mean=0.28857158010600664, Median=0.26504811464593875, Std=0.0779410857375012
RTE:
Size = All:
	lr = 3e-05: Mean=0.6559566787003609, Median=0.6606498194945849, Std=0.03224715756005855
Size = 1000:
	lr = 3e-05: Mean=0.6209386281588447, Median=0.6245487364620939, Std=0.04030838561839766
Size = 100:
	lr = 3e-05: Mean=0.5339350180505414, Median=0.5379061371841156, Std=0.03607901451430668
Size = 2000:
	lr = 3e-05: Mean=0.6606498194945849