# FAST: Feedforward-ASsisted Transformers

Notebook for running GLUE tasks.

# Setup

In [1]:
# Encoder family used to pick the pretrained checkpoints ("MPNet" or "DistilRoBERTa").
model_param = "DistilRoBERTa"
# GLUE task identifier; used as the lookup key into the task configuration table.
task_param = "mnli_mismatched"
# Embedding type(s) to use; the matrices of all listed types are concatenated column-wise.
embedding_param = ["cls_single"]

## Modules

In [2]:
import random
import time
import csv
import os
import pathlib
import itertools
from datetime import datetime
from collections import namedtuple
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
from utils.mean_pooling import mean_pooling
from utils.energy import get_energy

## Device

In [3]:
# Set CUBLAS_WORKSPACE_CONFIG (see the PyTorch reproducibility notes for the accepted values)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"
os.environ['CUDNN_DETERMINISTIC'] = '1'

# Standardized default seed, applied to every random number generator the notebook uses
seed = 7
random.seed(seed)
np.random.seed(seed)
# Seed torch here as well so that any torch randomness before the training loop
# (which re-seeds on its own) is reproducible too.
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(mode=True)

In [4]:
# Pick the compute device. `device_name` is always a plain string accepted by
# torch.device(); `device` is the corresponding torch.device object.
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    # Presence of COLAB_GPU is used as a hint that the notebook runs on Colab (unverified heuristic).
    # The device type is still "cuda": "gpu" is not a device string torch.device() accepts.
    if 'COLAB_GPU' in os.environ:
        print("colab environment")
    device_name = "cuda"  # CUDA for NVIDIA GPU
else:
    # torch.backends.mps is absent in older torch builds, so look it up defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU

# device_name = "cuda:0"
device = torch.device(device_name)
print(device_name)

mps


## User parameters

Parameters to set:
- Model
    - MPNet
    - DistilRoBERTa
- Task
    - cola
    - sst2
    - mrpc
    - stsb
    - qqp
    - mnli_matched
    - mnli_mismatched
    - qnli
    - rte
    - wnli
- Embedding type
    - Single Sentence
        - cls
        - meanpool
        - sentence
    - Two Sentence
        - Each sentence separately
            - cls_single
            - meanpool_single
        - Both sentences at once
            - cls_double
            - meanpool_double
        - sentence_single (no sentence_double option)

## Models

In [5]:
# Load only the encoders that the requested embedding types need:
#   * "cls" / "meanpool" embeddings use a tokenizer plus a transformers model
#   * "sentence" embeddings use a SentenceTransformer model
if model_param in ("MPNet", "DistilRoBERTa"):
    embedding_types = "_".join(embedding_param)
    needs_token_model = ("cls" in embedding_types) or ("meanpool" in embedding_types)
    needs_sentence_model = "sentence" in embedding_types

    if model_param == "MPNet":
        if needs_token_model:
            from transformers import MPNetTokenizer, MPNetModel
            # NOTE(review): tokenizer and model come from different checkpoints here — confirm this is intended.
            tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
            model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)

        if needs_sentence_model:
            from sentence_transformers import SentenceTransformer
            sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

    else:  # "DistilRoBERTa"
        if needs_token_model:
            from transformers import RobertaTokenizer, RobertaModel
            tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
            model = RobertaModel.from_pretrained('distilroberta-base').to(device)

        if needs_sentence_model:
            from sentence_transformers import SentenceTransformer
            sentence_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)

In [6]:
# Static description of each supported GLUE task:
#   sentence_type : "one" or "two" text inputs per example
#   class_type    : "BC" / "MC" are treated as classification and "R" as regression downstream
#                   (presumably binary vs. multi-class — confirm against FeedForward)
#   num_classes   : number of model outputs
#   col_names     : dataset column(s) holding the input text
TaskConfig = namedtuple("TaskConfig", "sentence_type class_type num_classes col_names")

task_configs = dict(
    cola=TaskConfig(sentence_type="one", class_type="BC", num_classes=1, col_names=['sentence']),
    sst2=TaskConfig(sentence_type="one", class_type="BC", num_classes=1, col_names=['sentence']),
    mrpc=TaskConfig(sentence_type="two", class_type="BC", num_classes=1, col_names=['sentence1', 'sentence2']),
    stsb=TaskConfig(sentence_type="two", class_type="R", num_classes=1, col_names=['sentence1', 'sentence2']),
    qqp=TaskConfig(sentence_type="two", class_type="BC", num_classes=1, col_names=['question1', 'question2']),
    mnli_matched=TaskConfig(sentence_type="two", class_type="MC", num_classes=3, col_names=['premise', 'hypothesis']),
    mnli_mismatched=TaskConfig(sentence_type="two", class_type="MC", num_classes=3, col_names=['premise', 'hypothesis']),
    qnli=TaskConfig(sentence_type="two", class_type="BC", num_classes=1, col_names=['question', 'sentence']),
    rte=TaskConfig(sentence_type="two", class_type="BC", num_classes=1, col_names=['sentence1', 'sentence2']),
    wnli=TaskConfig(sentence_type="two", class_type="BC", num_classes=1, col_names=['sentence1', 'sentence2']),
)

# Configuration of the task selected in the parameter cell.
task_config = task_configs[task_param]

## Dataset

In [7]:
# The two MNLI variants share the "mnli" GLUE config and differ only in which
# validation/test splits are used; every other task uses the default split names.
mnli_splits = {
    "mnli_matched": ("validation_matched", "test_matched"),
    "mnli_mismatched": ("validation_mismatched", "test_mismatched"),
}

if task_param in mnli_splits:
    data = load_dataset("glue", "mnli")
    val_key, test_key = mnli_splits[task_param]
else:
    data = load_dataset("glue", task_param)
    val_key, test_key = "validation", "test"

data

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

# Embeddings

Labels come directly from the dataset, so there is no need to save them to file.

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Labels for each split, taken straight from the loaded dataset.
Y_train = np.array(data["train"]["label"])
Y_val = np.array(data[val_key]["label"])
Y_test = np.array(data[test_key]["label"])

if task_config.class_type == "MC":
    # OneHotEncoder expects a 2-D (n_samples, 1) column.
    Y_train = np.reshape(Y_train, (-1, 1))
    Y_val = np.reshape(Y_val, (-1, 1))
    Y_test = np.reshape(Y_test, (-1, 1))

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later removed;
    # try the current keyword first and fall back for older installations.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    ohe.fit(Y_train)
    print(ohe.categories_)

    # With handle_unknown='ignore', labels not seen in the training split become all-zero rows.
    # NOTE(review): GLUE test splits appear to carry placeholder labels, so Y_test may be
    # all zeros and test metrics computed from it would be meaningless — confirm.
    Y_train = ohe.transform(Y_train)
    Y_val = ohe.transform(Y_val)
    Y_test = ohe.transform(Y_test)

print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)

[array([0, 1, 2])]
(392702, 3)
(9815, 3)
(9796, 3)


## Check for saved embeddings

In [9]:
# For every requested embedding type, load the cached train/val/test matrices if all
# three .npy files exist; otherwise record the embedding's position in
# `embedding_tracker` and flag which kind of generation pipeline would be needed.
total_length = len(embedding_param)
print(embedding_param)

X_train, X_val, X_test, embedding_tracker = [None]*total_length, [None]*total_length, [None]*total_length, []
create_dataloader, create_sentence = False, False

# `emb_idx` rather than `id`, so the builtin id() is not shadowed at notebook scope.
for emb_idx, embedding in enumerate(embedding_param):

    cache_path = pathlib.Path(f"./cache/{embedding}/{task_param}")
    cache_path.mkdir(parents=True, exist_ok=True)

    file_names = ['X_train', 'X_val', 'X_test']
    paths = [cache_path / f"{file_name}_{model_param}.npy" for file_name in file_names]

    if all(path.exists() for path in paths):
        print(f"{embedding} embeddings found!")
        X_train[emb_idx] = np.load(paths[0])
        X_val[emb_idx] = np.load(paths[1])
        X_test[emb_idx] = np.load(paths[2])

        # Report the shapes of the embedding just loaded (not always the first one).
        print(f"X_train shape: {X_train[emb_idx].shape}")
        print(f"X_val shape  : {X_val[emb_idx].shape}")
        print(f"X_test shape : {X_test[emb_idx].shape}")

    else:
        embedding_tracker.append(emb_idx)
        print(f"No {embedding} saved embeddings found")

        if "cls" in embedding or "meanpool" in embedding:
            create_dataloader = True
        elif "sentence" in embedding:
            create_sentence = True



['cls_single']
cls_single embeddings found!
X_train shape: (392702, 1536)
X_val shape  : (9815, 1536)
X_test shape : (9796, 1536)


In [10]:
# Stack the per-embedding matrices column-wise into one feature matrix per split.

# Fail early with a readable message if any requested embedding has no cached matrix.
for split_name, parts in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    if isinstance(parts, list) and any(part is None for part in parts):
        raise RuntimeError(
            f"{split_name} is missing cached embeddings for at least one requested embedding type; "
            "generate and cache them before running this cell."
        )

# Only concatenate while the variables still hold the per-embedding lists, so that
# re-running this cell after a successful run is a no-op instead of an error.
if isinstance(X_train, list):
    X_train = np.concatenate(X_train, axis=1)
if isinstance(X_val, list):
    X_val = np.concatenate(X_val, axis=1)
if isinstance(X_test, list):
    X_test = np.concatenate(X_test, axis=1)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape  : {X_val.shape}")
print(f"X_test shape : {X_test.shape}")

X_train shape: (392702, 1536)
X_val shape  : (9815, 1536)
X_test shape : (9796, 1536)


## Transforming Embeddings

In [11]:
def add_embeddings(embeddings, column_ids, embedding_size, is_UV, is_diff, is_mult):
    '''
    Insert derived embeddings (difference and/or product) for selected (U, V) pairs.

    The matrix is treated as a sequence of column blocks, each `embedding_size` wide.
    For example, for the matrix
    [Z1, Z2, U1, V1, Z3, U2, V2], if we want to replace this with:
    [Z1, Z2, U1 - V1, Z3, U2, V2, U2 - V2, U2 * V2], we provide the parameters:
    is_UV = [False, True] : U1, V1 are NOT kept, but U2, V2 are kept
    is_diff = [True, True] : both U1 - V1 and U2 - V2 are added
    is_mult = [False, True] : U1 * V1 is not included, U2 * V2 is included

    Args:
        embeddings : original 2-D matrix (rows = samples) to transform; it is not modified
        column_ids : block index of each Ux in the ORIGINAL matrix, in ascending order
                     (we assume Vx immediately follows Ux); for the above example,
                     we would provide column_ids = [2, 5].
                     If you DO NOT want to replace a certain Ux, simply don't include its id in column_ids
        embedding_size : width (number of columns) of a single block
        is_UV : per pair, should Ux, Vx be kept
        is_diff : per pair, should Ux - Vx be included
        is_mult : per pair, should Ux * Vx be included

    Returns:
        A new matrix with the requested blocks inserted/removed. When both is_diff and
        is_mult are False for a pair, nothing is inserted for it (and Ux, Vx are simply
        dropped if is_UV is False).

    Raises:
        ValueError : if a flag list has fewer entries than column_ids
    '''
    num_pairs = len(column_ids)
    for flag_name, flags in (("is_UV", is_UV), ("is_diff", is_diff), ("is_mult", is_mult)):
        if len(flags) < num_pairs:
            raise ValueError(
                f"{flag_name} has {len(flags)} entries but column_ids has {num_pairs}"
            )

    block_shift = 0  # net number of blocks inserted (+) / removed (-) by the pairs handled so far
    for pair_idx, column_id in enumerate(column_ids):
        # Position of this pair in the already-modified matrix.
        start_id = (column_id + block_shift) * embedding_size
        U_id = start_id + embedding_size
        V_id = start_id + (2*embedding_size)

        U = embeddings[:, start_id:U_id]
        V = embeddings[:, U_id:V_id]

        # Build exactly the derived blocks that were requested (possibly none).
        derived = []
        if is_diff[pair_idx]:
            derived.append(U - V)
        if is_mult[pair_idx]:
            derived.append(U * V)

        # Keep everything up to and including V when the pair is retained,
        # otherwise cut the matrix right before U so that U and V are dropped.
        keep_until = V_id if is_UV[pair_idx] else start_id
        embeddings = np.hstack([
            embeddings[:, :keep_until],  # Part of original matrix before the inserted blocks
            *derived,                    # New embeddings to insert
            embeddings[:, V_id:]         # Part of original matrix after the pair
        ])

        # Later column_ids refer to the original layout, so track how far blocks have moved.
        block_shift += len(derived)
        if not is_UV[pair_idx]:
            block_shift -= 2
    return embeddings

In [12]:
######################
# Choose ids of (Ux, Vx) to alter embeddings
column_ids = [0] if task_config.sentence_type != "one" else []
######################

splits = (X_train, X_val, X_test)

if column_ids:
    def f(embeddings):
        """Replace each selected (U, V) pair with U - V only (768-wide blocks)."""
        return add_embeddings(embeddings=embeddings, column_ids=column_ids, embedding_size=768,
                              is_UV=[False], is_diff=[True], is_mult=[False])

    X_train_computed, X_val_computed, X_test_computed = [f(embeddings=split) for split in splits]
else:
    # Single-sentence tasks: nothing to combine, use the embeddings as they are.
    X_train_computed, X_val_computed, X_test_computed = splits

for computed in (X_train_computed, X_val_computed, X_test_computed):
    print(computed.shape)

(392702, 768)
(9815, 768)
(9796, 768)


In [13]:
# Final training set = training split followed by validation split (stacked row-wise).
X_train_combined = np.concatenate([X_train_computed, X_val_computed])
Y_train_combined = np.concatenate([Y_train, Y_val])
print(X_train_combined.shape, Y_train_combined.shape)

(402517, 768) (402517, 3)


# Train & Eval on Test set

## Set best hyperparameter combination

In [14]:
# Width of the feature matrix that is fed to the feed-forward network.
input_size = X_train_computed.shape[1]

# NO ABS

# best hyperparameter combination
# Data-dependent entries (category, input_size, num_classes) are derived from the
# selected task and the computed features so they cannot drift out of sync with them.
# NOTE(review): assumes FeedForward's `category` uses the same codes as
# TaskConfig.class_type ("BC" / "MC" / "R") — confirm.
param_grid = {
    'max_epochs': [6],
    'batch_size': [32],
    'learning_rate': [0.001],
    'category': [task_config.class_type],
    'norm': [False],
    'input_size': [input_size],
    'layer_size': [384],
    'num_classes': [task_config.num_classes],
    'num_layers': [10],
    'weight_decay': [0.0001],
    'patience': [3],
    'min_delta': [0],
    'verbose': [True],
    'device': ['mps']  # placeholder; replaced by the detected device below
}

# default overrides
param_grid['verbose'] = [True]
param_grid['device'] = [device_name]
param_grid['norm'] = [False]

# Create a list of all combinations of hyperparameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
print(f"{len(all_params)} hyperparameter combinations")

1 hyperparameter combinations


In [15]:
# Train the feed-forward network on train+validation data for every hyper-parameter
# combination, evaluate on the test split, and record metrics, timings and predictions.
y_pred = None

# Create output folder if it doesn't exist (exist_ok avoids the check-then-create race)
os.makedirs('output', exist_ok=True)

# Setup for logging
# Join the list of embedding names; joining embedding_param[0] would insert "_"
# between every character of that single name.
console_output_filename = f'./output/{"_".join(embedding_param)}_{task_param}_console_output.txt'

with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN FINAL TRAINING LOOP\n\n')

# Setup for saving results
results_folder = pathlib.Path(f"results/{embedding_param[0]}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
test_results_file = results_folder / f"test_{save_file_id}_{model_param}.csv"
test_y_pred_file = results_folder / f"y_pred_{save_file_id}_{model_param}.tsv"

# different metrics are recorded for classification vs regression tasks
if task_config.class_type in ["BC", "MC"]:
    metric_types = ['mcc', 'f1', 'accuracy']
elif task_config.class_type == "R":
    metric_types = ['pearson', 'spearman']

# Start the results CSV with a header: metrics, timing/energy columns, then the hyper-parameter names
with open(test_results_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = metric_types + ['training time / epoch', 'embedding time', 'training energy / epoch', 'embedding energy'] + list(all_params[0].keys())
    writer.writerow(header)
print(f"saving results to ./{test_results_file}")
# Saves best accuracy for progress bar display
display_best = float("-inf")

# Iterate over all combinations of hyperparameters
bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Formatting params to display
    print_params = params.copy()
    for param in ['category', 'device']:
        del print_params[param]

    # reset torch so that results are consistent
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    _, train_times_per_epoch, energy_per_epoch = feed_forward.fit(X_train_combined,
                                                                Y_train_combined)

    # Evaluate on the test split and keep the raw predictions.
    # NOTE(review): if the test labels are placeholders, these metrics are not meaningful — confirm.
    metrics = feed_forward._validate(X_test_computed, Y_test, return_predictions=True)

    y_pred = metrics["predictions"]
    del metrics["predictions"]

    # One row per test example: running index plus predicted value.
    # The same file is rewritten on every iteration, so it ends up holding the
    # predictions of the last hyper-parameter combination.
    y_pred_df = pd.DataFrame(y_pred, columns=["prediction"])
    y_pred_df["index"] = y_pred_df.index
    y_pred_df = y_pred_df[["index", "prediction"]]
    y_pred_df.to_csv(test_y_pred_file, sep='\t', index=False, header=True)
    print(f"saving predictions to ./{test_y_pred_file}")

    # Log average training time per epoch for current parameter set
    # Note: FFN frequently stops early
    training_time = np.mean(train_times_per_epoch)
    training_energy = np.mean(energy_per_epoch)
    # Compute energy for embedding generation
    embedding_energy = 0.0 # This method effectively just computes energy for a given time

    metric_vals = [metrics[mt] for mt in metric_types]

    # displaying results in progress bar
    display_recent = metrics["pearson" if task_config.class_type == "R" else "accuracy"]
    display_best = max(display_best, display_recent)
    bar.set_description(f"Best: {display_best:.5f}, Last: {display_recent:.5f}")

    # Write stats to log file (these are test-split metrics, so label them as such)
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        for name, val in zip(metric_types, metric_vals):
            logfile.write(f"\nTest {name}: {val}")
        logfile.write(f"\nTraining time      : {training_time}")
        logfile.write(f"\nTraining energy    : {training_energy}")
        logfile.write(f"\nEmbedding energy   : {embedding_energy}")
    # Write to results csv (embedding time is recorded as 0.0)
    with open(test_results_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row = metric_vals + [training_time, 0.0, training_energy, embedding_energy] + list(params.values())
        writer.writerow(row)

saving results to ./results/cls_single/mnli_matched/test_20240110_001155_DistilRoBERTa.csv


  0%|          | 0/1 [00:00<?, ?it/s]

saving predictions to ./results/cls_single/mnli_matched/y_pred_20240110_001155_DistilRoBERTa.tsv


In [None]:
# Reload the results CSV written by the training loop and display it.
results_df = pd.read_csv(test_results_file)
results_df

Unnamed: 0,mcc,f1,accuracy,training time / epoch,embedding time,training energy / epoch,embedding energy,max_epochs,batch_size,learning_rate,...,norm,input_size,layer_size,num_classes,num_layers,weight_decay,patience,min_delta,device,verbose


In [None]:
# Re-run the test-split evaluation with the last trained model to inspect the metrics dict and predictions.
# NOTE(review): all-zero accuracy/F1/MCC would be consistent with placeholder test labels — confirm.
feed_forward._validate(X_test_computed, Y_test, return_predictions=True)

{'loss': 2.4890098571777344,
 'accuracy': 0.0,
 'f1': 0.0,
 'mcc': 0.0,
 'predictions': tensor([[1],
         [1],
         [1],
         ...,
         [1],
         [1],
         [0]])}

In [None]:
# Convert predicted class indices to label strings, using the mapping that belongs to
# the selected task. Applying the two-class entailment mapping to every task mislabels
# MNLI (class 1 is "neutral", class 2 is "contradiction").
# NOTE(review): tasks not listed here keep their numeric predictions — confirm this
# matches the expected submission format for those tasks.
entailment_labels = {0: 'entailment', 1: 'not_entailment'}
mnli_labels = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
submission_labels = {
    "mnli_matched": mnli_labels,
    "mnli_mismatched": mnli_labels,
    "qnli": entailment_labels,
    "rte": entailment_labels,
}

label_map = submission_labels.get(task_param)
if label_map is not None:
    y_pred_df['prediction'] = y_pred_df['prediction'].replace(label_map)

In [None]:
# Overwrite the predictions TSV with the current contents of y_pred_df.
y_pred_df.to_csv(test_y_pred_file, sep='\t', index=False, header=True)