# FAST: Feedforward-ASsisted Transformers

Notebook for running GLUE tasks.

# Setup

## Modules

In [1]:
import random
import time
import csv
import os
import pathlib
import itertools
from datetime import datetime
from collections import namedtuple
from tqdm.notebook import tqdm

from carbontracker.tracker import CarbonTracker

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

# from utils.feed_forward import FeedForward
from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
from utils.mean_pooling import mean_pooling
from utils.energy import get_energy

## Device

In [2]:
# Reproducibility setup: deterministic cuBLAS/cuDNN settings plus fixed RNG seeds.
# CUBLAS_WORKSPACE_CONFIG is set before deterministic algorithms are enabled below.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"
os.environ['CUDNN_DETERMINISTIC'] = '1'

# Standardized default seed
seed = 7
random.seed(seed)
np.random.seed(seed)
# Seed torch as well so that anything torch-random before the training loop
# (which re-seeds per run) is reproducible too.
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(mode=True)

In [3]:
# Pick the compute device. `device_name` is always a plain string so it can be
# passed to torch.device(...) and reused as a hyperparameter value later on.
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    # Colab GPUs are ordinary CUDA devices; "gpu" is not a valid torch device type.
    if 'COLAB_GPU' in os.environ:
        print("colab environment")
    device_name = "cuda"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU

# device_name = "cuda:0"
device = torch.device(device_name)
print(device_name)

mps


## User parameters

Parameters to set:
- Model
    - MPNet
    - DistilRoBERTa
- Task
    - cola
    - sst2
    - mrpc
    - stsb
    - qqp
    - mnli_matched
    - mnli_mismatched
    - qnli
    - rte
    - wnli
- Embedding type
    - Single Sentence
        - cls
        - meanpool
        - sentence
    - Two Sentence
        - Each sentence separately
            - cls_single
            - meanpool_single
        - Both sentences at once
            - cls_double
            - meanpool_double
        - sentence_single (no sentence_double option)

In [4]:
# Encoder family to use; the model-loading cell handles "MPNet" and "DistilRoBERTa".
model_param = "DistilRoBERTa"
# GLUE task name; must be a key of the task configuration table.
task_param = "cola"
# List of embedding types to compute/load; they are concatenated column-wise in this order.
embedding_param = ["meanpool"]

# When True, CarbonTracker is used instead of wall-clock timing.
use_carbontracker = False
# NOTE: Carbontracker should generate its own output file
# => logged benchmarking results will be dummy values if use_carbontracker = True

## Models

In [5]:
# Load only the encoders that the requested embedding types actually need.
# Token-level models serve the cls/meanpool embeddings; the SentenceTransformer
# serves the "sentence" embeddings.
_requested_embeddings = "_".join(embedding_param)
_wants_token_model = "cls" in _requested_embeddings or "meanpool" in _requested_embeddings
_wants_sentence_model = "sentence" in _requested_embeddings

if model_param == "MPNet":

    if _wants_token_model:
        from transformers import MPNetTokenizer, MPNetModel
        # NOTE(review): tokenizer and model come from different checkpoints - confirm this is intended.
        tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
        model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)

    if _wants_sentence_model:
        from sentence_transformers import SentenceTransformer
        sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

elif model_param == "DistilRoBERTa":

    if _wants_token_model:
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
        model = RobertaModel.from_pretrained('distilroberta-base').to(device)

    if _wants_sentence_model:
        from sentence_transformers import SentenceTransformer
        sentence_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)

In [6]:
# Per-task settings:
#   sentence_type : "one" or "two" input text columns
#   class_type    : "BC", "MC" or "R" (checked later to pick metrics and label encoding)
#   num_classes   : value passed on as the `num_classes` hyperparameter
#   col_names     : dataset columns holding the input text
TaskConfig = namedtuple("TaskConfig", "sentence_type class_type num_classes col_names")

# One row per GLUE task: (task name, sentence_type, class_type, num_classes, col_names)
_TASK_ROWS = [
    ("cola", "one", "BC", 1, ['sentence']),
    ("sst2", "one", "BC", 1, ['sentence']),
    ("mrpc", "two", "BC", 1, ['sentence1', 'sentence2']),
    ("stsb", "two", "R", 1, ['sentence1', 'sentence2']),
    ("qqp", "two", "BC", 1, ['question1', 'question2']),
    ("mnli_matched", "two", "MC", 3, ['premise', 'hypothesis']),
    ("mnli_mismatched", "two", "MC", 3, ['premise', 'hypothesis']),
    ("qnli", "two", "BC", 1, ['question', 'sentence']),
    ("rte", "two", "BC", 1, ['sentence1', 'sentence2']),
    ("wnli", "two", "BC", 1, ['sentence1', 'sentence2']),
]

task_configs = {task_name: TaskConfig(*settings) for task_name, *settings in _TASK_ROWS}

# Settings for the task selected via `task_param` (raises KeyError for an unknown task name).
task_config = task_configs[task_param]

## Dataset

In [7]:
# Both MNLI variants load the same GLUE "mnli" subset and differ only in which
# validation/test splits they use; every other task uses the default split names.
_mnli_variant = {"mnli_matched": "matched", "mnli_mismatched": "mismatched"}.get(task_param)

if _mnli_variant is not None:
    data = load_dataset("glue", "mnli")
    val_key = "validation_" + _mnli_variant
    test_key = "test_" + _mnli_variant
else:
    data = load_dataset("glue", task_param)
    val_key = "validation"
    test_key = "test"

data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

# Embeddings

Labels come directly from dataset so no need to save to file

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Labels for each split, taken straight from the dataset.
Y_train = np.array(data["train"]["label"])
Y_val = np.array(data[val_key]["label"])
Y_test = np.array(data[test_key]["label"])

# Multi-class tasks are one-hot encoded.
if task_config.class_type == "MC":
    Y_train = np.reshape(Y_train, (-1, 1))
    Y_val = np.reshape(Y_val, (-1, 1))
    Y_test = np.reshape(Y_test, (-1, 1))

    # scikit-learn renamed `sparse` to `sparse_output` and later removed the old
    # keyword; try the new name first and fall back for older installations.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    # The encoder is fitted on the training labels only; handle_unknown='ignore'
    # governs how labels not seen during fitting are encoded.
    ohe.fit(Y_train)
    print(ohe.categories_)

    Y_train = ohe.transform(Y_train)
    Y_val = ohe.transform(Y_val)
    Y_test = ohe.transform(Y_test)

print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)

(8551,)
(1043,)
(1063,)


## Check for saved embeddings

In [9]:
# Look for cached embeddings of every requested type.
#   X_train / X_val / X_test : one slot per entry of embedding_param (None until filled)
#   embedding_tracker        : positions in embedding_param that still need computing
#   create_dataloader        : at least one cls/meanpool embedding is missing
#   create_sentence          : at least one sentence embedding is missing
total_length = len(embedding_param)
print(embedding_param)

X_train, X_val, X_test = [None] * total_length, [None] * total_length, [None] * total_length
embedding_tracker = []
create_dataloader, create_sentence = False, False

for emb_idx, embedding in enumerate(embedding_param):

    cache_path = pathlib.Path(f"./cache/{embedding}/{task_param}")
    cache_path.mkdir(parents=True, exist_ok=True)

    file_names = ['X_train', 'X_val', 'X_test']
    paths = [cache_path / f"{f}_{model_param}.npy" for f in file_names]

    if all(path.exists() for path in paths):
        print(f"{embedding} embeddings found!")
        X_train[emb_idx] = np.load(paths[0])
        X_val[emb_idx] = np.load(paths[1])
        X_test[emb_idx] = np.load(paths[2])

        # Report the shapes of the embedding just loaded (not always slot 0,
        # which may still be None when the first embedding is not cached).
        print(f"X_train shape: {X_train[emb_idx].shape}")
        print(f"X_val shape  : {X_val[emb_idx].shape}")
        print(f"X_test shape : {X_test[emb_idx].shape}")

    else:
        embedding_tracker.append(emb_idx)
        print(f"No {embedding} saved embeddings found")

        if "cls" in embedding or "meanpool" in embedding:
            create_dataloader = True
        elif "sentence" in embedding:
            create_sentence = True



['meanpool']
meanpool embeddings found!
X_train shape: (8551, 768)
X_val shape  : (1043, 768)
X_test shape : (1063, 768)


## Tokenize dataset

In [10]:
# Every sequence is padded/truncated to exactly this many tokens.
max_len = 512

def tokenize(examples, name):
    """Tokenize the first text column of a batch for a single-sentence embedding.

    Args:
        examples: batch passed in by ``data.map(..., batched=True)``; indexed by column name.
        name: embedding name used as a prefix for the output columns.

    Returns:
        dict mapping ``"<name>_<tokenizer key>"`` to the tokenizer's output tensors.
        The tensors are left on the host: ``map`` stores them in the dataset, and
        batches are moved to the device when the embeddings are computed.
    """
    tokenized = tokenizer(examples[task_config.col_names[0]],
                          add_special_tokens=True,
                          padding='max_length',
                          truncation=True,
                          max_length=max_len,
                          return_tensors='pt')
    return {f"{name}_{key}": value for key, value in tokenized.items()}

def tokenize_single(examples, name):
    """Tokenize both text columns of a batch separately (one sequence per sentence).

    Args:
        examples: batch passed in by ``data.map(..., batched=True)``; indexed by column name.
        name: embedding name used as a prefix for the output columns.

    Returns:
        dict with keys ``"<name>_<tokenizer key>_1"`` for the first column and
        ``"<name>_<tokenizer key>_2"`` for the second. The tensors are left on the
        host: ``map`` stores them in the dataset, and batches are moved to the
        device when the embeddings are computed.
    """
    tokenizer_options = dict(add_special_tokens=True,
                             padding='max_length',
                             truncation=True,
                             max_length=max_len,
                             return_tensors='pt')

    tokenized_output = {}
    # The suffix records which sentence a column belongs to (1 = first column, 2 = second).
    for suffix, column in enumerate(task_config.col_names[:2], start=1):
        tokenized = tokenizer(examples[column], **tokenizer_options)
        for key, value in tokenized.items():
            tokenized_output[f"{name}_{key}_{suffix}"] = value
    return tokenized_output

def tokenize_double(examples, name):
    """Tokenize both text columns of a batch together as one sentence-pair input.

    Args:
        examples: batch passed in by ``data.map(..., batched=True)``; indexed by column name.
        name: embedding name used as a prefix for the output columns.

    Returns:
        dict mapping ``"<name>_<tokenizer key>"`` to the tokenizer's output tensors.
        The tensors are left on the host: ``map`` stores them in the dataset, and
        batches are moved to the device when the embeddings are computed.
    """
    first_column, second_column = task_config.col_names[0], task_config.col_names[1]
    tokenized = tokenizer(examples[first_column],
                          examples[second_column],
                          add_special_tokens=True,
                          padding='max_length',
                          truncation=True,
                          max_length=max_len,
                          return_tensors='pt')
    return {f"{name}_{key}": value for key, value in tokenized.items()}

# Tokenize the dataset once per embedding that still has to be computed.
# Sentence embeddings are skipped here because they are not built from these tokenized columns.
inputs = []
for position, embedding in enumerate(embedding_param):
    if position not in embedding_tracker or "sentence" in embedding:
        continue

    # Pick the tokenizer variant from the embedding name.
    if "single" in embedding:
        tokenize_fn = tokenize_single
    elif "double" in embedding:
        tokenize_fn = tokenize_double
    else:
        tokenize_fn = tokenize

    tokenized_data = data.map(lambda examples: tokenize_fn(examples, name=embedding), batched=True)
    inputs.append(tokenized_data)

## Create Dataloader

In [11]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch

# Function to merge datasets
def clean_dataset(inputs, split):
    """Return one DataFrame of tokenized columns (plus ``idx``) for a dataset split.

    Raw text and label columns are dropped so that only ``idx`` and the tokenizer
    output columns remain. This includes ``'sentence'``, the text column of the
    single-sentence tasks (also used by qnli): left in place it would reach the
    integer tensor conversion downstream and be duplicated on merge.

    Args:
        inputs: a list of objects indexable by split name whose entries provide
            ``to_pandas()``, or a single such object.
        split: name of the split to extract (e.g. ``'train'``).

    Returns:
        pandas.DataFrame; when ``inputs`` is a list, the per-input frames are
        inner-joined on ``idx``.
    """
    text_and_label_columns = [
        'sentence', 'sentence1', 'sentence2', 'label',
        'question1', 'question2', 'premise', 'hypothesis', 'question',
    ]

    if isinstance(inputs, list):
        frames = [
            ds[split].to_pandas().drop(columns=text_and_label_columns, errors='ignore')
            for ds in inputs
        ]
        cleaned_df = frames[0]
        for frame in frames[1:]:
            cleaned_df = pd.merge(cleaned_df, frame, on='idx', how='inner')
    else:
        cleaned_df = inputs[split].to_pandas().drop(columns=text_and_label_columns, errors='ignore')

    return cleaned_df

class TransformerDataset(Dataset):
    """Dataset exposing each non-``idx`` DataFrame column as an int64 tensor.

    Every column is converted once at construction time and stored as an
    attribute named after the column; items are dicts keyed by column name.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # 'idx' is only a row identifier and is not fed to the model.
        self.columns = list(filter(lambda name: name != 'idx', dataframe.columns))
        for name in self.columns:
            # Go through a single NumPy array rather than a list of per-row values.
            stacked = np.array(dataframe[name].tolist())
            column_tensor = torch.tensor(stacked, dtype=torch.int64)
            setattr(self, name, column_tensor)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Look up the same row in every column tensor.
        item = {}
        for name in self.columns:
            item[name] = getattr(self, name)[idx]
        return item

# Define batch size
batch_size = 128

if create_dataloader:

    # Merge datasets for each split and convert to PyTorch datasets.
    # val_key/test_key are used instead of literal 'validation'/'test' because the
    # MNLI variants name their evaluation splits differently.
    train_df = clean_dataset(inputs, 'train')
    validation_df = clean_dataset(inputs, val_key)
    test_df = clean_dataset(inputs, test_key)

    # Create dataset instances
    train_dataset = TransformerDataset(train_df)
    validation_dataset = TransformerDataset(validation_df)
    test_dataset = TransformerDataset(test_df)

    # Create DataLoaders (no shuffling: the computed embeddings are later paired with the labels by position)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Compute Embeddings

In [12]:
def compute_embeddings(loader, X):
    """Compute cls/meanpool embeddings for every tokenized column pair in ``loader``.

    Each column whose name contains ``input_ids`` is paired with the matching
    ``attention_mask`` column, run through the global ``model`` batch by batch,
    and pooled according to the embedding type encoded in the column name.

    Args:
        loader: DataLoader yielding dicts of tensors keyed by column name.
        X: list with one slot per entry of ``embedding_param``; filled in place.
           For ``*_single`` embeddings the ``_2`` columns are stacked to the right
           of the ``_1`` result, so the ``_1`` columns must come first.

    Raises:
        ValueError: if a column name contains neither ``cls`` nor ``meanpool``.
    """
    column_names = list(next(iter(loader)).keys())

    # Creating pairs of input IDs and attention masks
    pairs = [
        (name, name.replace('input_ids', 'attention_mask'))
        for name in column_names
        if 'input_ids' in name
    ]

    # Loop over each pair of input ID and attention mask
    for input_id_name, mask_name in pairs:
        use_cls = "cls" in input_id_name
        if not use_cls and "meanpool" not in input_id_name:
            raise ValueError(f"Unknown embedding type for column {input_id_name!r}")

        tqdm.write(f"{input_id_name} {mask_name}")
        model.eval()  # inference only; no need to repeat this per batch

        batch_embeddings = []
        for batch in tqdm(loader):
            # Only the two tensors needed for this pair are moved to the device.
            input_ids = batch[input_id_name].to(device)
            attention_mask = batch[mask_name].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
                if use_cls:
                    embed = extract_cls_embeddings(outputs)
                else:
                    embed = mean_pooling(outputs, attention_mask)
            batch_embeddings.append(embed.cpu().numpy())

        computed = np.concatenate(batch_embeddings, axis=0)

        # The column prefix (everything before "_input_ids") is the embedding name.
        emb_idx = embedding_param.index(input_id_name.split("_input_ids")[0])
        if input_id_name.endswith("_2"):
            # Second sentence: append to the first sentence's embedding already stored.
            X[emb_idx] = np.hstack([X[emb_idx], computed])
        else:
            X[emb_idx] = computed

def compute_sentence(sentences):
    """Encode ``sentences`` with the global ``sentence_model`` in chunks of ``batch_size``.

    Returns the per-chunk results concatenated along axis 0.
    """
    chunk_starts = range(0, len(sentences), batch_size)
    encoded_chunks = [
        sentence_model.encode(sentences[start:start + batch_size])
        for start in chunk_starts
    ]
    return np.concatenate(encoded_chunks, axis=0)

# Start measuring the embedding step: CarbonTracker if requested, wall-clock time otherwise.
if use_carbontracker:
    print("Embedding: Using Carbontracker")
    tracker = CarbonTracker(epochs=1)
    tracker.epoch_start()
else:
    print("Embedding: Using default time tracking")
    # Track time
    start_time = time.time()

if create_dataloader: # Compute CLS/Meanpool embeddings
    compute_embeddings(train_loader, X_train)
    compute_embeddings(validation_loader, X_val)
    compute_embeddings(test_loader, X_test)

if create_sentence: # Compute Sentence embeddings
    # val_key/test_key are used instead of literal split names so MNLI works too.
    split_targets = ((X_train, "train"), (X_val, val_key), (X_test, test_key))

    # Only embeddings that were not found in the cache are computed, and each
    # missing sentence embedding is handled independently of the others.
    for emb_idx in embedding_tracker:
        embedding_name = embedding_param[emb_idx]

        if embedding_name == "sentence_single":
            first_column, second_column = task_config.col_names[0], task_config.col_names[1]
            for target, split_key in split_targets:
                U = compute_sentence(data[split_key][first_column])
                V = compute_sentence(data[split_key][second_column])
                target[emb_idx] = np.hstack([U, V])

        elif embedding_name == "sentence":
            for target, split_key in split_targets:
                target[emb_idx] = compute_sentence(data[split_key][task_config.col_names[0]])

# Stop measuring.
if use_carbontracker:
    tracker.epoch_end()
    embedding_time = 0.0 # temporary value for logging
else:
    embedding_time = time.time() - start_time

Embedding: Using default time tracking


## Save Embeddings to File

In [13]:
# Write every freshly computed embedding to the cache location checked earlier.
for tracked_idx in embedding_tracker:

    cache_path = pathlib.Path(f"./cache/{embedding_param[tracked_idx]}/{task_param}")
    cache_path.mkdir(parents=True, exist_ok=True)

    file_names = ['X_train', 'X_val', 'X_test']
    paths = [pathlib.Path(cache_path / f"{f}_{model_param}.npy") for f in file_names]

    # paths and splits are aligned: train, validation, test
    for path, split_embeddings in zip(paths, (X_train, X_val, X_test)):
        with open(path, 'wb') as embedding_file:
            np.save(embedding_file, split_embeddings[tracked_idx])

In [14]:
# Join the per-embedding matrices column-wise, in the order given by embedding_param.
X_train, X_val, X_test = (
    np.concatenate(per_embedding, axis=1)
    for per_embedding in (X_train, X_val, X_test)
)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape  : {X_val.shape}")
print(f"X_test shape : {X_test.shape}")

X_train shape: (8551, 768)
X_val shape  : (1043, 768)
X_test shape : (1063, 768)


## Transforming Embeddings

In [15]:
def add_embeddings(embeddings, column_ids, embedding_size, is_UV, is_diff, is_mult):
    '''
    Add derived embeddings at specific column_ids. For example, for the matrix
    [Z1, Z2, U1, V1, Z3, U2, V2], if we want to replace this with:
    [Z1, Z2, |U1 - V1|, Z3, U2, V2, |U2 - V2|, U2 * V2], we provide the parameters:
    is_UV = [False, True] : U1, V1 are NOT kept, but U2, V2 are kept
    is_diff = [True, True] : both |U1 - V1| and |U2 - V2| are added
    is_mult = [False, True] : U1 * V1 is not included, U2 * V2 is included

    If both is_diff and is_mult are False for a pair, nothing is added for it
    (and the pair is simply removed when is_UV is False as well).

    Args:
        embeddings : original matrix to replace; every block is embedding_size columns wide
        column_ids : location of Ux in units of blocks (we assume Vx immediately follows Ux),
                     for the above example, we would provide column_ids = [2, 5].
                     If you DO NOT want to replace a certain Ux, simply don't include its id in column_ids
        embedding_size : number of columns per block
        is_UV : per pair, should Ux, Vx be kept
        is_diff : per pair, should |Ux - Vx| (element-wise absolute difference) be included
        is_mult : per pair, should Ux * Vx (element-wise product) be included

    Returns:
        The transformed matrix (the input itself when column_ids is empty).
    '''

    id_delta = 0 # net number of blocks inserted/removed by the pairs processed so far
    for pair_idx, column_id in enumerate(column_ids):

        # Account for how the previous pair shifted everything to its right.
        if pair_idx > 0:
            previous = pair_idx - 1
            if not is_UV[previous]:
                id_delta -= 2
            if is_diff[previous]:
                id_delta += 1
            if is_mult[previous]:
                id_delta += 1

        start_id = (column_id + id_delta) * embedding_size
        U_id = start_id + embedding_size
        V_id = start_id + (2 * embedding_size)

        U = embeddings[:, start_id:U_id]
        V = embeddings[:, U_id:V_id]

        # Build exactly the blocks that were requested (possibly none).
        derived = []
        if is_diff[pair_idx]:
            derived.append(np.abs(U - V))
        if is_mult[pair_idx]:
            derived.append(U * V)

        # Keeping U, V means the new blocks go right after V; otherwise they
        # take the place of U, V.
        keep_until = V_id if is_UV[pair_idx] else start_id
        embeddings = np.hstack([
            embeddings[:, :keep_until],     # Part of original matrix before the new blocks
            *derived,                       # New embeddings to insert
            embeddings[:, V_id:],           # Part of original matrix after the pair
        ])
    return embeddings

In [16]:
# Show the selected embedding types; their order is the column order of the concatenated matrices.
print(embedding_param)

['meanpool']


In [17]:
######################
# Choose ids of (Ux, Vx) to alter embeddings
column_ids = []
######################

# With no ids selected the embeddings are used unchanged; otherwise the same
# add_embeddings transformation is applied to every split.
if not column_ids:
    X_train_computed, X_val_computed, X_test_computed = X_train, X_val, X_test
else:
    def f(embeddings):
        return add_embeddings(embeddings=embeddings, column_ids=column_ids, embedding_size=768,
                              is_UV=[False, False], is_diff=[True, False], is_mult=[False, False])

    X_train_computed, X_val_computed, X_test_computed = (
        f(embeddings=split) for split in (X_train, X_val, X_test)
    )

for computed in (X_train_computed, X_val_computed, X_test_computed):
    print(computed.shape)

(8551, 768)
(1043, 768)
(1063, 768)


# Training loop

In [18]:
# Width of the feature matrix fed to the feed-forward network.
input_size = X_train_computed.shape[1]

# Every key lists the candidate values for one hyperparameter; single-element
# lists are fixed settings. Key order is kept because it is reused for the CSV header.
param_grid = dict(
    num_epochs=[50],
    batch_size=[32, 512],
    learning_rate=[1e-2, 1e-4],
    category=[task_config.class_type],
    norm=[False],
    input_size=[input_size],
    layer_size=[768],
    num_classes=[task_config.num_classes],
    num_layers=[1, 3, 5],
    weight_decay=[1e-2, 1e-4],
    patience=[3],
    min_delta=[0],
    device=[device_name],
)

# 'input_size': [input_size],
# 'layer_size': [input_size // 4, input_size // 2, input_size, input_size * 2, input_size * 4],

# Create a list of all combinations of hyperparameters
all_params = [
    dict(zip(param_grid, combination))
    for combination in itertools.product(*param_grid.values())
]
print(f"{len(all_params)} hyperparameter combinations")

24 hyperparameter combinations


In [None]:
# Grid search: train one FeedForward model per hyperparameter combination and
# record validation metrics, timings and energy to a log file and a results CSV.

# Create output folder if it doesn't exist
os.makedirs('output', exist_ok=True)

# Setup for logging
# The embedding names are joined as a list; joining embedding_param[0] would put
# underscores between the individual characters of the first name.
console_output_filename = f'./output/{"_".join(embedding_param)}_{task_param}_console_output.txt'

with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')

# Setup for saving results
results_folder = pathlib.Path(f"results/{embedding_param[0]}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = results_folder / f"val_{save_file_id}_{model_param}.csv"

# different metrics are recorded for classification vs regression tasks
if task_config.class_type in ["BC", "MC"]:
    metric_types = ['mcc', 'f1', 'accuracy']
elif task_config.class_type == "R":
    metric_types = ['pearson', 'spearman']

with open(results_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = metric_types + ['training time', 'embedding time', 'training energy', 'embedding energy'] + list(all_params[0].keys())
    writer.writerow(header)
print(f"saving results to ./{results_file}")
# Saves best accuracy for progress bar display
display_best = float("-inf")

# Iterate over all combinations of hyperparameters
bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Formatting params to display (constant settings are left out of the log)
    print_params = {key: value for key, value in params.items() if key not in ('category', 'device')}

    # reset torch so that results are consistent
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    metrics, train_times_per_epoch, energy_per_epoch = feed_forward.fit(X_train_computed,
                                                                        Y_train,
                                                                        X_val_computed,
                                                                        Y_val,
                                                                        use_carbontracker)

    # Log average training time per epoch for current parameter set
    # Note: FFN frequently stops early
    training_time = np.mean(train_times_per_epoch)
    training_energy = np.mean(energy_per_epoch)
    # Compute energy for embedding generation
    embedding_energy = get_energy(embedding_time, device) # This method effectively just computes energy for a given time

    metric_vals = [metrics[mt] for mt in metric_types]

    # displaying results in progress bar
    display_recent = metrics["pearson" if task_config.class_type == "R" else "accuracy"]
    display_best = max(display_best, display_recent)
    bar.set_description(f"Best: {display_best:.5f}, Last: {display_recent:.5f}")

    # Write stats to log file (re-opened in append mode each run, so results are on disk after every iteration)
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        logfile.write(f"\nEarly stopped on epoch: {metrics['epoch']}")
        for name, val in zip(metric_types, metric_vals):
            logfile.write(f"\nValidation {name}: {val}")
        logfile.write(f"\nTraining time      : {training_time}")
        logfile.write(f"\nEmbedding time     : {embedding_time}")
        logfile.write(f"\nTraining energy    : {training_energy}")
        logfile.write(f"\nEmbedding energy   : {embedding_energy}")
    # Write to results csv
    with open(results_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row = metric_vals + [training_time, embedding_time, training_energy, embedding_energy] + list(params.values())
        writer.writerow(row)

saving results to ./results/meanpool/cola/val_20240105_190541_DistilRoBERTa.csv


  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Reload the results written by the training loop and pick out the best run.
results_df = pd.read_csv(results_file)
# results_df = pd.read_csv("output/val_results_cola_20231127_151717.csv")

# Regression tasks are ranked by pearson, classification tasks by accuracy.
if task_config.class_type == "R":
    print_metric = "pearson"
elif task_config.class_type in ["BC", "MC"]:
    print_metric = "accuracy"

best = results_df[print_metric].max()
is_best = results_df[print_metric] == best
best_row = results_df.loc[is_best]

results_df