# FAST: Feedforward-Augmented Sentence Transformers

## Imports & Setup

In [None]:
# REQUIRED IMPORTS & SETUP

import os
import pickle
import numpy as np 
import pandas as pd 
import warnings
import itertools

import torch
from sentence_transformers import SentenceTransformer
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set Device ##########################################################
# Choose the best available backend. `device_name` is always kept as a plain
# string so that every consumer of it (torch.device below and any code that
# receives `device_name` directly) sees the same type on all hardware.
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    device_name = "cuda:0"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU
device = torch.device(device_name)
#######################################################################

# Load Models
Load the sentence-transformer model from Hugging Face and move it to the selected device (GPU if available, otherwise CPU).

In [None]:
# Instantiate the pretrained sentence encoder, then move it to the selected device.
mpnetv2 = SentenceTransformer("all-mpnet-base-v2")
mpnetv2 = mpnetv2.to(device)

## Load Data

In [None]:
# Fetch the CoLA configuration of the GLUE benchmark; the bare name on the
# last line makes the notebook display the loaded dataset.
data = load_dataset(path="glue", name="cola")
data

# Encode Sentence Embeddings

Generate sentence embeddings using the sentence-transformer model.

In [None]:
# Embed the sentences of each split with the sentence-transformer model.
X_train, X_val, X_test = (
    mpnetv2.encode(data[split]["sentence"])
    for split in ("train", "validation", "test")
)

# Collect the labels of each split as NumPy arrays.
Y_train, Y_val, Y_test = (
    np.array(data[split]["label"])
    for split in ("train", "validation", "test")
)

Save the encodings and labels to disk for reuse. Computing the embeddings takes significant time, but they do not change throughout training, so we can cache them.

In [None]:
# Cache the embeddings and labels under ./output.
# Create the directory first: open(..., 'wb') does not create missing parent
# directories and would otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs('./output', exist_ok=True)

# Embeddings are written with torch.save.
with open('./output/X_train.pt', 'wb') as X_train_file:
    torch.save(X_train, X_train_file)
with open('./output/X_val.pt', 'wb') as X_val_file:
    torch.save(X_val, X_val_file)
with open('./output/X_test.pt', 'wb') as X_test_file:
    torch.save(X_test, X_test_file)

# Labels are NumPy arrays and are written in .npy format.
with open('./output/Y_train.npy', 'wb') as Y_train_file:
    np.save(Y_train_file, Y_train)
with open('./output/Y_val.npy', 'wb') as Y_val_file:
    np.save(Y_val_file, Y_val)
with open('./output/Y_test.npy', 'wb') as Y_test_file:
    np.save(Y_test_file, Y_test)

Load saved encodings and labels from disk, if previously saved.

In [None]:
# Restore the cached embeddings and labels from ./output.
#
# The X_*.pt files contain whatever object was handed to torch.save
# (presumably the NumPy arrays returned by encode() -- confirm), not a tensor
# or state_dict. PyTorch >= 2.6 defaults torch.load to weights_only=True,
# which refuses such pickles, so full unpickling is requested explicitly.
# NOTE: full unpickling can execute arbitrary code -- only load cache files
# that you generated yourself.
with open('./output/X_train.pt', 'rb') as X_train_file:
    X_train = torch.load(X_train_file, weights_only=False)
with open('./output/X_val.pt', 'rb') as X_val_file:
    X_val = torch.load(X_val_file, weights_only=False)
with open('./output/X_test.pt', 'rb') as X_test_file:
    X_test = torch.load(X_test_file, weights_only=False)

# Labels were stored in .npy format.
with open('./output/Y_train.npy', 'rb') as Y_train_file:
    Y_train = np.load(Y_train_file)
with open('./output/Y_val.npy', 'rb') as Y_val_file:
    Y_val = np.load(Y_val_file)
with open('./output/Y_test.npy', 'rb') as Y_test_file:
    Y_test = np.load(Y_test_file)

## Define Hyperparameters
Define the hyperparameter grid for the grid search.

In [None]:
# Candidate values for every FeedForward constructor argument. Entries with a
# single value are held fixed; the others are swept by the grid search.
param_grid = {
    "num_epochs": [100],
    "batch_size": [32, 128, 512],
    "learning_rate": [1e-2, 1e-3, 1e-4, 1e-5],
    "category": ["C"],
    "norm": [False],
    "input_size": [768],
    "layer_size": [768],
    "num_layers": [1, 2, 3],
    "weight_decay": [1e-2, 1e-3, 1e-4, 1e-5],
    "patience": [3],
    "min_delta": [0],
    "device": [device_name],
}

# Expand the grid into one dict per combination (Cartesian product of the
# value lists, keys kept in declaration order).
all_params = [
    dict(zip(param_grid, combo))
    for combo in itertools.product(*param_grid.values())
]

# Running best, updated by the grid-search loop.
best_params, highest_val_accuracy = None, 0

# Training Loop for Hyperparameter Grid Search

In [None]:
# Grid search: train one model per hyperparameter combination and remember
# the combination with the highest validation accuracy.
for params in all_params:
    print("\nTraining with parameters:\n", params)

    # The grid keys match the FeedForward keyword arguments one-to-one,
    # so the dict can be unpacked directly.
    feed_forward = FeedForward(**params)

    epoch, val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(
        X_train, Y_train, X_val, Y_val
    )
    print("Early stopped on epoch:", epoch)
    print("Validation accuracy:", val_accuracy)
    print("Validation f1-score:", val_f1)
    print("Validation MCC     :", val_mcc)

    # Strict comparison: on a tie the earlier combination is kept.
    if val_accuracy > highest_val_accuracy:
        highest_val_accuracy, best_params = val_accuracy, params

# Report the winning configuration.
print("\nBest Parameters:", best_params)
print("Highest Validation Accuracy:", highest_val_accuracy)

# Evaluate Best Hyperparameters

In [None]:
# Rebuild the network from the winning configuration; the keys of
# `best_params` match the FeedForward keyword arguments one-to-one.
best_feed_forward = FeedForward(**best_params)

# Refit on the training and validation splits combined.
X = np.concatenate([X_train, X_val], axis=0)
Y = np.concatenate([Y_train, Y_val], axis=0)
best_feed_forward.fit(X, Y)

# Predicted class = index of the largest value along axis 1 of predict_proba's output.
test_scores = best_feed_forward.predict_proba(X_test)
preds = np.argmax(test_scores, axis=1)
print(preds.shape)

# One row per test example: running index plus predicted label.
df = pd.DataFrame({'index': range(len(preds)), 'prediction': preds})

# Write a tab-separated file with a header row but without the DataFrame index.
df.to_csv('CoLA.tsv', sep='\t', header=True, index=False)