# FAST: Feedforward-Augmented Sentence Transformers

## Imports & Setup

In [29]:
# REQUIRED IMPORTS & SETUP

import os
import pickle
import numpy as np 
import pandas as pd 
import warnings
import itertools

import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, MPNetModel, MPNetConfig
import torch
import torch.nn.functional as F

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set Device ##########################################################
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    device_name = "cuda:0"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = torch.device("mps")  # Metal Performance Shaders for Apple M-series GPU
device = torch.device(device_name)
#######################################################################

# Load Models
Load models from HuggingFace and send to GPU device

In [31]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = MPNetModel.from_pretrained("microsoft/mpnet-base")

NameError: name 'mpnetmodel' is not defined

## Load Data

In [12]:
data = load_dataset("glue", "cola")

X_train = data["train"]["sentence"]
X_val = data["validation"]["sentence"]
X_test = data["test"]["sentence"]

Y_train = data["train"]["label"]
Y_val = data["validation"]["label"]
Y_test = data["test"]["label"]

# Encode Sentence Embeddings

Generate vector encodings using tokenizer.

In [13]:
X_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
X_val = tokenizer(X_val, padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

Run vector encodings through model and extract CLS token from output.

In [14]:
# EXPENSIVE OPERATION: This cell may take about 2 minutes or more to run

with torch.no_grad():
    X_train = model(**X_train)
    X_val = model(**X_val)
    X_test = model(**X_test)

X_train = extract_cls_embeddings(X_train)
X_val = extract_cls_embeddings(X_val)
X_test = extract_cls_embeddings(X_test)

Save generated CLS tokens to disk.

In [18]:
with open('./output/cls/X_train.npy', 'wb') as X_train_file:
    np.save(X_train_file, X_train)
with open('./output/cls/X_val.npy', 'wb') as X_val_file:
    np.save(X_val_file, X_val)
with open('./output/cls/X_test.npy', 'wb') as X_test_file:
    np.save(X_test_file, X_test)
with open('./output/cls/Y_train.npy', 'wb') as Y_train_file:
    np.save(Y_train_file, Y_train)
with open('./output/cls/Y_val.npy', 'wb') as Y_val_file:
    np.save(Y_val_file, Y_val)
with open('./output/cls/Y_test.npy', 'wb') as Y_test_file:
    np.save(Y_test_file, Y_test)

Load generated CLS tokens from disk.

In [25]:
with open('./output/cls/X_train.npy', 'rb') as X_train_file:
    X_train = np.load(X_train_file)
with open('./output/cls/X_val.npy', 'rb') as X_val_file:
    X_val = np.load(X_val_file)
with open('./output/cls/X_test.npy', 'rb') as X_test_file:
    X_test = np.load(X_test_file)
with open('./output/cls/Y_train.npy', 'rb') as Y_train_file:
    Y_train = np.load(Y_train_file)
with open('./output/cls/Y_val.npy', 'rb') as Y_val_file:
    Y_val = np.load(Y_val_file)
with open('./output/cls/Y_test.npy', 'rb') as Y_test_file:
    Y_test = np.load(Y_test_file)

len(X_train)

8551

## Define Hyperparameters
Defining hyperparameter grid for grid search

In [26]:
param_grid = {
    'num_epochs': [100],
    'batch_size': [32, 128, 512],
    'learning_rate': [1e-2,1e-4],
    'category': ['C'],
    'norm': [False],
    'input_size': [768],
    'layer_size': [192, 384, 768, 1536],
    'num_layers': [1, 3, 5, 7, 9],
    'weight_decay':[1e-2, 1e-4],
    'patience': [3],
    'min_delta': [0],
    'device': [device_name]
}

# Create a list of all combinations of hyperparameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

best_params = None
highest_val_mcc = 0

# Training Loop for Hyperparameter Grid Search

In [27]:
# Iterate over all combinations of hyperparameters
for params in all_params:
    print("\nTraining with parameters:\n", params)
    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(num_epochs=params['num_epochs'],
                                batch_size=params['batch_size'],
                                learning_rate=params['learning_rate'],
                                category=params['category'],
                                norm=params['norm'],
                                input_size=params['input_size'],
                                layer_size=params['layer_size'],
                                num_layers=params['num_layers'],
                                weight_decay=params['weight_decay'],
                                patience=params['patience'],
                                min_delta=params['min_delta'],
                                device=params['device'])
    
    epoch, val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(X_train, Y_train, X_val, Y_val)
    print("Early stopped on epoch:", epoch)
    print("Validation accuracy:", val_accuracy)
    print("Validation f1-score:", val_f1)
    print("Validation MCC     :", val_mcc)
    print("Best MCC so far    :", highest_val_mcc)
    
    # Save the parameters if they provide a better accuracy
    if val_mcc > highest_val_mcc:
        highest_val_mcc = val_mcc
        best_params = params

# Print the best parameters
print("\nBest Parameters:", best_params)
print("Highest Validation MCC:", highest_val_mcc)


Training with parameters:
 {'num_epochs': 100, 'batch_size': 32, 'learning_rate': 0.01, 'category': 'C', 'norm': False, 'input_size': 768, 'layer_size': 192, 'num_layers': 1, 'weight_decay': 0.01, 'patience': 3, 'min_delta': 0, 'device': device(type='mps')}
Early stopped on epoch: 5
Validation accuracy: 0.7066155321188878
Validation f1-score: 0.8111111111111111
Validation MCC     : 0.21384020216125427
Best MCC so far    : 0

Training with parameters:
 {'num_epochs': 100, 'batch_size': 32, 'learning_rate': 0.01, 'category': 'C', 'norm': False, 'input_size': 768, 'layer_size': 192, 'num_layers': 1, 'weight_decay': 0.0001, 'patience': 3, 'min_delta': 0, 'device': device(type='mps')}
Early stopped on epoch: 4
Validation accuracy: 0.673058485139022
Validation f1-score: 0.7792880258899676
Validation MCC     : 0.16504566661063733
Best MCC so far    : 0.21384020216125427

Training with parameters:
 {'num_epochs': 100, 'batch_size': 32, 'learning_rate': 0.01, 'category': 'C', 'norm': False, 'i

KeyboardInterrupt: 

# Evaluate Best Hyperparameters

In [None]:
best_feed_forward = FeedForward(num_epochs=best_params['num_epochs'],
                                batch_size=best_params['batch_size'],
                                learning_rate=best_params['learning_rate'],
                                category=best_params['category'],
                                norm=best_params['norm'],
                                input_size=best_params['input_size'],
                                layer_size=best_params['layer_size'],
                                num_layers=best_params['num_layers'],
                                weight_decay=best_params['weight_decay'],
                                patience=best_params['patience'],
                                min_delta=best_params['min_delta'],
                                device=best_params['device'])

X = np.concatenate((X_train, X_val), axis=0)
Y = np.concatenate((Y_train, Y_val), axis=0)

best_feed_forward.fit(X, Y)

preds = np.argmax(best_feed_forward.predict_proba(X_test), axis=1)
print(preds.shape)


df = pd.DataFrame({
    'index': range(len(preds)),
    'prediction': preds
})

# Write the DataFrame to a .tsv file, without the header and index
df.to_csv('CoLA.tsv', sep='\t', index=False, header=True)