# FAST: Feedforward-Augmented Sentence Transformers

# IMPORTS

In [1]:
# REQUIRED IMPORTS & SETUP

import os
import pickle
import numpy as np 
import pandas as pd 
import warnings
import itertools
import csv
import random
random.seed(0)  # standardized default seed

import torch
from datasets import Dataset, load_dataset
import torch
import torch.nn.functional as F

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set Device ##########################################################
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    device_name = "cuda:0"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = torch.device("mps")  # Metal Performance Shaders for Apple M-series GPU
device = torch.device(device_name)
print(device_name)
#######################################################################

cuda:0


# LOAD MODELS & DATA
Load models from HuggingFace and send to GPU device. Below are some options for models.

## Base Models

### MPNet Base

In [None]:
from transformers import MPNetTokenizer, MPNetModel
tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)

### Distil RoBERTa Base

In [None]:
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base').to(device)

## Sentence Transformers

### MPNet Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

### Distil RoBERTa Sentence Transformer

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)

## Load Dataset

In [3]:
data = load_dataset("glue", "cola")
data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

# GENERATE EMBEDDINGS

## Generate Embeddings for CLS

Generate vector encodings using tokenizer.

In [9]:
X_train = data["train"]["sentence"]
X_val = data["validation"]["sentence"]
X_test = data["test"]["sentence"]

Y_train = data["train"]["label"]
Y_val = data["validation"]["label"]
Y_test = data["test"]["label"]

X_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
X_val = tokenizer(X_val, padding=True, truncation=True, return_tensors='pt')
X_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

Run vector encodings through model and extract CLS token from output.

In [12]:
# EXPENSIVE OPERATION: This cell may take about 2 minutes or more to run

with torch.no_grad():
    X_train = model(**X_train)
    X_val = model(**X_val)
    X_test = model(**X_test)

X_train = extract_cls_embeddings(X_train)
X_val = extract_cls_embeddings(X_val)
X_test = extract_cls_embeddings(X_test)

TypeError: forward() got an unexpected keyword argument 'last_hidden_state'

Save generated CLS tokens to disk.

In [55]:
model = "distilroberta"  # rename to model name
dataset = "cola"         # rename to dataset name

from pathlib import Path
Path("./output/cls").mkdir(parents=True, exist_ok=True)

with open(f'./output/cls/X_train_{dataset}_{model}.npy', 'wb') as X_train_file:
    np.save(X_train_file, X_train)
with open(f'./output/cls/X_val_{dataset}_{model}.npy', 'wb') as X_val_file:
    np.save(X_val_file, X_val)
with open(f'./output/cls/X_test_{dataset}_{model}.npy', 'wb') as X_test_file:
    np.save(X_test_file, X_test)
with open(f'./output/cls/Y_train_{dataset}_{model}.npy', 'wb') as Y_train_file:
    np.save(Y_train_file, Y_train)
with open(f'./output/cls/Y_val_{dataset}_{model}.npy', 'wb') as Y_val_file:
    np.save(Y_val_file, Y_val)
with open(f'./output/cls/Y_test_{dataset}_{model}.npy', 'wb') as Y_test_file:
    np.save(Y_test_file, Y_test)

Load generated CLS tokens from disk.

In [52]:
model = "distilroberta"  # rename to model name
dataset = "cola"         # rename to dataset name

with open(f'./output/cls/X_train_{dataset}_{model}.npy', 'rb') as X_train_file:
    X_train = np.load(X_train_file)
with open(f'./output/cls/X_val_{dataset}_{model}.npy', 'rb') as X_val_file:
    X_val = np.load(X_val_file)
with open(f'./output/cls/X_test_{dataset}_{model}.npy', 'rb') as X_test_file:
    X_test = np.load(X_test_file)
with open(f'./output/cls/Y_train_{dataset}_{model}.npy', 'rb') as Y_train_file:
    Y_train = np.load(Y_train_file)
with open(f'./output/cls/Y_val_{dataset}_{model}.npy', 'rb') as Y_val_file:
    Y_val = np.load(Y_val_file)
with open(f'./output/cls/Y_test_{dataset}_{model}.npy', 'rb') as Y_test_file:
    Y_test = np.load(Y_test_file)

print(f"size of X_train: {len(X_train)}")

size of X_train: 8551


## Mean Pooled Embeddings Example

In [7]:
import sys
sys.path.append("..")
from utils import mean_pooling
sentences = ['sentence@!!!!', 'yes']
# tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
# model = RobertaModel.from_pretrained('distilroberta-base')
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = model(**encoded_input)

# Pooling
sentence_embeddings = mean_pooling.mean_pooling(model_output, encoded_input['attention_mask'])

print("Embeddings:", sentence_embeddings)


Note: you may need to restart the kernel to use updated packages.
Embeddings: [[-0.00309106 -0.00176029 -0.00381204 ... -0.01621991 -0.00107355
   0.00249623]
 [ 0.00124463  0.00778843 -0.00230429 ... -0.00839902 -0.00316125
  -0.00278425]]


## Generate Embeddings for Sentence Transformers

Generate sentence embeddings using sentence transformer model.

In [4]:
# EXPENSIVE OPERATION: This cell may take about 2 minutes or more to run

with torch.no_grad():
    X_train = model.encode(data["train"]["sentence"])
    X_val = model.encode(data["validation"]["sentence"])
    X_test = model.encode(data["test"]["sentence"])
    
    Y_train = np.array(data["train"]["label"])
    Y_val = np.array(data["validation"]["label"])
    Y_test = np.array(data["test"]["label"])

Save encodings and labels to disk for reuse. This is done because encoding embeddings takes a significant time but the encodings do not change throughout training, so we can cache it.

In [5]:
model = "distilroberta"  # rename to model name
dataset = "cola"         # rename to dataset name

from pathlib import Path
Path("./output/fast").mkdir(parents=True, exist_ok=True)

with open(f'./output/fast/X_train_{dataset}_{model}.pt', 'wb') as X_train_file:
    torch.save(X_train, X_train_file)
with open(f'./output/fast/X_val_{dataset}_{model}.pt', 'wb') as X_val_file:
    torch.save(X_val, X_val_file)
with open(f'./output/fast/X_test_{dataset}_{model}.pt', 'wb') as X_test_file:
    torch.save(X_test, X_test_file)
with open(f'./output/fast/Y_train_{dataset}_{model}.npy', 'wb') as Y_train_file:
    np.save(Y_train_file, Y_train)
with open(f'./output/fast/Y_val_{dataset}_{model}.npy', 'wb') as Y_val_file:
    np.save(Y_val_file, Y_val)
with open(f'./output/fast/Y_test_{dataset}_{model}.npy', 'wb') as Y_test_file:
    np.save(Y_test_file, Y_test)

Load saved encodings and labels from disk, if previously saved.

In [6]:
model = "distilroberta"  # rename to model name
dataset = "cola"         # rename to dataset name

with open(f'./output/fast/X_train_{dataset}_{model}.pt', 'rb') as X_train_file:
    X_train = torch.load(X_train_file)
with open(f'./output/fast/X_val_{dataset}_{model}.pt', 'rb') as X_val_file:
    X_val = torch.load(X_val_file)
with open(f'./output/fast/X_test_{dataset}_{model}.pt', 'rb') as X_test_file:
    X_test = torch.load(X_test_file)
with open(f'./output/fast/Y_train_{dataset}_{model}.npy', 'rb') as Y_train_file:
    Y_train = np.load(Y_train_file)
with open(f'./output/fast/Y_val_{dataset}_{model}.npy', 'rb') as Y_val_file:
    Y_val = np.load(Y_val_file)
with open(f'./output/fast/Y_test_{dataset}_{model}.npy', 'rb') as Y_test_file:
    Y_test = np.load(Y_test_file)

print(f"size of X_train: {len(X_train)}")

size of X_train: 8551


# TRAINING LOOP

## Define Hyperparameters
Defining hyperparameter grid for grid search

In [8]:
param_grid = {
    'num_epochs': [50],
    'batch_size': [32, 128, 512],
    'learning_rate': [1e-2, 1e-3],
    'category': ['C'],
    'norm': [False],
    'input_size': [768],
    'layer_size': [6, 7, 8],
    'num_layers': [1, 2, 3],
    'weight_decay':[1e-2, 1e-3, 1e-4],
    'patience': [3],
    'min_delta': [0],
    'device': [device_name]
}

# Create a list of all combinations of hyperparameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
print(f"{len(all_params)} hyperparameter combinations")

162 hyperparameter combinations


In [None]:
# parameters sorted by accuracy metric
params_sorted = []

random.seed()  # set random seed based on current time just to generate random file_id
random_file_id = str(round(random.random() * 10000))
random.seed(0)  # reset random seed back to standard 0 seed

with open('./output/cls_console_output.txt', 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')
with open(f'./output/val_results_cls_{random_file_id}.csv', 'w', newline='') as csvfile:
    print(f'CSV filename: val_results_cls_{random_file_id}.csv')
    writer = csv.writer(csvfile)
    headers = list(all_params[0].keys())
    writer.writerow(['mcc', 'f1', 'accuracy'] + headers)

# Iterate over all combinations of hyperparameters
for i, params in enumerate(all_params):
    
    print_params = params.copy()
    for param in ['category', 'device']:
        del print_params[param]
    print(f"\nLoop {i + 1} / {len(all_params)} | {round(i / len(all_params) * 100, 2)} %:")
    print(print_params)
    
    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    # Print stats to console
    epoch, val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(X_train, Y_train, X_val, Y_val)
    print(f"Early stopped on epoch: {epoch}")
    print(f"Validation accuracy: {val_accuracy}", )
    print(f"Validation f1-score: {val_f1}")
    print(f"Validation MCC     : {val_mcc}")

    # Write stats to log file
    with open('./output/cls_console_output.txt', 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        logfile.write(f"\nEarly stopped on epoch: {epoch}")
        logfile.write(f"\nValidation accuracy: {val_accuracy}")
        logfile.write(f"\nValidation f1-score: {val_f1}")
        logfile.write(f"\nValidation MCC     : {val_mcc}")

    with open(f'./output/val_results_cls_{random_file_id}.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([val_mcc, val_f1, val_accuracy] + list(params.values()))

# PREDICT TEST SET

In [None]:
# manually set this based on output CSV file

best_params = {
    'num_epochs': 50,
    'batch_size': 128,
    'learning_rate': 1e-2,
    'category': 'C',
    'norm': False,
    'input_size': 768,
    'layer_size': 6,
    'num_layers': 3,
    'weight_decay':1e-2,
    'patience': 3,
    'min_delta': 0,
    'device': device_name
}

In [None]:
best_feed_forward = FeedForward(**best_params)

X = np.concatenate((X_train, X_val), axis=0)
Y = np.concatenate((Y_train, Y_val), axis=0)

best_feed_forward.fit(X, Y)

preds = np.argmax(best_feed_forward.predict_proba(X_test), axis=1)
print(preds.shape)


df = pd.DataFrame({
    'index': range(len(preds)),
    'prediction': preds
})

random.seed()  # set random seed based on current time just to generate random file_id
random_file_id = str(round(random.random() * 10000))
random.seed(0)  # reset random seed back to standard 0 seed
# Write the DataFrame to a .tsv file, without the header and index
df.to_csv(f'CoLA_{random_file_id}.tsv', sep='\t', index=False, header=True)