# FAST: Feedforward-Augmented Sentence Transformers

Notebook for running GLUE tasks.

# Setup

## Modules

In [1]:
import random
random.seed(0)  # standardized default seed

import torch
from datasets import load_dataset

from collections import namedtuple

import numpy as np

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
from utils.mean_pooling import mean_pooling

import itertools
import pandas as pd

from pathlib import Path

import datetime
import csv
import tqdm

## Device

In [2]:
# Pick the best available accelerator.
# `device_name` is always kept as a plain string: it is reused later as the
# 'device' hyperparameter value, so it must have the same type on every platform.
if torch.cuda.is_available():
    device_name = "cuda"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU
else:
    device_name = "cpu"  # default device is CPU
device = torch.device(device_name)
print(device_name)

cpu


## User parameters

Parameters to set:
- Model
    - MPNetBase
    - DistilRoBERTaBase
    - MPNetST
    - DistilRoBERTaST
- Task
    - cola
    - sst2
    - mrpc
    - stsb
    - qqp
    - mnli-m
    - mnli-mm
    - qnli
    - rte
    - wnli
- Embedding type
    - cls (use with the `*Base` models)
    - mean_pooling (use with the `*Base` models)
    - sentence_transformer (use with the `*ST` models)

In [3]:
# Encoder to load; must be one of the "Model" options listed above.
model_param = "DistilRoBERTaBase"
# GLUE task key; used to look up `task_configs` and passed to `load_dataset`.
task_param = "cola"
# How sentence embeddings are produced: "cls", "mean_pooling" or "sentence_transformer".
embedding_param = "cls"

## Models

In [4]:
# Load the encoder selected by `model_param` onto `device`.
# The *Base models are Hugging Face transformers and come with a `tokenizer`;
# the *ST models are SentenceTransformer wrappers and define no tokenizer here.
BASE_MODELS = ("MPNetBase", "DistilRoBERTaBase")
ST_MODELS = ("MPNetST", "DistilRoBERTaST")

if model_param not in BASE_MODELS + ST_MODELS:
    raise ValueError(
        f"ERROR: Bad model_param {model_param!r}; expected one of {BASE_MODELS + ST_MODELS}"
    )

# cls / mean_pooling embeddings need `tokenizer` plus a raw transformer forward pass,
# while sentence_transformer embeddings need `model.encode`. Fail fast on a mismatched
# pair instead of hitting a NameError/AttributeError later in the embedding cells.
if (model_param in ST_MODELS) != (embedding_param == "sentence_transformer"):
    raise ValueError(
        f"model_param {model_param!r} cannot be combined with embedding_param "
        f"{embedding_param!r}: use 'sentence_transformer' with {ST_MODELS} and "
        f"'cls' or 'mean_pooling' with {BASE_MODELS}"
    )

# Reset so a tokenizer left over from a previous run of this cell cannot leak through.
tokenizer = None

if model_param == "MPNetBase": # MPNet Base
    from transformers import MPNetTokenizer, MPNetModel
    # NOTE(review): the tokenizer comes from the sentence-transformers checkpoint while the
    # weights come from microsoft/mpnet-base — confirm this pairing is intentional.
    tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)
elif model_param == "DistilRoBERTaBase": # DistilRoBERTa Base
    from transformers import RobertaTokenizer, RobertaModel
    tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
    model = RobertaModel.from_pretrained('distilroberta-base').to(device)
elif model_param == "MPNetST": # MPNet Sentence Transformer
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
elif model_param == "DistilRoBERTaST": # DistilRoBERTa Sentence Transformer
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)

## Tasks

In [5]:
# Per-task settings, keyed by the `task_param` value.
#   sentence_type: "one" or "two" (number of sentence columns per example)
#   class_type:    "binary", "multi" or "regression"
#   input_size:    int, input size of the feedforward network (i.e. the embedding size)
#   col_names:     names of the relevant sentence columns on Hugging Face
TaskConfig = namedtuple("TaskConfig", "sentence_type class_type input_size col_names")

task_configs = {
    "cola": TaskConfig(
        sentence_type="one", class_type="binary", input_size=768,
        col_names=["sentence"],
    ),
    "sst2": TaskConfig(
        sentence_type="one", class_type="binary", input_size=768,
        col_names=["sentence"],
    ),
    "mrpc": TaskConfig(
        sentence_type="two", class_type="binary", input_size=2 * 768,
        col_names=["sentence1", "sentence2"],
    ),
    "stsb": TaskConfig(
        sentence_type="two", class_type="regression", input_size=2 * 768,
        col_names=["sentence1", "sentence2"],
    ),
    "qqp": TaskConfig(
        sentence_type="two", class_type="binary", input_size=2 * 768,
        col_names=["question1", "question2"],
    ),
    "mnli-m": TaskConfig(
        sentence_type="two", class_type="multi", input_size=2 * 768,
        col_names=["premise", "hypothesis"],
    ),
    "mnli-mm": TaskConfig(
        sentence_type="two", class_type="multi", input_size=2 * 768,
        col_names=["premise", "hypothesis"],
    ),
    "qnli": TaskConfig(
        sentence_type="two", class_type="binary", input_size=2 * 768,
        col_names=["question", "sentence"],
    ),
    "rte": TaskConfig(
        sentence_type="two", class_type="binary", input_size=2 * 768,
        col_names=["sentence1", "sentence2"],
    ),
    "wnli": TaskConfig(
        sentence_type="two", class_type="binary", input_size=2 * 768,
        col_names=["sentence1", "sentence2"],
    ),
}

# Settings for the task chosen in the user-parameters cell.
task_config = task_configs[task_param]

## Dataset

In [6]:
# Load the GLUE task as a DatasetDict with "train", "validation" and "test" splits.
#
# GLUE on the Hugging Face Hub has no "mnli-m" / "mnli-mm" configs: both variants live in
# the "mnli" config, which has a single train split plus *_matched / *_mismatched
# validation and test splits. Map those onto the train/validation/test layout that the
# rest of this notebook indexes.
MNLI_SPLIT_SUFFIX = {"mnli-m": "matched", "mnli-mm": "mismatched"}

if task_param in MNLI_SPLIT_SUFFIX:
    from datasets import DatasetDict

    mnli = load_dataset("glue", "mnli")
    split_suffix = MNLI_SPLIT_SUFFIX[task_param]
    data = DatasetDict({
        "train": mnli["train"],
        "validation": mnli[f"validation_{split_suffix}"],
        "test": mnli[f"test_{split_suffix}"],
    })
else:
    data = load_dataset("glue", task_param)
data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

## Embedding

In [7]:
# Generate encodings
#
# After this cell:
#   - cls / mean_pooling:    X_* hold tokenizer output (padded, truncated, PyTorch tensors)
#   - sentence_transformer:  X_* hold numpy arrays returned by `model.encode`
#   - Y_* hold the labels of each split as numpy arrays

SPLITS = ("train", "validation", "test")

if embedding_param == "cls" or embedding_param == "mean_pooling":
    if task_config.sentence_type != "one":
        # Only the first sentence column is tokenized here, which cannot produce the
        # 2 * 768 inputs configured for sentence-pair tasks; stop with a clear message.
        raise NotImplementedError(
            f"embedding_param {embedding_param!r} currently supports single-sentence tasks "
            f"only; task {task_param!r} has sentence type {task_config.sentence_type!r}"
        )

    sentence_col = task_config.col_names[0]
    # list(...) gives the tokenizer a plain list of strings regardless of the container
    # type the installed `datasets` version returns for a column.
    X_train, X_val, X_test = (
        tokenizer(list(data[split][sentence_col]), padding=True, truncation=True, return_tensors='pt')
        for split in SPLITS
    )
elif embedding_param == "sentence_transformer":
    if task_config.sentence_type not in ("one", "two"):
        raise ValueError(f"{task_config.sentence_type}: sentence type not recognized")

    def encode_split(split):
        """Encode every sentence column of `split` and join the results feature-wise."""
        per_column = [model.encode(list(data[split][col])) for col in task_config.col_names]
        return np.concatenate(per_column, axis=1)

    # EXPENSIVE OPERATION: This cell may take about 2 minutes or more to run
    with torch.no_grad():
        X_train, X_val, X_test = (encode_split(split) for split in SPLITS)
else:
    # Without this branch a typo in embedding_param would leave X_* / Y_* undefined.
    raise ValueError(
        f"{embedding_param}: embedding type not recognized "
        "(expected 'cls', 'mean_pooling' or 'sentence_transformer')"
    )

# Labels have the same form for every embedding type.
Y_train, Y_val, Y_test = (np.array(data[split]["label"]) for split in SPLITS)

In [8]:
# Extract embeddings for CLS or mean pooling
# (nothing to do for sentence_transformer: X_* already hold embeddings)
if embedding_param == "cls" or embedding_param == "mean_pooling":

    def embed(encodings):
        """Run the encoder on tokenizer output and reduce it to one embedding per sentence.

        `encodings` is moved to `device` first so that it lives on the same device as
        `model`; the reduction is `extract_cls_embeddings` or `mean_pooling`, depending
        on `embedding_param`.
        """
        encodings = encodings.to(device)
        with torch.no_grad():
            outputs = model(**encodings)
        if embedding_param == "cls":
            return extract_cls_embeddings(outputs)
        return mean_pooling(outputs, encodings['attention_mask'])

    # EXPENSIVE OPERATION: This cell may take about 2 minutes or more to run
    # NOTE(review): each split goes through the model in a single forward pass; if memory
    # becomes a problem this is the place to introduce batching.
    X_train = embed(X_train)
    X_val = embed(X_val)
    X_test = embed(X_test)

**TODO:** Decide what belongs here.


In [None]:
# Save files
# Embeddings and labels are written to disk and immediately read back, so the rest of
# the notebook works on exactly what is stored under `output_directory`.
output_directory = f"./output/{embedding_param}"

Path(output_directory).mkdir(parents=True, exist_ok=True)

if embedding_param == "cls" or embedding_param == "mean_pooling":
    x_suffix = "npy"

    def save_file(file1, file2):
        """Write `file2` to the open binary file `file1` in .npy format."""
        # NOTE(review): assumes the embedding helpers return something np.save can
        # convert (e.g. an array or a CPU tensor) — confirm.
        np.save(file1, file2)

    def load_file(file1):
        """Return the array stored in the open .npy file `file1`."""
        return np.load(file1)

elif embedding_param == "sentence_transformer":
    x_suffix = "pt"

    def save_file(file1, file2):
        """Write the numpy embeddings `file2` to the open binary file `file1` as a tensor."""
        # torch.save takes (obj, f). Storing a tensor rather than the raw ndarray keeps
        # the file loadable when torch.load only accepts tensor data.
        torch.save(torch.as_tensor(file2), file1)

    def load_file(file1):
        """Return the embeddings stored in the open .pt file `file1` as a numpy array."""
        return torch.load(file1).numpy()

else:
    raise ValueError(
        f"{embedding_param}: embedding type not recognized "
        "(expected 'cls', 'mean_pooling' or 'sentence_transformer')"
    )


def artifact_path(name, suffix):
    """Path of the stored array `name` for the current task and model."""
    return f'{output_directory}/{name}_{task_param}_{model_param}.{suffix}'


# write
for name, value in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    with open(artifact_path(name, x_suffix), 'wb') as out_file:
        save_file(out_file, value)
for name, value in (("Y_train", Y_train), ("Y_val", Y_val), ("Y_test", Y_test)):
    with open(artifact_path(name, "npy"), 'wb') as out_file:
        np.save(out_file, value)

# read
with open(artifact_path("X_train", x_suffix), 'rb') as in_file:
    X_train = load_file(in_file)
with open(artifact_path("X_val", x_suffix), 'rb') as in_file:
    X_val = load_file(in_file)
with open(artifact_path("X_test", x_suffix), 'rb') as in_file:
    X_test = load_file(in_file)
with open(artifact_path("Y_train", "npy"), 'rb') as in_file:
    Y_train = np.load(in_file)
with open(artifact_path("Y_val", "npy"), 'rb') as in_file:
    Y_val = np.load(in_file)
with open(artifact_path("Y_test", "npy"), 'rb') as in_file:
    Y_test = np.load(in_file)

print(f"size of X_train: {len(X_train)}")

# Training loop

In [None]:
# Hyperparameter search space: each key maps to the list of values to try for it.
param_grid = dict(
    num_epochs=[50],
    batch_size=[32, 128, 512],
    learning_rate=[1e-2, 1e-3],
    category=['BC'],
    norm=[False],
    input_size=[task_config.input_size],
    layer_size=[task_config.input_size],
    num_layers=[1, 2, 3],
    weight_decay=[1e-2, 1e-3, 1e-4],
    patience=[3],
    min_delta=[0],
    device=[device_name],
)

# Expand the grid into one dict per point of the Cartesian product of all value lists.
grid_keys = list(param_grid)
all_params = []
for combo in itertools.product(*param_grid.values()):
    all_params.append(dict(zip(grid_keys, combo)))
print(f"{len(all_params)} hyperparameter combinations")

In [None]:
# setup for logging
# Make sure the output directory exists even if the save cell was not run.
Path('./output').mkdir(parents=True, exist_ok=True)

# `datetime` and `tqdm` are imported as modules at the top of the notebook, so the
# class / callable has to be reached through the module name.
save_file_id = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
console_output_filename = f'./output/{task_param}_console_output.txt'
with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')
results_filename = f'./output/val_results_{embedding_param}_{task_param}_{save_file_id}.csv'
with open(results_filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    headers = list(all_params[0].keys())
    writer.writerow(['mcc', 'f1', 'accuracy'] + headers)
print(f"saving results to {results_filename}")

# saves best accuracy for progress bar display
best_acc = 0
# Iterate over all combinations of hyperparameters
bar = tqdm.tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # formatting params to display: drop the entries that are the same for every run
    print_params = {
        key: value for key, value in params.items() if key not in ('category', 'device')
    }

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    # Train on the training split and collect the validation statistics that fit returns
    epoch, _val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(X_train, Y_train, X_val, Y_val)

    best_acc = max(best_acc, val_accuracy)
    bar.set_description(f"Best Acc: {best_acc:.5f}, Last test: {val_accuracy:.5f}")

    # Write stats to log file
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        logfile.write(f"\nEarly stopped on epoch: {epoch}")
        logfile.write(f"\nValidation accuracy: {val_accuracy}")
        logfile.write(f"\nValidation f1-score: {val_f1}")
        logfile.write(f"\nValidation MCC     : {val_mcc}")

    # Append one CSV row per hyperparameter combination (same column order as the header)
    with open(results_filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([val_mcc, val_f1, val_accuracy] + list(params.values()))

In [None]:
# Load the validation results written by the training loop and report the best row.
# To inspect an earlier run instead, point `results_path` at that run's CSV file.
results_path = results_filename
results_df = pd.read_csv(results_path)

metric = "mcc"
best = results_df[metric].max()
# Keep every row that ties for the best score
best_row = results_df[results_df[metric] == best]
print(f"Best {metric}: {best:.5f}")
print(best_row)

# Predictions

In [None]:
# manually set this based on output CSV file

best_params = {
    'num_epochs': 50,
    'batch_size': 128,
    'learning_rate': 1e-2,
    # NOTE(review): the search grid uses category 'BC' — confirm 'C' is intended here.
    'category': 'C',
    'norm': False,
    # Taken from the task config (as in the search grid) so sentence-pair tasks get 2 * 768.
    'input_size': task_config.input_size,
    # NOTE(review): the search grid only tries layer_size == input_size — confirm 6 is
    # intended and not a typo.
    'layer_size': 6,
    'num_layers': 3,
    'weight_decay':1e-2,
    'patience': 3,
    'min_delta': 0,
    'device': device_name
}

In [None]:
# Refit with the chosen hyperparameters on train + validation, then predict the test split.
best_feed_forward = FeedForward(**best_params)

X = np.concatenate((X_train, X_val), axis=0)
Y = np.concatenate((Y_train, Y_val), axis=0)

best_feed_forward.fit(X, Y)

# Predicted class = index of the highest predicted probability
preds = np.argmax(best_feed_forward.predict_proba(X_test), axis=1)
print(preds.shape)

df = pd.DataFrame({
    'index': range(len(preds)),
    'prediction': preds
})

# Draw the file id from a private, freshly seeded generator so that the notebook's
# global `random` state (seeded with 0 at the top) is left untouched.
random_file_id = str(round(random.Random().random() * 10000))
# Write the DataFrame to a .tsv file with a header row and without the DataFrame index
# NOTE(review): the 'CoLA_' prefix is fixed regardless of task_param — confirm before
# using this cell for another task.
df.to_csv(f'CoLA_{random_file_id}.tsv', sep='\t', index=False, header=True)