# FAST: Feedforward-Augmented Sentence Transformers

Notebook for running GLUE tasks.

# Setup

## Modules

In [2]:
import csv
import itertools
import pathlib
import random
from collections import namedtuple
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import DatasetDict, load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
from utils.mean_pooling import mean_pooling

In [3]:
# standardized default seed, applied to every RNG this notebook touches
# (Python's random, NumPy, and torch on CPU and on all CUDA devices)
seed = 7
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

## Device

In [4]:
# Pick the best available accelerator, falling back to the CPU.
# `device_name` is always a plain string: it is printed here and reused later as
# the 'device' hyperparameter, so every branch must assign the same type.
# `device` is the torch.device built from that string.
device_name = "cpu"  # default device is CPU
# torch builds without MPS support do not expose torch.backends.mps at all
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device_name = "cuda"  # CUDA for NVIDIA GPU
elif mps_backend is not None and mps_backend.is_available():
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU
device = torch.device(device_name)
print(device_name)

cuda


## User parameters

Parameters to set:
- Model
    - MPNetBase
    - DistilRoBERTaBase
    - MPNetST
    - DistilRoBERTaST
- Task
    - cola
    - sst2
    - mrpc
    - stsb
    - qqp
    - mnli-m
    - mnli-mm
    - qnli
    - rte
    - wnli
- Embedding type
    - cls
    - mean_pooling
    - sentence

In [30]:
# Selections for this run; each value must be one of the options listed above.
model_param, task_param, embedding_param = (
    "DistilRoBERTaBase",
    "cola",
    "mean_pooling",
)

## Models

In [31]:
# Load the encoder selected by `model_param` and move it to `device`.
# The transformers / sentence_transformers imports stay inside the branches so
# that only the library needed for the chosen model has to be importable.
if model_param == "MPNetBase":  # MPNet Base
    from transformers import MPNetTokenizer, MPNetModel
    # NOTE(review): the tokenizer is loaded from the sentence-transformers
    # checkpoint while the weights come from microsoft/mpnet-base -- confirm
    # this pairing is intentional.
    tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)
elif model_param == "DistilRoBERTaBase":  # DistilRoBERTa Base
    from transformers import RobertaTokenizer, RobertaModel
    tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
    model = RobertaModel.from_pretrained('distilroberta-base').to(device)
elif model_param == "MPNetST":  # MPNet Sentence Transformer
    from sentence_transformers import SentenceTransformer
    # NOTE(review): no `tokenizer` is assigned for the Sentence Transformer
    # models, yet the GLUE dataset classes in this notebook are built with
    # `tokenizer` -- the ST path looks incomplete; verify before relying on it.
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
elif model_param == "DistilRoBERTaST":  # DistilRoBERTa Sentence Transformer
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)
else:
    # Name the offending value so a typo in the parameter cell is obvious.
    raise ValueError(f"ERROR: Bad model_param: {model_param!r}")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tasks

In [32]:
# Per-task settings, looked up by the GLUE task name chosen in `task_param`.
#   sentence_type: "one" or "two" (number of text columns fed to the encoder)
#   class_type:    "binary", "multi" or "regression"
#   input_size:    int, input size of the feedforward (aka embedding size);
#                  doubled for two-sentence tasks
#   col_names:     column names of the relevant sentences on Hugging Face

TaskConfig = namedtuple("TaskConfig", ["sentence_type", "class_type", "input_size", "col_names"])

# One row per task: (task name, sentence_type, class_type, input_size, col_names)
_task_rows = [
    ("cola", "one", "binary", 768, ['sentence']),
    ("sst2", "one", "binary", 768, ['sentence']),
    ("mrpc", "two", "binary", 768 * 2, ['sentence1', 'sentence2']),
    ("stsb", "two", "regression", 768 * 2, ['sentence1', 'sentence2']),
    ("qqp", "two", "binary", 768 * 2, ['question1', 'question2']),
    ("mnli-m", "two", "multi", 768 * 2, ['premise', 'hypothesis']),
    ("mnli-mm", "two", "multi", 768 * 2, ['premise', 'hypothesis']),
    ("qnli", "two", "binary", 768 * 2, ['question', 'sentence']),
    ("rte", "two", "binary", 768 * 2, ['sentence1', 'sentence2']),
    ("wnli", "two", "binary", 768 * 2, ['sentence1', 'sentence2']),
]
task_configs = {task_name: TaskConfig(*fields) for task_name, *fields in _task_rows}

# Configuration of the task selected for this run
task_config = task_configs[task_param]

## Dataset

In [33]:
# The notebook's "mnli-m" / "mnli-mm" task names are not GLUE config names on the
# Hugging Face hub: both belong to the single "mnli" config, whose evaluation
# splits carry a "_matched" / "_mismatched" suffix instead of plain
# "validation" / "test".
mnli_split_suffix = {"mnli-m": "matched", "mnli-mm": "mismatched"}.get(task_param)

if mnli_split_suffix is None:
    data = load_dataset("glue", task_param)
else:
    mnli_data = load_dataset("glue", "mnli")
    # Re-key the splits so the rest of the notebook can keep indexing
    # data["train"], data["validation"] and data["test"].
    data = DatasetDict({
        "train": mnli_data["train"],
        "validation": mnli_data[f"validation_{mnli_split_suffix}"],
        "test": mnli_data[f"test_{mnli_split_suffix}"],
    })
data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

# Embeddings

Labels come directly from the dataset, so there is no need to save them to file.

In [34]:
# Label arrays for the three splits, read straight from the loaded dataset
Y_train, Y_val, Y_test = (
    np.array(data[split_name]["label"])
    for split_name in ("train", "validation", "test")
)

## Check for saved embeddings

In [35]:
# Embeddings are cached under a directory per embedding type and task,
# with one .npy file per split, tagged with the model name.
cache_dir = f"./cache/{embedding_param}/{task_param}"
cache_path = pathlib.Path(cache_dir)
cache_path.mkdir(parents=True, exist_ok=True)

file_names = ['X_train', 'X_val', 'X_test']
paths = [cache_path / f"{stem}_{model_param}.npy" for stem in file_names]

# Only load from the cache when all three split files are present.
if not all(split_file.exists() for split_file in paths):
    print("no saved embeddings found")
else:
    print("saved embeddings found!")
    train_file, val_file, test_file = paths
    X_train = np.load(train_file)
    X_val = np.load(val_file)
    X_test = np.load(test_file)

no saved embeddings found


## Generate embeddings

In [36]:
class GLUESingleSentence(Dataset):
    """Dataset that tokenizes one sentence per example on access.

    Args:
        texts: indexable collection of sentences.
        tokenizer: callable tokenizer (the notebook passes a Hugging Face
            tokenizer) that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: length every example is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and return ``(input_ids, attention_mask)``."""
        # Call the tokenizer directly: `encode_plus` is documented as deprecated
        # in favour of `__call__`, which takes the same arguments.
        inputs = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            # Fixed-length padding keeps every example the same size.
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop the leading dimension (presumably the size-1 batch axis added by
        # return_tensors='pt' -- TODO confirm).
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)
    
class GLUEPairedSentence(Dataset):
    """Dataset that tokenizes a pair of sentences per example on access.

    The two sentences are tokenized independently (not as one joint sequence),
    each padded or truncated to ``max_length``.

    Args:
        texts1: indexable collection with the first sentence of each pair.
        texts2: indexable collection with the second sentence of each pair;
            assumed to be aligned with ``texts1``.
        tokenizer: callable tokenizer (the notebook passes a Hugging Face
            tokenizer) that accepts the keyword arguments used in ``_encode``
            and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: length every sentence is padded or truncated to.
    """

    def __init__(self, texts1, texts2, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts1 = texts1
        self.texts2 = texts2
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs (the length of ``texts1``)."""
        return len(self.texts1)

    def _encode(self, text):
        """Tokenize one sentence and return ``(input_ids, attention_mask)``."""
        # Call the tokenizer directly: `encode_plus` is documented as deprecated
        # in favour of `__call__`, which takes the same arguments.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # Fixed-length padding keeps every example the same size.
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop the leading dimension (presumably the size-1 batch axis added by
        # return_tensors='pt' -- TODO confirm).
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return ``(input_ids1, attention_mask1, input_ids2, attention_mask2)`` for pair ``idx``."""
        input_ids1, attention_mask1 = self._encode(self.texts1[idx])
        input_ids2, attention_mask2 = self._encode(self.texts2[idx])
        return input_ids1, attention_mask1, input_ids2, attention_mask2

In [37]:
# Build one tokenizing dataset per split. The text columns are taken from
# task_config.col_names in both branches, so the task table is the single place
# that names the dataset columns.
split_names = ("train", "validation", "test")
if task_config.sentence_type == "one":
    key1 = task_config.col_names[0]
    train_dataset, val_dataset, test_dataset = (
        GLUESingleSentence(data[split][key1], tokenizer) for split in split_names
    )
elif task_config.sentence_type == "two":
    key1 = task_config.col_names[0]
    key2 = task_config.col_names[1]
    train_dataset, val_dataset, test_dataset = (
        GLUEPairedSentence(data[split][key1], data[split][key2], tokenizer)
        for split in split_names
    )
else:
    raise ValueError(f"{task_config.sentence_type}: sentence type not recognized")

# pick batch size based on GPU memory
batch_size = 512
# shuffle=False keeps the embedding rows in dataset order, matching the label arrays
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [38]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over unmasked positions, then L2-normalize.

    NOTE(review): this definition shadows the `mean_pooling` imported from
    `utils.mean_pooling` at the top of the notebook -- confirm which is intended.

    Args:
        model_output: sequence whose first element holds the token embeddings.
        attention_mask: mask with one entry per token; positions with 0 are
            excluded from the average.

    Returns:
        Tensor of pooled embeddings, normalized to unit L2 norm along dim 1.
    """
    hidden = model_output[0]
    # Broadcast the mask across the embedding dimension so it can zero out tokens.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * mask, 1)
    # Clamp the token count so an all-zero mask does not divide by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = summed / token_counts
    return torch.nn.functional.normalize(pooled, p=2, dim=1)

def extract_cls(model_output):
    """Return the embedding at sequence position 0 for every example.

    Args:
        model_output: mapping with a 'last_hidden_state' tensor, indexed here
            as (example, position, feature).

    Returns:
        The slice of 'last_hidden_state' at position 0 along the second axis.
    """
    return model_output['last_hidden_state'][:, 0, :]

def compute_single_embeddings(loader):
    """Embed every example produced by a single-sentence loader.

    Uses the notebook-level `model`, `device` and `embedding_param`.

    Args:
        loader: iterable yielding ``(input_ids, attention_mask)`` batches.

    Returns:
        NumPy array with the per-batch embeddings concatenated along axis 0.

    Raises:
        ValueError: if `embedding_param` is neither "cls" nor "mean_pooling".
    """
    # Fail before any forward pass: without this check an unsupported embedding
    # type leaves `embed` unassigned inside the loop.
    if embedding_param not in ("cls", "mean_pooling"):
        raise ValueError(f"{embedding_param}: embedding type not supported here")

    embeddings = []
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            if embedding_param == "cls":
                embed = extract_cls(outputs)
            else:  # "mean_pooling"
                embed = mean_pooling(outputs, attention_mask)

            # Move each batch to host memory so results do not pile up on the device.
            embeddings.append(embed.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

def compute_pair_embeddings(loader):
    """Embed every sentence pair produced by a paired-sentence loader.

    Each sentence of a pair is encoded separately and the two embeddings are
    concatenated along dim 1. Uses the notebook-level `model`, `device` and
    `embedding_param`.

    Args:
        loader: iterable yielding
            ``(input_ids1, attention_mask1, input_ids2, attention_mask2)`` batches.

    Returns:
        NumPy array with the per-batch pair embeddings concatenated along axis 0.

    Raises:
        ValueError: if `embedding_param` is neither "cls" nor "mean_pooling".
    """
    # Fail before any forward pass: without this check an unsupported embedding
    # type leaves `U` and `V` unassigned inside the loop.
    if embedding_param not in ("cls", "mean_pooling"):
        raise ValueError(f"{embedding_param}: embedding type not supported here")

    embeddings = []
    model.eval()
    with torch.no_grad():
        for input_ids1, attention_mask1, input_ids2, attention_mask2 in tqdm(loader):
            input_ids1 = input_ids1.to(device)
            attention_mask1 = attention_mask1.to(device)
            input_ids2 = input_ids2.to(device)
            attention_mask2 = attention_mask2.to(device)

            outputs1 = model(input_ids1, attention_mask=attention_mask1)
            outputs2 = model(input_ids2, attention_mask=attention_mask2)

            if embedding_param == "cls":
                U = extract_cls(outputs1)
                V = extract_cls(outputs2)
            else:  # "mean_pooling"
                U = mean_pooling(outputs1, attention_mask1)
                V = mean_pooling(outputs2, attention_mask2)

            # Side-by-side concatenation: first sentence features, then second.
            embed = torch.cat([U, V], dim=1)
            embeddings.append(embed.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

# Reuse the arrays loaded by the cache check when every cache file exists;
# otherwise run the encoder over each split. Without this guard the cached
# embeddings are loaded and then immediately recomputed and overwritten.
if all(path.exists() for path in paths):
    train_embed, val_embed, test_embed = X_train, X_val, X_test
elif task_config.sentence_type == "one":
    train_embed = compute_single_embeddings(train_loader)
    val_embed = compute_single_embeddings(val_loader)
    test_embed = compute_single_embeddings(test_loader)
elif task_config.sentence_type == "two":
    train_embed = compute_pair_embeddings(train_loader)
    val_embed = compute_pair_embeddings(val_loader)
    test_embed = compute_pair_embeddings(test_loader)

print(train_embed.shape)
print(val_embed.shape)
print(test_embed.shape)

100%|██████████| 17/17 [00:34<00:00,  2.03s/it]
100%|██████████| 3/3 [00:04<00:00,  1.42s/it]
100%|██████████| 3/3 [00:04<00:00,  1.45s/it]

(8551, 768)
(1043, 768)
(1063, 768)





In [41]:
# The freshly computed embeddings become the feature matrices used for training
X_train, X_val, X_test = train_embed, val_embed, test_embed

## Saving embeddings to file

In [40]:
# Write each split's feature matrix to its .npy file in the cache directory
cache_path.mkdir(parents=True, exist_ok=True)

file_names = ['X_train', 'X_val', 'X_test']
paths = [cache_path / f"{stem}_{model_param}.npy" for stem in file_names]

for out_path, split_features in zip(paths, (X_train, X_val, X_test)):
    with open(out_path, 'wb') as out_file:
        np.save(out_file, split_features)

# Training loop

In [42]:
# Hyperparameter search space: every key maps to the list of values to try.
# Key order matters: it fixes both the order of the generated combinations
# and the column order of the results file written by the training loop.
param_grid = dict(
    num_epochs=[50],
    batch_size=[32, 128, 512],
    learning_rate=[1e-2, 1e-3],
    category=['BC'],
    norm=[False],
    input_size=[task_config.input_size],
    layer_size=[task_config.input_size],
    num_layers=[1, 2, 3],
    weight_decay=[1e-2, 1e-3, 1e-4],
    patience=[3],
    min_delta=[0],
    device=[device_name],
)

# Cartesian product of the value lists, one dict per combination
grid_keys = list(param_grid)
all_params = [
    dict(zip(grid_keys, combo))
    for combo in itertools.product(*param_grid.values())
]
print(f"{len(all_params)} hyperparameter combinations")

54 hyperparameter combinations


In [None]:
# Grid search: fit one FeedForward per hyperparameter combination, appending
# the validation metrics to a text log and to a per-run results CSV.

# setup for logging
console_output_filename = f'./output/{task_param}_console_output.txt'
# Opening in append mode creates the file but not its directory, so make sure
# the output folder exists before the first write.
pathlib.Path(console_output_filename).parent.mkdir(parents=True, exist_ok=True)
with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')
# setup for saving results
results_folder = pathlib.Path(f"results/{embedding_param}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
# Timestamped file name so each run writes its own results file
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = results_folder / f"val_{save_file_id}_{model_param}.csv"
with open(results_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    headers = list(all_params[0].keys())
    writer.writerow(['mcc', 'f1', 'accuracy'] + headers)
print(f"saving results to ./{results_file}")

# saves best accuracy for progress bar display
best_acc = 0
# Iterate over all combinations of hyperparameters
bar = tqdm(all_params)
for params in bar:
    # formatting params to display: drop the entries that are constant across runs
    print_params = {key: value for key, value in params.items()
                    if key not in ('category', 'device')}

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    # fit() returns five values, unpacked here as stop epoch, validation loss and metrics
    epoch, val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(X_train, Y_train, X_val, Y_val)

    best_acc = max(best_acc, val_accuracy)
    bar.set_description(f"Best Acc: {best_acc:.5f}, Last test: {val_accuracy:.5f}")

    # Write stats to log file
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        logfile.write(f"\nEarly stopped on epoch: {epoch}")
        logfile.write(f"\nValidation accuracy: {val_accuracy}")
        logfile.write(f"\nValidation f1-score: {val_f1}")
        logfile.write(f"\nValidation MCC     : {val_mcc}")
    # write to results csv (reopened every iteration so finished runs are already on disk)
    with open(results_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([val_mcc, val_f1, val_accuracy] + list(params.values()))

In [47]:
# Show the hyperparameter combination(s) with the highest score on `metric`.
# To inspect an earlier run instead, read that run's CSV here,
# e.g. pd.read_csv("output/val_results_cola_20231127_151717.csv").
results_df = pd.read_csv(results_file)
metric = "accuracy"
best = results_df[metric].max()
# Every row tied for the best score is kept
is_best = results_df[metric] == best
best_row = results_df[is_best]
print(f"Best {metric}: {best:.5f}")
best_row

Best accuracy: 0.78140


Unnamed: 0,mcc,f1,accuracy,num_epochs,batch_size,learning_rate,category,norm,input_size,layer_size,num_layers,weight_decay,patience,min_delta,device
4,0.492335,0.7814,0.7814,50,32,0.01,BC,False,768,768,2,0.001,3,0,cuda
