# FAST: Feedforward-Augmented Sentence Transformers

Notebook for running GLUE tasks.

# Setup

## Modules

In [27]:
import csv
import itertools
import pathlib
import random
from collections import namedtuple
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import DatasetDict, load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from utils.cls import extract_cls_embeddings
from utils.feed_forward import FeedForward
from utils.mean_pooling import mean_pooling

In [4]:
# Standardized default seed, applied in turn to every RNG the notebook uses:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed = 7
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

## Device

In [5]:
# Pick the best available accelerator.
# `device_name` is always a plain string: it is reused below as a
# hyperparameter value and written to the results CSV. `device` is the
# matching torch.device object.
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    device_name = "cuda"  # CUDA for NVIDIA GPU
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # Guarded with getattr so a torch build without the mps backend falls back to CPU.
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU
device = torch.device(device_name)
print(device_name)

cuda


## User parameters

Parameters to set:
- Model
    - MPNetBase
    - DistilRoBERTaBase
    - MPNetST
    - DistilRoBERTaST
- Task
    - cola
    - sst2
    - mrpc
    - stsb
    - qqp
    - mnli-m
    - mnli-mm
    - qnli
    - rte
    - wnli
- Embedding type
    - cls
    - mean_pooling
    - sentence_transformer

In [11]:
model_param = "DistilRoBERTaBase"
task_param = "cola"
embedding_param = "cls"

# Fail fast on typos: every later cell branches on these three strings, and an
# unrecognized value would otherwise only surface much further down the notebook.
_allowed_params = {
    "model_param": ("MPNetBase", "DistilRoBERTaBase", "MPNetST", "DistilRoBERTaST"),
    "task_param": ("cola", "sst2", "mrpc", "stsb", "qqp", "mnli-m", "mnli-mm", "qnli", "rte", "wnli"),
    "embedding_param": ("cls", "mean_pooling", "sentence_transformer"),
}
for _param_name, _param_value in (
    ("model_param", model_param),
    ("task_param", task_param),
    ("embedding_param", embedding_param),
):
    if _param_value not in _allowed_params[_param_name]:
        raise ValueError(
            f"Bad {_param_name}: {_param_value!r}; expected one of {_allowed_params[_param_name]}"
        )

## Models

In [12]:
# Load the encoder selected by `model_param` and move it to `device`.
# The transformers / sentence_transformers imports stay inside the branches so
# that only the library actually needed for the chosen model is imported.
#
# `tokenizer` is reset first: the SentenceTransformer branches do not create
# one, and without the reset a tokenizer left over from an earlier run of this
# cell (with a different `model_param`) would silently be reused.
tokenizer = None
if model_param == "MPNetBase":  # MPNet Base
    from transformers import MPNetTokenizer, MPNetModel
    # NOTE(review): the tokenizer comes from the sentence-transformers checkpoint
    # while the weights come from microsoft/mpnet-base -- confirm this is intended.
    tokenizer = MPNetTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    model = MPNetModel.from_pretrained("microsoft/mpnet-base").to(device)
elif model_param == "DistilRoBERTaBase":  # DistilRoBERTa Base
    from transformers import RobertaTokenizer, RobertaModel
    tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
    model = RobertaModel.from_pretrained('distilroberta-base').to(device)
elif model_param == "MPNetST":  # MPNet Sentence Transformer
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
elif model_param == "DistilRoBERTaST":  # DistilRoBERTa Sentence Transformer
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-distilroberta-v1').to(device)
else:
    # ValueError is still an Exception, so existing `except Exception` handlers keep working.
    raise ValueError(f"ERROR: Bad model_param: {model_param!r}")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tasks

In [13]:
# Per-task settings, keyed by the task names accepted for `task_param`.
#   sentence_type: "one" or "two" (how many sentences each example has)
#   class_type:    "binary", "multi" or "regression"
#   input_size:    int, input size of the feedforward (aka embedding size)
#   col_names:     column names of the relevant sentences on Hugging Face

TaskConfig = namedtuple("TaskConfig", "sentence_type class_type input_size col_names")
task_configs = {
    "cola": TaskConfig(sentence_type="one", class_type="binary", input_size=768, col_names=['sentence']),
    "sst2": TaskConfig(sentence_type="one", class_type="binary", input_size=768, col_names=['sentence']),
    "mrpc": TaskConfig(sentence_type="two", class_type="binary", input_size=1536, col_names=['sentence1', 'sentence2']),
    "stsb": TaskConfig(sentence_type="two", class_type="regression", input_size=1536, col_names=['sentence1', 'sentence2']),
    "qqp": TaskConfig(sentence_type="two", class_type="binary", input_size=1536, col_names=['question1', 'question2']),
    "mnli-m": TaskConfig(sentence_type="two", class_type="multi", input_size=1536, col_names=['premise', 'hypothesis']),
    "mnli-mm": TaskConfig(sentence_type="two", class_type="multi", input_size=1536, col_names=['premise', 'hypothesis']),
    "qnli": TaskConfig(sentence_type="two", class_type="binary", input_size=1536, col_names=['question', 'sentence']),
    "rte": TaskConfig(sentence_type="two", class_type="binary", input_size=1536, col_names=['sentence1', 'sentence2']),
    "wnli": TaskConfig(sentence_type="two", class_type="binary", input_size=1536, col_names=['sentence1', 'sentence2']),
}

# Settings for the task chosen in the user-parameters cell; raises KeyError if
# `task_param` is not one of the keys of `task_configs`.
task_config = task_configs[task_param]

## Dataset

In [14]:
# Load the GLUE task and expose it under the uniform split names
# "train" / "validation" / "test" that the rest of the notebook indexes.
#
# GLUE ships MNLI as a single "mnli" config whose evaluation splits carry a
# "_matched" / "_mismatched" suffix, so the notebook's "mnli-m" / "mnli-mm"
# task names are translated here instead of being passed to load_dataset as-is.
_mnli_split_suffix = {"mnli-m": "matched", "mnli-mm": "mismatched"}
if task_param in _mnli_split_suffix:
    _suffix = _mnli_split_suffix[task_param]
    _mnli = load_dataset("glue", "mnli")
    data = DatasetDict({
        "train": _mnli["train"],
        "validation": _mnli[f"validation_{_suffix}"],
        "test": _mnli[f"test_{_suffix}"],
    })
else:
    data = load_dataset("glue", task_param)
data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

# Generating Embeddings

In [16]:
class GLUESingleSentence(Dataset):
    """Map-style dataset that tokenizes one sentence per item, on demand.

    Args:
        texts: indexable collection of sentences.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every sentence is padded / truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for sentence ``idx``.

        Both tensors have their leading (batch) dimension squeezed out.
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        token_ids = encoded['input_ids'].squeeze(0)
        mask = encoded['attention_mask'].squeeze(0)
        return token_ids, mask

In [22]:
# This pipeline embeds exactly one sentence per example, so refuse sentence-pair
# tasks explicitly instead of embedding a single (possibly wrong) column.
if task_config.sentence_type != "one":
    raise NotImplementedError(
        f"Task {task_param!r} has two sentences per example; "
        "GLUESingleSentence only supports single-sentence tasks."
    )

# Read the text column from the task config rather than hard-coding 'sentence'.
text_column = task_config.col_names[0]
train_dataset = GLUESingleSentence(data['train'][text_column], tokenizer)
val_dataset = GLUESingleSentence(data['validation'][text_column], tokenizer)
test_dataset = GLUESingleSentence(data['test'][text_column], tokenizer)

# pick batch size based on GPU memory
batch_size = 512
# shuffle=False keeps the embeddings in dataset order, so they stay aligned
# with the label arrays that are read straight from `data` afterwards.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [23]:
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of the token embeddings, L2-normalized.

    Args:
        model_output: indexable whose first element holds all token embeddings.
        attention_mask: mask tensor; it is unsqueezed on the last axis and
            expanded to the token-embedding shape before use.

    Returns:
        The per-row mean over dim 1 of the masked embeddings, normalized to
        unit L2 norm along dim 1.
    """
    hidden = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * mask, 1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return torch.nn.functional.normalize(summed / counts, p=2, dim=1)

def extract_cls(model_output):
    """Return the embedding at sequence position 0 of ``last_hidden_state``.

    ``model_output`` must support ``['last_hidden_state']`` lookup; the result
    drops the sequence axis (dim 1) by selecting its first position.
    """
    return model_output['last_hidden_state'][:, 0, :]

def compute_embeddings(loader, pooling=None):
    """Run the global `model` over `loader` and return one embedding per example.

    Args:
        loader: iterable yielding ``(input_ids, attention_mask)`` batches.
        pooling: "cls" or "mean_pooling". Defaults to the notebook-level
            `embedding_param`, so the embeddings match the directory they are
            later saved under.

    Returns:
        All batch embeddings concatenated along dim 0, in loader order.

    Raises:
        ValueError: if the pooling strategy is neither "cls" nor "mean_pooling".
    """
    if pooling is None:
        pooling = embedding_param
    if pooling not in ("cls", "mean_pooling"):
        raise ValueError(
            f"compute_embeddings supports 'cls' or 'mean_pooling' pooling, got {pooling!r}"
        )

    model.eval()
    batches = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            if pooling == "cls":
                embedding = extract_cls(outputs)
            else:
                embedding = mean_pooling(outputs, attention_mask)

            batches.append(embedding)

    return torch.cat(batches, 0)
    
# Embed each split with the loaders built above (one progress bar per split).
train_embed = compute_embeddings(train_loader)
val_embed = compute_embeddings(val_loader)
test_embed = compute_embeddings(test_loader)

# Shapes of the train, validation and test embeddings, in that order.
for _split_embed in (train_embed, val_embed, test_embed):
    print(_split_embed.shape)

100%|██████████| 17/17 [00:30<00:00,  1.77s/it]
100%|██████████| 3/3 [00:05<00:00,  1.70s/it]
100%|██████████| 3/3 [00:03<00:00,  1.31s/it]

torch.Size([8551, 768])
torch.Size([1043, 768])
torch.Size([1063, 768])





In [24]:
# Features: embeddings copied to host memory and converted to NumPy arrays.
X_train, X_val, X_test = (
    split_embed.cpu().numpy()
    for split_embed in (train_embed, val_embed, test_embed)
)

# Targets: the "label" column of each split as a NumPy array.
Y_train, Y_val, Y_test = (
    np.array(data[split_name]["label"])
    for split_name in ("train", "validation", "test")
)

### Previous system

In [None]:
# ~13 minutes for CLS

class CustomDataset(Dataset):
    """Legacy dataset that embeds each example on the fly, one item at a time.

    Depends on the notebook-level `embedding_param`, `task_config`, `model`,
    `tokenizer` and `device`.

    Args:
        sentences: first (or only) sentence of every example.
        labels: label of every example.
        sentences2: second sentence of every example; only read for
            sentence-transformer embeddings of two-sentence tasks.
    """

    def __init__(self, sentences, labels, sentences2=None):
        self.sentences = sentences
        self.labels = labels
        self.sentences2 = sentences2

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(embeddings, label)`` for example ``idx``."""
        sentence = self.sentences[idx]
        label = self.labels[idx]

        if embedding_param == "cls" or embedding_param == "mean_pooling":
            # Tokenize the sentence and move the inputs to the model's device.
            tokenized = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt').to(device)

            # Perform model inference
            with torch.no_grad():
                outputs = model(**tokenized)

            # Extract embeddings
            if embedding_param == "cls":
                embeddings = extract_cls_embeddings(outputs)
            else:
                # The attention mask is produced by the tokenizer, not by the model.
                embeddings = mean_pooling(outputs, tokenized['attention_mask'])
        elif embedding_param == "sentence_transformer":
            with torch.no_grad():
                embeddings = model.encode(sentence)
                if task_config.sentence_type == "two":
                    # Only touch sentences2 for pair tasks; it is None otherwise.
                    embeddings_2 = model.encode(self.sentences2[idx])
                    # Join on the last axis so a pair of 1-D sentence vectors
                    # concatenates instead of raising an axis error.
                    embeddings = np.concatenate([embeddings, embeddings_2], axis=-1)
        else:
            raise ValueError(f"Unknown embedding_param: {embedding_param!r}")

        return embeddings, label


# Legacy path: build one CustomDataset / DataLoader per split and gather the
# embeddings of every batch. Running this cell overwrites the X_* / Y_*
# variables assigned earlier in the notebook.
_uses_second_sentence = (
    embedding_param == "sentence_transformer" and task_config.sentence_type == "two"
)


def _make_legacy_dataset(split):
    """Build the CustomDataset for one split ("train", "validation" or "test")."""
    second = data[split][task_config.col_names[1]] if _uses_second_sentence else None
    return CustomDataset(data[split][task_config.col_names[0]], data[split]["label"], second)


def _collect_batches(loader):
    """Concatenate every (features, labels) batch of `loader` along dim 0."""
    features, labels = [], []
    for batch_features, batch_labels in loader:
        features.append(batch_features)
        labels.append(batch_labels)
    return torch.cat(features, 0), torch.cat(labels, 0)


train_dataset = _make_legacy_dataset("train")
val_dataset = _make_legacy_dataset("validation")
test_dataset = _make_legacy_dataset("test")

# Each loader wraps its own split (validation and test must not reuse the train set).
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)
test_dataloader = DataLoader(test_dataset, batch_size=32)

# DataLoader iteration: keep all batches, not just the last one.
# The DataLoader collates each batch into tensors, so the results are tensors.
X_train, Y_train = _collect_batches(train_dataloader)
X_val, Y_val = _collect_batches(val_dataloader)
X_test, Y_test = _collect_batches(test_dataloader)


## Saving embeddings to file

In [46]:
# Save the embeddings / labels to disk, then read them back so the rest of the
# notebook works from the stored copies.
output_directory = f"./output/{embedding_param}"

output_path = pathlib.Path(output_directory)
output_path.mkdir(parents=True, exist_ok=True)

# Features are stored as .npy (cls / mean_pooling) or .pt (sentence_transformer);
# labels are always stored as .npy.
if embedding_param == "cls" or embedding_param == "mean_pooling":
    x_suffix = "npy"
    def save_file(file1, file2):
        """Write array `file2` to the open binary file `file1` with np.save."""
        np.save(file1, file2)
    def load_file(file1):
        """Return the array stored in the open binary file `file1`."""
        # allow_pickle=True is kept from the original cell; only load files
        # this notebook wrote itself.
        return np.load(file1, allow_pickle=True)
elif embedding_param == "sentence_transformer":
    x_suffix = "pt"
    def save_file(file1, file2):
        """Write object `file2` to the open binary file `file1` with torch.save."""
        # torch.save takes (obj, f): the object first, the destination second.
        torch.save(file2, file1)
    def load_file(file1):
        """Return the object stored in the open binary file `file1`."""
        return torch.load(file1)
else:
    raise ValueError(f"Unknown embedding_param: {embedding_param!r}")


def _artifact_path(name, suffix):
    """Path of one saved array, e.g. <output_directory>/X_train_<task>_<model>.<suffix>."""
    return output_path / f"{name}_{task_param}_{model_param}.{suffix}"


def _load_artifact(name, suffix, loader):
    """Open the saved array `name` for reading and return `loader(file)`."""
    with open(_artifact_path(name, suffix), 'rb') as artifact_file:
        return loader(artifact_file)


# write
for _name, _features in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    with open(_artifact_path(_name, x_suffix), 'wb') as _artifact_file:
        save_file(_artifact_file, _features)
for _name, _labels in (("Y_train", Y_train), ("Y_val", Y_val), ("Y_test", Y_test)):
    with open(_artifact_path(_name, "npy"), 'wb') as _artifact_file:
        np.save(_artifact_file, _labels)

# read
X_train = _load_artifact("X_train", x_suffix, load_file)
X_val = _load_artifact("X_val", x_suffix, load_file)
X_test = _load_artifact("X_test", x_suffix, load_file)
Y_train = _load_artifact("Y_train", "npy", np.load)
Y_val = _load_artifact("Y_val", "npy", np.load)
Y_test = _load_artifact("Y_test", "npy", np.load)

print(f"size of X_train: {len(X_train)}")

# Training loop

In [28]:
# Hyperparameter search space: every key maps to the list of values to try.
param_grid = {
    'num_epochs': [50],
    'batch_size': [32, 128, 512],
    'learning_rate': [1e-2, 1e-3],
    'category': ['BC'],
    'norm': [False],
    'input_size': [task_config.input_size],
    'layer_size': [task_config.input_size],
    'num_layers': [1, 2, 3],
    'weight_decay': [1e-2, 1e-3, 1e-4],
    'patience': [3],
    'min_delta': [0],
    'device': [device_name],
}

# Cartesian product of the value lists: one dict per hyperparameter combination.
all_params = [
    dict(zip(param_grid, combination))
    for combination in itertools.product(*param_grid.values())
]
print(f"{len(all_params)} hyperparameter combinations")

54 hyperparameter combinations


In [29]:
# Logging setup: an append-only text log per task, plus a timestamped CSV that
# receives one row of validation metrics per hyperparameter combination.
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
console_output_filename = f'./output/{task_param}_console_output.txt'
with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')
results_filename = f'./output/val_results_{embedding_param}_{task_param}_{save_file_id}.csv'
with open(results_filename, 'w', newline='') as csvfile:
    headers = list(all_params[0])
    writer = csv.writer(csvfile)
    writer.writerow(['mcc', 'f1', 'accuracy', *headers])
print(f"saving results to {results_filename}")

# Best validation accuracy so far, shown in the progress-bar description.
best_acc = 0
bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Parameters as written to the text log: everything except 'category' and 'device'.
    print_params = {
        key: value for key, value in params.items() if key not in ('category', 'device')
    }

    # Train a fresh model with this combination and collect its validation metrics.
    feed_forward = FeedForward(**params)
    epoch, val_loss, val_accuracy, val_f1, val_mcc = feed_forward.fit(X_train, Y_train, X_val, Y_val)

    if val_accuracy > best_acc:
        best_acc = val_accuracy
    bar.set_description(f"Best Acc: {best_acc:.5f}, Last test: {val_accuracy:.5f}")

    # Append this run's stats to the text log.
    with open(console_output_filename, 'a') as logfile:
        logfile.writelines([
            f"\n\nTraining with parameters:\n{print_params}",
            f"\nEarly stopped on epoch: {epoch}",
            f"\nValidation accuracy: {val_accuracy}",
            f"\nValidation f1-score: {val_f1}",
            f"\nValidation MCC     : {val_mcc}",
        ])

    # Append this run's metrics and hyperparameters to the results CSV.
    with open(results_filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([val_mcc, val_f1, val_accuracy, *params.values()])

saving results to ./output/val_results_cls_cola_20231129_140656.csv


Best Acc: 0.76414, Last test: 0.75935: 100%|██████████| 54/54 [01:59<00:00,  2.22s/it]


In [30]:
# Reload the validation results written by the training loop and report the
# row(s) that reach the highest value of `metric`.
# (To inspect an earlier run instead, read its CSV here, e.g.
#  "output/val_results_cola_20231127_151717.csv".)
results_df = pd.read_csv(results_filename)
metric = "mcc"
best = results_df[metric].max()
best_row = results_df.loc[results_df[metric].eq(best)]
print(f"Best {metric}: {best:.5f}")
print(best_row)

Best mcc: 0.43688
        mcc        f1  accuracy  num_epochs  batch_size  learning_rate  \
5  0.436881  0.758389  0.758389          50          32           0.01   

  category   norm  input_size  layer_size  num_layers  weight_decay  patience  \
5       BC  False         768         768           2        0.0001         3   

   min_delta device  
5          0   cuda  


# Predictions

In [None]:
# manually set this based on output CSV file
# Hyperparameters used to build the final FeedForward model for test-set predictions.
# NOTE(review): these values differ from the best validation row printed in
# the results cell above (batch_size 32, category 'BC', layer_size 768,
# num_layers 2, weight_decay 1e-4) -- confirm they are intentional before
# generating a submission.
best_params = {
    'num_epochs': 50,
    'batch_size': 128,
    'learning_rate': 1e-2,
    'category': 'C',
    'norm': False,
    'input_size': 768,
    'layer_size': 6,
    'num_layers': 3,
    'weight_decay':1e-2,
    'patience': 3,
    'min_delta': 0,
    'device': device_name
}

In [None]:
best_feed_forward = FeedForward(**best_params)

# Fit the final model on train + validation combined.
X = np.concatenate((X_train, X_val), axis=0)
Y = np.concatenate((Y_train, Y_val), axis=0)

# NOTE(review): the training loop calls fit() with validation data as well;
# confirm FeedForward.fit accepts being called with only (X, Y).
best_feed_forward.fit(X, Y)

# Predicted class = index of the largest value along axis 1 of predict_proba.
preds = np.argmax(best_feed_forward.predict_proba(X_test), axis=1)
print(preds.shape)

df = pd.DataFrame({
    'index': range(len(preds)),
    'prediction': preds
})

# Draw the random file id from a separate generator so the notebook's seeded
# global `random` state is left untouched (previously it was reseeded to 0,
# which is not the notebook's standard seed).
random_file_id = str(round(random.Random().random() * 10000))
# Write the DataFrame to a .tsv file, with the header row but without the DataFrame index
df.to_csv(f'CoLA_{random_file_id}.tsv', sep='\t', index=False, header=True)