# FAST: Feedforward-ASsisted Transformers

Notebook for running GLUE tasks.

# Setup

## Modules

In [1]:
import random
import time
import csv
import os
import pathlib
import itertools
from datetime import datetime
from collections import namedtuple
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from adapters import AutoAdapterModel

from utils.feed_forward import FeedForward
from utils.cls import extract_cls_embeddings
from utils.mean_pooling import mean_pooling
from utils.energy import get_energy

## Device

In [2]:
# Set CUBLAS_WORKSPACE_CONFIG
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"
os.environ['CUDNN_DETERMINISTIC'] = '1'
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Standardized default seed
seed = 7
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(mode=True)

In [3]:
device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    # I read that this works for detecting if notebook is being run in a colab environment, not sure though
    if 'COLAB_GPU' in os.environ:
        print("colab environment")
        device_name = "gpu" 
    else:
        device_name = "cuda:0" # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = torch.device("mps")  # Metal Performance Shaders for Apple M-series GPU

# device_name = "cuda:0"
device = torch.device(device_name)
print(device_name)

cuda:0


## User parameters

Task Parameters:
- cola
- sst2
- mrpc
- stsb
- qqp
- mnli_matched
- mnli_mismatched
- qnli
- rte
- wnli

In [4]:
task_param = "cola"

## Model

In [5]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

In [6]:
TaskConfig = namedtuple("TaskConfig", ["sentence_type", "class_type", "num_classes", "col_names"])

task_configs = {
    "cola": TaskConfig("one", "BC", 1, ['sentence']),
    "sst2": TaskConfig("one", "BC", 1, ['sentence']),
    "mrpc": TaskConfig("two", "BC", 1, ['sentence1', 'sentence2']),
    "stsb": TaskConfig("two", "R", 1, ['sentence1', 'sentence2']),
    "qqp": TaskConfig("two", "BC", 1, ['question1', 'question2']),
    "mnli_matched": TaskConfig("two", "MC", 3, ['premise', 'hypothesis']),
    "mnli_mismatched": TaskConfig("two", "MC", 3, ['premise', 'hypothesis']),
    "qnli": TaskConfig("two", "BC", 1, ['question', 'sentence']),
    "rte": TaskConfig("two", "BC", 1, ['sentence1', 'sentence2']),
    "wnli": TaskConfig("two", "BC", 1, ['sentence1', 'sentence2']),
}

task_config = task_configs[task_param]

## Dataset

In [7]:
if task_param == "mnli_matched": 
    data = load_dataset("glue", "mnli") 
    val_key = "validation_matched"
    test_key = "test_matched"
elif task_param == "mnli_mismatched":
    data = load_dataset("glue", "mnli") 
    val_key = "validation_mismatched"
    test_key = "test_mismatched"
else:
    data = load_dataset("glue", task_param)
    val_key = "validation"
    test_key = "test"

data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

## Tokenize Dataset

In [8]:
max_len=512

def tokenize(examples):
    return tokenizer(examples[task_config.col_names[0]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

def tokenize_double(examples):
    return tokenizer(examples[task_config.col_names[0]],
                     examples[task_config.col_names[1]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

data = data.map(tokenize, batched=True)
data = data.rename_column("label", "labels")
data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [9]:
from adapters import BnConfig
from adapters import AutoAdapterModel

In [10]:
config=BnConfig(mh_adapter=True,
                output_adapter=True,
                reduction_factor=2,
                non_linearity="relu",
                )

In [11]:
model = AutoAdapterModel.from_pretrained('distilroberta-base', config=config).to(device)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model.add_classification_head(task_param, num_labels=2)

In [13]:
from adapters.training import AdapterArguments
adaptargs = AdapterArguments()

In [14]:
# task adapter - only add if not existing
if task_param not in model.adapters_config:
    # resolve the adapter config
    adapter_config = config.load(adaptargs.adapter_config)
    # add a new adapter
    model.add_adapter(task_param, config=adapter_config)
# Enable adapter training
model.train_adapter(task_param)

In [15]:
model.set_active_adapters(task_param)

In [16]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

training_args = TrainingArguments(
    learning_rate=3e-4,
    max_steps=10000,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=1000,
    output_dir="adapter-roberta-base-amazon-polarity",
    overwrite_output_dir=True,
    remove_unused_columns=False,
)

def compute_accuracy(eval_pred):
  preds = np.argmax(eval_pred.predictions, axis=1)
  return {"acc": (preds == eval_pred.label_ids).mean()}

In [17]:
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    compute_metrics=compute_accuracy,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()

RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/models/roberta/adapter_model.py", line 69, in forward
    outputs, context = self.roberta(
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/context.py", line 116, in wrapper_func
    results = f(self, *args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/model_mixin.py", line 1270, in forward
    return super().forward(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 835, in forward
    encoder_outputs = self.encoder(
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 524, in forward
    layer_outputs = layer_module(
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 455, in forward
    layer_output = apply_chunking_to_forward(
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/transformers/pytorch_utils.py", line 241, in apply_chunking_to_forward
    return forward_fn(*input_tensors)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 468, in feed_forward_chunk
    layer_output = self.output(intermediate_output, attention_output)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/models/roberta/modeling_roberta.py", line 159, in forward
    hidden_states = self.bottleneck_layer_forward(hidden_states, input_tensor, self.LayerNorm)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/methods/bottleneck.py", line 348, in bottleneck_layer_forward
    state = self.compose(adapter_setup, state)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/methods/adapter_layer_base.py", line 472, in compose
    state = composition_func(adapter_setup, state, lvl=0)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/methods/adapter_layer_base.py", line 308, in compose_stack
    state = self.compose_single(adapter_stack_layer, state, lvl=lvl + 1)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/methods/bottleneck.py", line 230, in compose_single
    layer_output = adapter_layer(
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/adapters/methods/modeling.py", line 172, in forward
    down = self.adapter_down(x)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/container.py", line 217, in forward
    input = module(input)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mvq0861/miniconda3/envs/fast/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)


## Create Dataloader

In [None]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch

# Function to merge datasets
def clean_dataset(inputs, split):
    common_columns_to_remove = {'sentence', 'sentence1', 'sentence2', 'label', 'question1', 'question2', 'premise', 'hypothesis', 'question'}

    if isinstance(inputs, list):
        dataframe = [ds[split].to_pandas() for ds in inputs]
        dataframe = [df.drop(columns=common_columns_to_remove, errors='ignore') for df in dataframe]
        cleaned_df = dataframe[0]
        for df in dataframe[1:]:
            cleaned_df = pd.merge(cleaned_df, df, on='idx', how='inner')
    else:
        dataframe = inputs[split].to_pandas()
        cleaned_df = dataframe.drop(columns=common_columns_to_remove, errors='ignore')

    return cleaned_df

class TransformerDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Exclude 'idx' column and convert others to tensors
        self.columns = [col for col in dataframe.columns if col != 'idx']
        for col in self.columns:
            # Convert to NumPy array first for efficiency
            np_array = np.array(dataframe[col].tolist())
            setattr(self, col, torch.tensor(np_array, dtype=torch.int64))

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Retrieve tensors by index for each column
        item = {col: getattr(self, col)[idx] for col in self.columns}
        return item

# Define batch size
batch_size = 128

if create_dataloader:

    # Merge datasets for each split and convert to PyTorch datasets
    train_df = clean_dataset(inputs, 'train')
    validation_df = clean_dataset(inputs, 'validation')
    test_df = clean_dataset(inputs, 'test')

    # Create dataset instances
    train_dataset = TransformerDataset(train_df)
    validation_dataset = TransformerDataset(validation_df)
    test_dataset = TransformerDataset(test_df)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Compute Embbeddings

In [None]:
def compute_embeddings(loader, X):
    column_names = list(next(iter(loader)).keys())

    # Creating pairs of input IDs and attention masks
    pairs = []
    for name in column_names:
        if 'input_ids' in name:
            mask_name = name.replace('input_ids', 'attention_mask')
            pairs.append((name, mask_name))

    # Loop over each pair of input ID and attention mask
    for input_id_name, mask_name in pairs:
        embedding = []
        tqdm.write(f"{input_id_name} {mask_name}")
        # time.sleep(0.2) # tqdm prints weird without slight time delay - delay subtracted when calculating embedding time
    
        for input_embedding in tqdm(loader):
            # Move batch to device
            model.eval()
            input_embedding = {key: value.to(device) for key, value in input_embedding.items()}
            with torch.no_grad():
                outputs = model(input_embedding[input_id_name], attention_mask=input_embedding[mask_name])

                if "cls" in input_id_name:
                    embed = extract_cls_embeddings(outputs)
                elif "meanpool" in input_id_name:
                    embed = mean_pooling(outputs, input_embedding[mask_name])

            embedding.append(embed.cpu().numpy())

        id = embedding_param.index(input_id_name.split("_input_ids")[0])
        # print(id)
        if "2" in input_id_name:
            U, V = X[id], np.concatenate(embedding, axis=0)
            X[id] = np.hstack([U, V])
            # print(X[id])
        else:
            X[id] = np.concatenate(embedding, axis=0)
            # print(X[id])

def compute_sentence(sentences):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        batch_embeddings = sentence_model.encode(batch_sentences)
        embeddings.append(batch_embeddings)
    return np.concatenate(embeddings, axis=0)

print("Embedding: Using default time tracking")
# Track time
start_time = time.time()

if create_dataloader: # Compute CLS/Meanpool embeddings
    compute_embeddings(train_loader, X_train)
    compute_embeddings(validation_loader, X_val)
    compute_embeddings(test_loader, X_test)
if create_sentence: # Compute Sentence embeddings
    if "sentence_single" in embedding_param:
        id = embedding_param.index("sentence_single")
        U_train, V_train = compute_sentence(data["train"][task_config.col_names[0]]), compute_sentence(data["train"][task_config.col_names[1]])
        U_val, V_val = compute_sentence(data["validation"][task_config.col_names[0]]), compute_sentence(data["validation"][task_config.col_names[1]])
        U_test, V_test = compute_sentence(data["test"][task_config.col_names[0]]), compute_sentence(data["test"][task_config.col_names[1]])

        X_train[id] = np.hstack([U_train, V_train])
        X_val[id] = np.hstack([U_val, V_val])
        X_test[id] = np.hstack([U_test, V_test])

    elif "sentence" in embedding_param:
        id = embedding_param.index("sentence")
        X_train[id] = compute_sentence(data["train"][task_config.col_names[0]])
        X_val[id] = compute_sentence(data["validation"][task_config.col_names[0]])
        X_test[id] = compute_sentence(data["test"][task_config.col_names[0]])

embedding_time = time.time() - start_time

## Save Embeddings to File

In [None]:
for id in embedding_tracker:

    cache_path = pathlib.Path(f"./cache/{embedding_param[id]}/{task_param}")
    cache_path.mkdir(parents=True, exist_ok=True)

    file_names = ['X_train', 'X_val', 'X_test']
    paths = [pathlib.Path(cache_path / f"{f}_{model_param}.npy") for f in file_names]

    with open(paths[0], 'wb') as X_train_file:
        np.save(X_train_file, X_train[id])
    with open(paths[1], 'wb') as X_val_file:
        np.save(X_val_file, X_val[id])
    with open(paths[2], 'wb') as X_test_file:
        np.save(X_test_file, X_test[id])

In [None]:
X_train = np.concatenate(X_train, axis=1)
X_val = np.concatenate(X_val, axis=1)
X_test = np.concatenate(X_test, axis=1)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape  : {X_val.shape}")
print(f"X_test shape : {X_test.shape}")

## Transforming Embeddings

In [None]:
def add_embeddings(embeddings, column_ids, embedding_size, is_UV, is_diff, is_mult):
    '''
    Add embeddings at specific column_ids. For example, for the matrix
    [Z1, Z2, U1, V1, Z3, U2, V2], if we want to replace this with:
    [Z1, Z2, U1 - V1, Z3, U2, V2, U2 - V2, U2 * V2], we provide the parameters:
    is_UV = [False, True] : U1, V1 are NOT kept, but U2, V2 are kept
    is_diff = [True, True] : both U1 - V1 and U2 - V2 are added
    is_mult = [False, True] : U1 * V1 is not included, U2 * V2 is included

    Args:
        embeddings : original matrix to replace
        column_ids : location of Ux (we assume Vx immedeately follows Ux), 
                     for the above example, we would provide column_ids = [2, 5].
                     If you DO NOT want to replace a certain Ux, simply don't include its id in column_ids
        is_UV : should Ux, Vx be included
        is_diff : should Ux - Vx be included
        is_mult : should Ux * Vx be included
    '''

    id_delta = 0 # keep track of changes to embedding inserts/deletions
    for id, column_id in enumerate(column_ids):

        if id>0:
            if not is_UV[id-1]:
                id_delta -= 2
            if is_diff[id-1]:
                id_delta += 1
            if is_mult[id-1]:
                id_delta += 1

        start_id = (column_id + id_delta) * embedding_size
        U_id = start_id + embedding_size
        V_id = start_id + (2*embedding_size)

        U = embeddings[:, start_id:U_id]
        V = embeddings[:, U_id:V_id]

        if is_diff[id] and not is_mult[id]:
            new_embeddings = (U - V)
        elif not is_diff[id] and is_mult[id]:
            new_embeddings = U * V
        else: # both
            new_embeddings = np.hstack([(U - V), (U * V)])

        if is_UV[id]:                  
            embeddings = np.hstack([
                embeddings[:, :V_id],       # Part of original matrix before replacement
                new_embeddings,             # New embeddings to insert
                embeddings[:, V_id:]        # Part of original matrix after replacement
            ])
        else:
            embeddings = np.hstack([
                embeddings[:, :start_id],   # Part of original matrix before replacement
                new_embeddings,             # New embeddings to insert
                embeddings[:, V_id:]        # Part of original matrix after replacement
            ])
    return embeddings

In [None]:
######################
# Choose ids of (Ux, Vx) to alter embeddings
column_ids = [] if task_config.sentence_type == "one" else [0]
######################

if column_ids:
    f = lambda embeddings : add_embeddings(embeddings=embeddings, column_ids=column_ids, embedding_size=768, 
                                           is_UV=[False], is_diff=[True], is_mult=[False])

    X_train_computed = f(embeddings=X_train)
    X_val_computed = f(embeddings=X_val)
    X_test_computed = f(embeddings=X_test)
else:
    X_train_computed = X_train
    X_val_computed = X_val
    X_test_computed = X_test
    
print(X_train_computed.shape)
print(X_val_computed.shape)
print(X_test_computed.shape)

# Training loop

In [None]:
input_size = X_train_computed.shape[1]

param_grid = {
    'max_epochs': [50],
    'batch_size': [32],
    'learning_rate': [0.001],
    'category': ['MC'],
    'norm': [False],
    'input_size': [768],
    'layer_size': [768],
    'num_classes': [3],
    'num_layers': [1],
    'weight_decay': [0.0001],
    'patience': [3],
    'min_delta': [0],
}

# default overrides
param_grid['verbose'] = [True]
param_grid['device'] = [device_name]

# Create a list of all combinations of hyperparameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
print(f"{len(all_params)} hyperparameter combinations")

In [None]:
# Create output folder if it doesn't exist
if not os.path.exists('output'):
    os.makedirs('output')

# Setup for logging
console_output_filename = f'./output/{"_".join(embedding_param[0])}_{task_param}_console_output.txt'

with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN TRAINING LOOP\n\n')

# Setup for saving results
results_folder = pathlib.Path(f"results/{embedding_param[0]}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = results_folder / f"val_{save_file_id}_{model_param}.csv"

# different metrics are recorded for classification vs regression tasks
if task_config.class_type in ["BC", "MC"]:
    metric_types = ['mcc', 'f1', 'accuracy']
elif task_config.class_type == "R":
    metric_types = ['pearson', 'spearman']

with open(results_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = metric_types + ['num_epochs', 'training time / epoch', 'embedding time', 'training energy / epoch', 'embedding energy'] + list(all_params[0].keys())
    writer.writerow(header)
print(f"saving results to ./{results_file}")
metric_types += ["epoch"]
# Saves best accuracy for progress bar display
display_best = float("-inf")

# Iterate over all combinations of hyperparameters
bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Formatting params to display
    print_params = params.copy()
    for param in ['category', 'device']:
        del print_params[param]
        
    # reset torch so that results are consistent
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    metrics, train_times_per_epoch, energy_per_epoch = feed_forward.fit(X_train_computed,
                                                                        Y_train,
                                                                        X_val_computed,
                                                                        Y_val)
    
    # Log average training time per epoch for current parameter set
    # Note: FFN frequently stops early
    training_time = np.mean(train_times_per_epoch)
    training_energy = np.mean(energy_per_epoch) 
    # Compute energy for embedding generation
    embedding_energy = get_energy(embedding_time, device) # This method effectively just computes energy for a given time

    metric_vals = [metrics[mt] for mt in metric_types]
    
    # displaying results in progress bar
    display_recent = metrics["pearson" if task_config.class_type == "R" else "accuracy"]
    display_best = max(display_best, display_recent)
    bar.set_description(f"Best: {display_best:.5f}, Last: {display_recent:.5f}")

    # Write stats to log file
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        logfile.write(f"\nEarly stopped on epoch: {metrics['epoch']}")
        for name, val in zip(metric_types, metric_vals):
            logfile.write(f"\nValidation {name}: {val}")
        logfile.write(f"\nTraining time      : {training_time}") 
        logfile.write(f"\nEmbedding time     : {embedding_time}") 
        logfile.write(f"\nTraining energy    : {training_energy}") 
        logfile.write(f"\nEmbedding energy   : {embedding_energy}") 
    # Write to results csv
    with open(results_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row = metric_vals + [training_time, embedding_time, training_energy, embedding_energy] + list(params.values())
        writer.writerow(row)

In [None]:
results_df = pd.read_csv(results_file)
if task_config.class_type in ["BC", "MC"]:
    print_metric = "accuracy"
elif task_config.class_type == "R":
    print_metric = "pearson"

best = results_df[print_metric].max()
best_row = results_df[results_df[print_metric] == best]

results_df

In [None]:
best_row

# Train & Eval on Test set

## Set best hyperparameter combination

In [None]:
input_size = X_train_computed.shape[1]

# best hyperparameter combination
param_grid = {
    'max_epochs': [7],
    'batch_size': [512],
    'learning_rate': [0.01],
    'category': ['R'],
    'norm': [False],
    'input_size': [768],
    'layer_size': [768],
    'num_classes': [1],
    'num_layers': [5],
    'weight_decay': [0.0001],
    'patience': [5],
    'min_delta': [0],
    'device': ['mps'],
    'verbose': [False]
}

# Create a list of all combinations of hyperparameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
print(f"{len(all_params)} hyperparameter combinations")

In [None]:
y_pred = None

# Create output folder if it doesn't exist
if not os.path.exists('output'):
    os.makedirs('output')

# Setup for logging
console_output_filename = f'./output/{"_".join(embedding_param[0])}_{task_param}_console_output.txt'

with open(console_output_filename, 'a') as logfile:
    logfile.write('\n\nBEGIN FINAL TRAINING LOOP\n\n')

# Setup for saving results
results_folder = pathlib.Path(f"results/{embedding_param[0]}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
test_results_file = results_folder / f"test_{save_file_id}_{model_param}.csv"
test_y_pred_file = results_folder / f"y_pred_{save_file_id}_{model_param}.tsv"

# different metrics are recorded for classification vs regression tasks
if task_config.class_type in ["BC", "MC"]:
    metric_types = ['mcc', 'f1', 'accuracy']
elif task_config.class_type == "R":
    metric_types = ['pearson', 'spearman']

with open(test_results_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = metric_types + ['training time / epoch', 'embedding time', 'training energy / epoch', 'embedding energy'] + list(all_params[0].keys())
    writer.writerow(header)
print(f"saving results to ./{test_results_file}")
# Saves best accuracy for progress bar display
display_best = float("-inf")

# Iterate over all combinations of hyperparameters
bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Formatting params to display
    print_params = params.copy()
    for param in ['category', 'device']:
        del print_params[param]
        
    # reset torch so that results are consistent
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Initialize the model with current set of hyperparameters
    feed_forward = FeedForward(**params)

    _, train_times_per_epoch, energy_per_epoch = feed_forward.fit(X_train_computed,
                                                                Y_train,
                                                                X_val_computed,
                                                                Y_val)
    
    metrics = feed_forward._validate(X_test_computed, Y_test, return_predictions=True)

    y_pred = metrics["predictions"]
    del metrics["predictions"]

    y_pred_df = pd.DataFrame(y_pred, columns=["prediction"])
    y_pred_df["index"] = y_pred_df.index
    y_pred_df = y_pred_df[["index", "prediction"]]
    y_pred_df.to_csv(test_y_pred_file, sep='\t', index=False, header=True)
    print(f"saving predictions to ./{test_y_pred_file}")
    
    # Log average training time per epoch for current parameter set
    # Note: FFN frequently stops early
    training_time = np.mean(train_times_per_epoch)
    training_energy = np.mean(energy_per_epoch) 
    # Compute energy for embedding generation
    embedding_energy = get_energy(embedding_time, device) # This method effectively just computes energy for a given time

    metric_vals = [metrics[mt] for mt in metric_types]
    
    # displaying results in progress bar
    display_recent = metrics["pearson" if task_config.class_type == "R" else "accuracy"]
    display_best = max(display_best, display_recent)
    bar.set_description(f"Best: {display_best:.5f}, Last: {display_recent:.5f}")

    # Write stats to log file
    with open(console_output_filename, 'a') as logfile:
        logfile.write(f"\n\nTraining with parameters:\n{print_params}")
        for name, val in zip(metric_types, metric_vals):
            logfile.write(f"\nValidation {name}: {val}")
        logfile.write(f"\nTraining time      : {training_time}")
        logfile.write(f"\nEmbedding time     : {embedding_time}") 
        logfile.write(f"\nTraining energy    : {training_energy}") 
        logfile.write(f"\nEmbedding energy   : {embedding_energy}") 
    # Write to results csv
    with open(test_results_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row = metric_vals + [training_time, embedding_time, training_energy, embedding_energy] + list(params.values())
        writer.writerow(row)

In [None]:
results_df = pd.read_csv(test_results_file)
results_df