# FInd Optimal Hyperparameters using Grid Search CV

#

In [1]:
import os
import shutil
from copy import deepcopy
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from transformers import AdamW

from utils.functions import load_model, load_extraction_model
from utils.args_helper import get_parser, append_dataset_args
from torch.utils.data import DataLoader
from data_utils.ag_news.whitening import BertWhiteningDataset, BertWhiteningDataLoader
from utils.forward_fn import forward_word_classification, modified_forward_word_classification
from utils.metrics import news_categorization_metrics_fn
import time


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Andri\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
!pip install skorch




[notice] A new release of pip available: 22.2.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
args = {
        'device': 0,
        'seed': 88,
        'lr': 2e-3,
        'eps': 1e-8,
        'max_seq_len': 512,
        'num_labels': 4,
        'model_name': 'bilstm-dim-reduction',
        'task': 'sequence_classification',
        'num_labels': BertWhiteningDataset.NUM_LABELS,
        'dataset_class': BertWhiteningDataset,
        'dataloader_class': BertWhiteningDataLoader,
        'extract_model': 'bert-base-uncased',
        'forward_fn': modified_forward_word_classification,
        'metrics_fn': news_categorization_metrics_fn,
        'valid_criterion': 'F1',
        'train_set_path': './dataset/ag-news/train.csv',
        'valid_set_path': './dataset/ag-news/valid.csv',
        'test_set_path': './dataset/ag-news/test.csv',
        'vocab_path': "",
        'train_batch_size': 128,
        'valid_batch_size': 4,
        'vocab_path': "",
        'lower': True,
        'no_special_token': True,
        'k_fold': 1
    }


In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [3]:
def efficiency_metrics_wrapper(function):
    def wrapper(*args, **kwargs):

        start_time = time.time()  # Record the start time
        result = function(*args, **kwargs)
        elapsed_time = time.time() - start_time  # Calculate the elapsed time
        gpu_used = torch.cuda.max_memory_allocated() / (1024 * 1024) # in MB

        return result, elapsed_time, gpu_used
    
    return wrapper

In [7]:
from torch.utils.data import ConcatDataset

extract_model, extract_tokenizer, a, b_ = load_extraction_model(args)
train_dataset_path = args['train_set_path']
train_dataset = args['dataset_class'](args['device'], train_dataset_path, extract_tokenizer, extract_model, args['max_seq_len'],  lowercase=args["lower"], no_special_token=args['no_special_token'])


valid_dataset_path = args['valid_set_path']
valid_dataset = args['dataset_class'](args['device'], valid_dataset_path, extract_tokenizer, extract_model, args['max_seq_len'], lowercase=args["lower"], no_special_token=args['no_special_token'])


test_dataset_path = args['test_set_path']
test_dataset = args['dataset_class'](args['device'], test_dataset_path, extract_tokenizer, extract_model, args['max_seq_len'], lowercase=args["lower"], no_special_token=args['no_special_token'])




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Word Embedding Process: 7600 / 7600 words | GPU Usages: 839.4 3            

In [10]:
# Merge datasets
merged_dataset = ConcatDataset([train_dataset, valid_dataset, test_dataset])

# Create a DataLoader for the merged dataset
merged_loader = args['dataloader_class'](dataset=merged_dataset, batch_size=args['train_batch_size'], shuffle=True)

In [11]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

class BiLSTMLayer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(BiLSTMLayer, self).__init__()

        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
    
    def forward(self, input):
        outputs, _ = self.lstm(input)
        return outputs[:, -1, :]

class BiLSTMForWordClassification(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size):
        super(BiLSTMForWordClassification, self).__init__()

        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            BiLSTMLayer(input_size=input_size, hidden_size=hidden_size, num_layers=2),
            nn.Linear(hidden_size*2, num_classes),
            nn.ReLU()
        )

        # self.classifier = nn.Sequential(
        #     nn.Dropout(0.1),
        #     BiLSTMLayer(input_size=input_size, hidden_size=hidden_size, num_layers=2),
        #     nn.Linear(hidden_size*2, num_classes),
        #     nn.ReLU(),
        #     # nn.Dropout(0.1),
        #     nn.Softmax(dim=1)
        # )
        # self.classifier = nn.Linear(input_size, num_classes)
        # self.dropout = nn.Dropout()

        self.num_classes = num_classes

    def forward(self, vector):
        logits = self.classifier(vector)
        outputs = F.softmax(logits, dim=1)
        
        return outputs

In [12]:
model = BiLSTMForWordClassification(args['num_labels'], input_size=256, hidden_size=32)

In [10]:
# !pip install optuna

In [13]:
 # Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

 # Set random seed
set_seed(args['seed'])  # Added here for reproductibility    

w2i, i2w = args['dataset_class'].LABEL2INDEX, args['dataset_class'].INDEX2LABEL
metrics_scores = []
result_dfs = []
efficiency_metrics = []

print(model)

BiLSTMForWordClassification(
  (classifier): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): BiLSTMLayer(
      (lstm): LSTM(256, 32, num_layers=2, batch_first=True, bidirectional=True)
    )
    (2): Linear(in_features=64, out_features=4, bias=True)
    (3): ReLU()
  )
)


In [16]:
# Learning parameters. 
lr = 0.001
epochs = 20
device = 'cpu'
print(f"Computation device: {device}\n")

# Loss function. Required for defining `NeuralNetClassifier`
criterion = CrossEntropyLoss()

# Define hyperparameters to search
params = {
    'lr': [2e-3, 2e-4, 5e-3, 5e-4],
    'max_epochs': list(range(1, 10, 1)),
    'module__hidden_size': [32, 64, 128],
    'optimizer__eps': [1e-7, 1e-6, 1e-8],  # Add epsilon values for numerical stability
    'optimizer': [torch.optim.Adam, torch.optim.AdamW, torch.optim.Adamax],
    
}

Computation device: cpu



In [15]:
# from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
# from ignite.handlers import ModelCheckpoint
# from ignite.contrib.handlers import GridSearch

# # Define the training function
# def train_fn(engine, batch):
#     model.train()
#     optimizer.zero_grad()
#     x, y = batch
#     y_pred = model(x)
#     loss = criterion(y_pred, y)
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# # Set up the trainer and evaluator
# trainer = create_supervised_trainer(model, optimizer, criterion, device='cuda' if torch.cuda.is_available() else 'cpu')
# evaluator = create_supervised_evaluator(model, metrics={'accuracy': ignite.metrics.Accuracy()}, device='cuda' if torch.cuda.is_available() else 'cpu')

# # Set up the parameter grid search
# gs = GridSearch(trainer, params, evaluator, score_name="accuracy", smaller_is_better=False)

# # Attach the grid search to the trainer
# gs.attach(trainer)

# # Run the training loop
# trainer.run(dataloader, max_epochs=10)

from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Create skorch NeuralNetClassifier
net = NeuralNetClassifier(
    BiLSTMForWordClassification,
    module__input_size=256,  # Assuming input size is the number of features
    module__num_classes=4,  # Adjust based on the number of classes in your task
    module__hidden_size=32,
    max_epochs=10,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    iterator_train__shuffle=True,
    device=device
)


In [17]:

"""
Define `GridSearchCV`.
4 lrs * 7 max_epochs * 4 module__first_conv_out * 3 module__first_fc_out
* 2 CVs = 672 fits.
"""
gs = GridSearchCV(
    net, params, refit=False, scoring='accuracy', verbose=1, cv=2
)
counter = 0
# Run each fit for 2 batches. So, if we have `n` fits, then it will
# actually for `n*2` times. We have 672 fits, so total, 
# 672 * 2 = 1344 runs.
search_batches = 2
"""
This will run `n` (`n` is calculated from `params`) number of fits 
on each batch of data, so be careful.
If you want to run the `n` number of fits just once, 
that is, on one batch of data,
add `break` after this line:
    `outputs = gs.fit(image, labels)`
Note: This will take a lot of time to run
"""
for i, data in enumerate(merged_loader):
    counter += 1
    vector_batch, label_batch = data[:-1]
    # Prepare input & label
    vector_batch = torch.FloatTensor(vector_batch)
    label_batch = torch.LongTensor(label_batch)

    if device == "cuda":
        vector_batch = vector_batch.cuda()
        label_batch = label_batch.cuda()
    
    outputs = gs.fit(vector_batch, label_batch)

    # GridSearch for `search_batches` number of times.
    if counter == search_batches:
        break

print('SEARCH COMPLETE')
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

# Print the best parameters and accuracy
# print("Best Parameters:", best_params)
# print("Test Accuracy:", accuracy)

Fitting 2 folds for each of 972 candidates, totalling 1944 fits
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.3872[0m       [32m0.2308[0m        [35m1.3868[0m  0.0521
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.3872[0m       [32m0.2308[0m        [35m1.3868[0m  0.0050
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.3873[0m       [32m0.2308[0m        [35m1.3868[0m  0.0060
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.3872[0m       [32m0.2308[0m        [35m1.3867[0m  0.0070
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.3860[0m

In [15]:
# Suppose you have logits for three classes
m = torch.nn.Softmax(dim=1)

logits = torch.rand((128, 4))
print(logits.size())

print(m(logits).size())

# Using view(-1) to flatten the logits tensor

print(logits.size())
print(logits.view(-1, 4).size())

torch.Size([128, 4])
torch.Size([128, 4])
torch.Size([128, 4])
torch.Size([128, 4])


In [16]:
random_tensor = torch.rand(128)
labels_tensor = 3 * random_tensor + 1
print(label_batch)

tensor([3, 0, 1, 3, 3, 0, 0, 3, 3, 3, 2, 1, 2, 3, 0, 1, 1, 0, 1, 2, 2, 1, 3, 2,
        2, 1, 1, 2, 0, 3, 2, 0, 3, 1, 0, 3, 1, 1, 2, 3, 3, 1, 0, 0, 0, 0, 2, 1,
        2, 0, 1, 3, 2, 2, 1, 3, 3, 1, 3, 3, 3, 3, 0, 3, 1, 1, 0, 0, 2, 1, 1, 3,
        2, 0, 3, 0, 0, 1, 1, 1, 2, 1, 0, 1, 0, 3, 0, 2, 0, 2, 1, 3, 0, 0, 3, 1,
        0, 0, 2, 1, 1, 2, 3, 3, 1, 2, 3, 2, 0, 3, 0, 2, 2, 3, 0, 0, 1, 3, 0, 1,
        1, 0, 0, 1, 3, 0, 1, 3])


In [17]:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, 4), label_batch.view(-1))
loss

tensor(1.4468)

In [18]:
from ignite.metrics import ClassificationReport

metric = ClassificationReport(output_dict=True)
metric.attach(default_evaluator, "cr")

state = default_evaluator.run([[logits, label_batch]])
print(state.metrics["cr"].keys())
print(state.metrics["cr"]["0"])
print(state.metrics["cr"]["1"])
print(state.metrics["cr"]["2"])
print(state.metrics["cr"]["macro avg"])

NameError: name 'default_evaluator' is not defined