In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
from tqdm import tqdm

import random

from dataset import CSICDataset, Vocab

In [2]:
# Global constants used by the cells below.
RANDOM_SEED = 42
BATCH_SIZE = 64

# Seed the Python, NumPy and PyTorch random number generators with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Pick the execution device: CUDA first, then the MPS backend, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [3]:
# Report which device was selected earlier (CUDA GPU, MPS, or CPU); the selection itself happens where `device` is defined.
print('Device of execution - ', device)

Device of execution -  mps


In [4]:
# Read the raw CSV and pass it directly through the dataset's preprocessing step.
df = CSICDataset.process_df(pd.read_csv('./dataset/dataset.csv'))

# Training split: load the saved index labels, select those rows, renumber from 0.
train_indices = np.load('./dataset/train_indices.npy')
train_data = df.loc[train_indices].reset_index(drop=True)

# Validation split: same procedure with its own saved index labels.
val_indices = np.load('./dataset/val_indices.npy')
val_data = df.loc[val_indices].reset_index(drop=True)

In [5]:
# Training side: build the dataset with BPE tokenization settings and keep its vocabulary.
train_dataset = CSICDataset(
    df=train_data,
    vocab_size=5000,
    min_frequency=1,
    tokenization_algorithm='bpe',
)
train_vocab = train_dataset.vocab
train_sampler = RandomSampler(train_dataset)

# Validation side: the dataset is handed the training vocabulary.
val_dataset = CSICDataset(df=val_data, vocab=train_vocab)
val_sampler = RandomSampler(val_dataset)






In [6]:
# Check Dataset Lengths
# The expected sizes are hard-coded for the current train/validation split;
# they must be updated by hand if the split changes.
assert len(train_dataset) == 45319, "Training Dataset is of incorrect size"
assert len(val_dataset) == 11330, "Validation Dataset is of incorrect size"

print('Training and Validation dataset sizes match!')

Training and Validation dataset sizes match!


In [7]:
# Padding token id taken from the training vocabulary; used when padding batches and as the embedding's padding index.
PADDING_VALUE = train_vocab.pad_id

In [8]:
def collate_fn(batch, padding_value=None):
    """
    Collate a list of dataset samples into one right-padded batch.

    Each sample is indexed as ``sample[0]['tokenized_ids']`` (a sequence of
    integer token ids) and ``sample[1]`` (its label).

    Both returned tensors are created on the CPU. Keeping device placement out
    of the collate function keeps tokens and labels on the same device and
    keeps the function usable from DataLoader worker processes; the consumer
    is expected to move the batch to its target device.

    Args:
        batch (list): Samples of the form ``(features, label)``.
        padding_value (int, optional): Id used to pad shorter sequences on the
            right. When ``None`` (the default) the module-level
            ``PADDING_VALUE`` is read at call time, so re-running the cell that
            defines it is picked up without redefining this function.

    Returns:
        padded_tokens (torch.Tensor): int64 tensor of shape
            ``(len(batch), longest sequence in the batch)``.
        labels (torch.Tensor): 1D tensor with one label per sample.
    """
    if padding_value is None:
        padding_value = PADDING_VALUE

    # Batch is of the form List[Tuple(Features(tokenized_ids,...), Labels)]
    sequences = [torch.tensor(sample[0]['tokenized_ids'], dtype=torch.long) for sample in batch]
    padded_tokens = torch.nn.utils.rnn.pad_sequence(
        sequences=sequences, batch_first=True, padding_value=padding_value
    )

    labels = torch.tensor([sample[1] for sample in batch])

    return padded_tokens, labels

In [9]:
# Wrap both datasets in DataLoaders that share the batch size and the padding collate function.
train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
    collate_fn=collate_fn,
)
val_iterator = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    sampler=val_sampler,
    collate_fn=collate_fn,
)

In [10]:
# Peek at a single training batch to confirm the tensor shapes, then stop iterating.
# x holds the padded token ids and y the labels for that batch.
for x, y in train_iterator:
    print(f'x: {x.shape}')
    print(f'y: {y.shape}')
    break

x: torch.Size([64, 199])
y: torch.Size([64])


In [44]:
class RecurrentWAF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, rec_hidden_size, fc_hidden_size, recurrent_type='LSTM', dropout=None):
        super(RecurrentWAF, self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=PADDING_VALUE)
        
        match recurrent_type:
            case 'LSTM':
                self.recurrent = nn.LSTM(input_size=embedding_dim, hidden_size=rec_hidden_size, batch_first=True)
            case 'RNN':
                self.recurrent = nn.RNN(input_size=embedding_dim, hidden_size=rec_hidden_size, batch_first=True)
            case 'GRU':
                self.recurrent = nn.GRU(input_size=embedding_dim, hidden_size=rec_hidden_size, batch_first=True)
            
            case _:
                raise TypeError("Unsupported Recurrent Layer Type received")

        self.fc = nn.Sequential(
            nn.Linear(in_features=rec_hidden_size, out_features=fc_hidden_size, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=fc_hidden_size, out_features=1, bias=True)
        )

        self.activation = nn.Sigmoid()

        if dropout:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = None

        self.embedding_dim = embedding_dim
        self.rec_hidden_size = rec_hidden_size
        self.fc_hidden_size = fc_hidden_size
        self.recurrent_type = recurrent_type

    def forward(self, input):
        embed = self.embed_input(input)
        
        if self.recurrent_type == 'RNN' or self.recurrent_type == 'GRU':
            _, hidden = self.recurrent(embed)
        else:
            _, (hidden, cell) = self.recurrent(embed)

        hidden = hidden.squeeze(dim=0)
        out = self.activation(self.fc(hidden))

        return out
    
    def embed_input(self, input):
        if self.dropout:
            return self.dropout(self.embed(input))
        else:
            return self.embed(input)

In [45]:
def get_accuracy_and_f1_score(y_true, y_predicted):
    """
    Score predictions against ground-truth labels.

    Both metrics come from the sklearn helpers imported at the top of the
    notebook, called with their default settings.

    Args:
        y_true: 1D sequence of ground truth labels
        y_predicted: 1D sequence of predicted labels

    Returns:
        tuple: ``(accuracy, f1)`` -- the result of ``accuracy_score`` followed
        by the result of ``f1_score``
    """
    return accuracy_score(y_true, y_predicted), f1_score(y_true, y_predicted)


In [46]:
def train_loop(model, criterion, optimizer, iterator):
    """
    Run one training epoch over ``iterator``.
    :param model: The model to be trained
    :param criterion: The loss function, called as criterion(outputs, float labels)
    :param optimizer: The optimizer stepped once per batch
    :param iterator: The training data iterator yielding (inputs, labels) batches
    :return: The sum of the per-batch losses divided by the number of batches
    """
    # Switch the model to training mode
    model.train()

    num_batches = len(iterator)
    running_loss = 0

    progress = tqdm(iterator, total=num_batches, desc="Training Model")
    for inputs, targets in progress:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        inputs = inputs.to(device)
        targets = targets.to(device)

        # Drop the trailing dimension of the model output before computing the loss
        predictions = model(inputs).squeeze(dim=-1)
        batch_loss = criterion(predictions, targets.float())

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / num_batches

In [52]:
def val_loop(model, criterion, iterator):
    """
    This function is used to evaluate a model on the validation set.
    :param model: The model to be evaluated
    :param criterion: The loss function, called as criterion(outputs, float labels)
    :param iterator: The validation data iterator
    :return: true: a Python boolean list of all the ground truth values (label == 1)
             pred: a Python boolean list of all model predictions (output >= 0.5)
             average_loss: The sum of the per-batch losses divided by the number of batches
    """

    true, pred = [], []
    total_loss = 0

    # Set the model to evaluation mode
    model.eval()

    # Don't calculate gradients
    with torch.no_grad():
        for x, y in tqdm(iterator, total=len(iterator), desc="Validating Model"):
            x, y = x.to(device), y.to(device)

            outs = model(x).squeeze(dim=-1)

            loss = criterion(outs, y.float())
            total_loss += loss.item()

            # Threshold and compare the whole batch as tensors, then convert to
            # Python bools once per batch. Iterating the tensors element by
            # element would synchronise with the device for every item.
            pred.extend((outs >= 0.5).tolist())
            true.extend((y == 1).tolist())

    average_loss = total_loss / len(iterator)

    return true, pred, average_loss

In [48]:
# HYPERPARAMETERS:

# Model architecture (passed to RecurrentWAF when the model is built)
EMBEDDING_DIM = 64        # size of each token embedding
REC_HIDDEN_DIM = 64       # hidden size of the recurrent layer
FC_HIDDEN_DIM = 32        # hidden size of the fully connected head
REC_LAYER_TYPE = 'LSTM'   # recurrent layer type: 'LSTM', 'RNN' or 'GRU'
DROPOUT = 0.1             # dropout probability applied to the embeddings

# Optimisation (passed to Adam and the epoch loop)
BETAS = (0.9,0.999)       # Adam betas
LR = 1e-4                 # Adam learning rate
EPOCHS = 10               # number of train/validate passes

In [53]:
# Instantiate the classifier from the hyperparameters and move it to the selected device.
model = RecurrentWAF(
    vocab_size=len(train_vocab),
    embedding_dim=EMBEDDING_DIM,
    rec_hidden_size=REC_HIDDEN_DIM,
    fc_hidden_size=FC_HIDDEN_DIM,
    recurrent_type=REC_LAYER_TYPE,
    dropout=DROPOUT,
).to(device)

In [54]:
# Binary cross-entropy loss on the model outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=LR,
    betas=BETAS,
)

In [55]:
# Train for EPOCHS passes; after each pass, evaluate on the validation set and log loss, accuracy and F1.
# NOTE(review): nothing is checkpointed inside the loop, so the model left in memory afterwards is the
# one from the last epoch, which is not necessarily the best-scoring one.
for epoch in range(EPOCHS):
    train_loss = train_loop(model, criterion, optimizer, train_iterator)
    true, pred, val_loss = val_loop(model, criterion, val_iterator)
    accuracy, f1 = get_accuracy_and_f1_score(true, pred)
    print(f"Epoch {epoch+1} -- Train_Loss: {train_loss} -- Val_Loss: {val_loss} -- Val_Accuracy: {accuracy} -- Val_F1: {f1}")

Training Model: 100%|██████████| 709/709 [01:05<00:00, 10.88it/s]
Validating Model: 100%|██████████| 178/178 [00:14<00:00, 12.19it/s]


Epoch 1 -- Train_Loss: 0.6852185000653663 -- Val_Loss: 0.682008421153165 -- Val_Accuracy: 0.5684907325684024 -- Val_F1: 0.0


Training Model: 100%|██████████| 709/709 [01:04<00:00, 10.91it/s]
Validating Model: 100%|██████████| 178/178 [00:14<00:00, 12.42it/s]


Epoch 2 -- Train_Loss: 0.6789916478023879 -- Val_Loss: 0.6730516790673974 -- Val_Accuracy: 0.5684907325684024 -- Val_F1: 0.0


Training Model: 100%|██████████| 709/709 [01:09<00:00, 10.27it/s]
Validating Model: 100%|██████████| 178/178 [00:15<00:00, 11.49it/s]


Epoch 3 -- Train_Loss: 0.6646653217052034 -- Val_Loss: 0.6526645522439078 -- Val_Accuracy: 0.5684907325684024 -- Val_F1: 0.0


Training Model: 100%|██████████| 709/709 [01:08<00:00, 10.36it/s]
Validating Model: 100%|██████████| 178/178 [00:14<00:00, 12.61it/s]


Epoch 4 -- Train_Loss: 0.6349879104344566 -- Val_Loss: 0.6551228120420756 -- Val_Accuracy: 0.6643424536628421 -- Val_F1: 0.6062739414018015


Training Model: 100%|██████████| 709/709 [01:06<00:00, 10.63it/s]
Validating Model: 100%|██████████| 178/178 [00:14<00:00, 12.32it/s]


Epoch 5 -- Train_Loss: 0.6248709489663666 -- Val_Loss: 0.6392389244577857 -- Val_Accuracy: 0.6685789938217123 -- Val_F1: 0.5665473854322983


Training Model: 100%|██████████| 709/709 [01:09<00:00, 10.23it/s]
Validating Model: 100%|██████████| 178/178 [00:15<00:00, 11.41it/s]


Epoch 6 -- Train_Loss: 0.6141146163341861 -- Val_Loss: 0.5784137320987294 -- Val_Accuracy: 0.6581641659311562 -- Val_F1: 0.7054528861510381


Training Model: 100%|██████████| 709/709 [01:05<00:00, 10.78it/s]
Validating Model: 100%|██████████| 178/178 [00:15<00:00, 11.55it/s]


Epoch 7 -- Train_Loss: 0.6250883101240031 -- Val_Loss: 0.6210690805416429 -- Val_Accuracy: 0.68261253309797 -- Val_F1: 0.6257285595337219


Training Model: 100%|██████████| 709/709 [01:09<00:00, 10.20it/s]
Validating Model: 100%|██████████| 178/178 [00:19<00:00,  9.29it/s]


Epoch 8 -- Train_Loss: 0.6228632140058724 -- Val_Loss: 0.6259712670626265 -- Val_Accuracy: 0.6643424536628421 -- Val_F1: 0.6532956513811651


Training Model: 100%|██████████| 709/709 [01:14<00:00,  9.50it/s]
Validating Model: 100%|██████████| 178/178 [00:15<00:00, 11.71it/s]


Epoch 9 -- Train_Loss: 0.6675716506577345 -- Val_Loss: 0.6773813591244515 -- Val_Accuracy: 0.5849073256840247 -- Val_F1: 0.21183174124350596


Training Model: 100%|██████████| 709/709 [01:17<00:00,  9.13it/s]
Validating Model: 100%|██████████| 178/178 [00:17<00:00, 10.08it/s]

Epoch 10 -- Train_Loss: 0.6701094345918663 -- Val_Loss: 0.6447695939058669 -- Val_Accuracy: 0.6119152691968226 -- Val_F1: 0.3570697470390408





In [59]:
# Persist the trained model. The whole module object is passed to torch.save (not a state_dict).
# NOTE(review): loading it back presumably requires the RecurrentWAF class to be importable, and
# pickled models should only be loaded from trusted files. Assumes ./models already exists -- TODO confirm.
torch.save(model,'./models/rnn_waf.bin')

In [56]:
# Evaluate the in-memory model once more on the validation set and report accuracy and F1.
# val_loss is returned by val_loop but not printed here.
true, pred, val_loss = val_loop(model, criterion, val_iterator)
accuracy, f1 = get_accuracy_and_f1_score(true, pred)
print(f"Final Validation Accuracy: {accuracy}")
print(f"Final Validation F1-Score: {f1}")

Validating Model: 100%|██████████| 178/178 [00:16<00:00, 10.75it/s]

Final Validation Accuracy: 0.6120035304501323
Final Validation F1-Score: 0.35730994152046786



