In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
from tqdm import tqdm

import random

# CSIC and Vocab Classes

In [1]:
import os
from typing import List
from collections import Counter

import torch
from torch.utils.data import Dataset

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

from urllib import parse

class CSICDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vocab=None, vocab_size=1000, min_frequency=1, special_tokens=["[UNK]","[CLS]","[PAD]"], tokenization_algorithm="bpe"):
        self.df = df

        # Export text content to csv for learning tokenization; apply BPE
        path = os.path.join('.', 'tokenization_input')

        self.df.to_csv(path_or_buf=path, columns=['content_for_tokenization'], index=False, header=False)

        if vocab == None:
            vocab = Vocab(vocab_size=vocab_size, min_frequency=min_frequency,
                           special_tokens=special_tokens,
                           tokenization_algorithm=tokenization_algorithm)
            vocab.build(corpus_files=[path])

        self.vocab = vocab

        self.encode_df()


    @staticmethod
    def process_df(df: pd.DataFrame):
        # Pre-process data by dropping rows without POST-Data or GET-Query
        get_mask, post_mask = df['GET-Query'].notna(), df['POST-Data'].notna()

        df.loc[get_mask,"content_for_tokenization"] = df.loc[get_mask,"GET-Query"]
        df.loc[post_mask,"content_for_tokenization"] = df.loc[post_mask,"POST-Data"]

        df = df[get_mask | post_mask]
        df = df.drop(columns=["GET-Query","POST-Data", "Accept-Charset", "Accept-Language", "Accept", "Cache-control", "Pragma", "Content-Type", "Host-Header", "Connection"])

        return df

    def encode_df(self):
        # Tokenize the GET-Query and POST-Data columns according to the subword vocabulary learned from BPE
        self.df["tokenized_ids"] = self.df["content_for_tokenization"].apply(lambda x: self.vocab.words2indices(x))
        self.df["tokenized"] = self.df["content_for_tokenization"].apply(lambda x: self.vocab.tokenize(x))
        self.df = self.df.drop(columns=["content_for_tokenization"])

        self.class_encoder, self.method_encoder = LabelEncoder(), LabelEncoder()
        self.df['Class'], self.df['Method'] = self.class_encoder.fit_transform(self.df['Class']), self.class_encoder.fit_transform(self.df['Method'])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = self.df.iloc[index].drop(['Class', 'User-Agent'])
        label = self.df.iloc[index]['Class']

        return features, label

In [2]:
class Vocab(object):
    def __init__(self, vocab_size=0, min_frequency=0, special_tokens: List[str]=[], unk_token="[UNK]", pad_token="[PAD]", tokenizer=None, tokenization_algorithm="bpe"):
        if tokenizer:
            self.tokenizer = tokenizer

            self.word2id = tokenizer.get_vocab()
            self.id2word = {v: k for k, v in self.word2id.items()}

            self.unk_id = self.word2id[unk_token]

        else:
            assert vocab_size > 0
            assert min_frequency > 0

            self.vocab_size = vocab_size
            self.min_frequency = min_frequency
            self.special_tokens = special_tokens
            self.unk_token = unk_token
            self.pad_token = pad_token
            self.tokenization_algorithm = tokenization_algorithm

    def build(self, corpus_files: List[str]):
        if self.tokenization_algorithm == 'bpe':
            tokenizer = Tokenizer(BPE(unk_token=self.unk_token))
            trainer = BpeTrainer(vocab_size=self.vocab_size, min_frequency=self.min_frequency, special_tokens=self.special_tokens)
            tokenizer.pre_tokenizer = ByteLevel()

            tokenizer.train(corpus_files, trainer)

            self.tokenizer = tokenizer
            self.word2id = tokenizer.get_vocab()
            self.id2word = {v: k for k, v in self.word2id.items()}

        elif self.tokenization_algorithm == 'vocab_map':
            self.word2id, self.id2word = dict(), dict()
            curr_id, self.min_frequency = 1,1
            counter = Counter()

            def add_to_vocab(token: str, ignore_cutoff=False):
                nonlocal curr_id
                if (ignore_cutoff or counter[token] >= self.min_frequency) and token not in self.word2id:
                    self.word2id[token] = curr_id
                    self.id2word[curr_id] = token

                    curr_id += 1

            for file_path in corpus_files:
                with open(file_path, 'r') as file:
                    for line in file:
                        tokens = Vocab.parse_req_body_or_params(line)
                        counter.update(tokens)

            unwanted_tokens = [' ','']
            for token in unwanted_tokens:
                if token in counter:
                    del counter[token]

            for token in self.special_tokens:
                add_to_vocab(token, ignore_cutoff=True)

            for token in set(counter.elements()):
                add_to_vocab(token)

        else:
            raise TypeError("Unsupported tokenization algorithm detected")

        self.unk_id = self.word2id[self.unk_token]
        self.pad_id = self.word2id[self.pad_token]

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __setitem__(self, key, value):
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        return len(self.word2id)

    def __repr__(self):
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        return self.id2word[wid]

    def save(self, file_path):
        self.tokenizer.save(path=file_path)

    def words2indices(self, content):
        if self.tokenization_algorithm == 'bpe':
            if type(content) == list:
                return [self.tokenizer.encode(row).ids for row in content]
            else:
                return self.tokenizer.encode(content).ids

        elif self.tokenization_algorithm == 'vocab_map':
            if type(content) == list:
                return [[self[token] for token in Vocab.parse_req_body_or_params(line)] for line in content]
            else:
                return [self[token] for token in Vocab.parse_req_body_or_params(content)]
        else:
            raise TypeError("Unsupported tokenization algorithm detected")

    def tokenize(self, content):
        if self.tokenization_algorithm == 'bpe':
            if type(content) == list:
                return [self.tokenizer.encode(row).tokens for row in content]
            else:
                return self.tokenizer.encode(content).tokens

        elif self.tokenization_algorithm == 'vocab_map':
            if type(content) == list:
                return [[token if self.__contains__(token) else self.unk_token for token in Vocab.parse_req_body_or_params(line)] for line in content]
            else:
                return [token if self.__contains__(token) else self.unk_token for token in Vocab.parse_req_body_or_params(content)]

        else:
            raise TypeError("Unsupported tokenization algorithm detected")

    @staticmethod
    def parse_req_body_or_params(line: str):
        parsed_line = parse.parse_qs(parse.unquote_plus(string=line))

        tokens = []
        for k, v in parsed_line.items():
            tokens.append(k)
            tokens.extend(v)

        return tokens

    @staticmethod
    def load(file_path: str):
        return Vocab(tokenizer=Tokenizer.from_file(file_path))

# Data Loading

In [5]:
# Defining global constants
RANDOM_SEED = 42
BATCH_SIZE = 64

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

In [6]:
# This is how we select a GPU if it's available on your computer or in the Colab environment.
print('Device of execution - ', device)

Device of execution -  cpu


In [7]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('./dataset.csv')
df = CSICDataset.process_df(df)

# # The following two lines are used to load the indices of the training and validation sets
# train_indices = np.load('./dataset/train_indices.npy')
# val_indices = np.load('./dataset/val_indices.npy')

# # The following two lines are used to select the training and validation sets from the dataframe based on the indices loaded above
# train_data = df.loc[train_indices].reset_index(drop=True)
# val_data = df.loc[val_indices].reset_index(drop=True)

train_data, val_data = train_test_split(df, test_size=0.2)

In [8]:
train_dataset = CSICDataset(df=train_data, vocab_size=5000, min_frequency=1, tokenization_algorithm='bpe')
train_vocab = train_dataset.vocab

val_dataset = CSICDataset(df=val_data, vocab=train_vocab)

train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)

In [9]:
# Check Dataset Lengths
assert len(train_dataset) == 45319, "Training Dataset is of incorrect size"
assert len(val_dataset) == 11330, "Validation Dataset is of incorrect size"

print('Training and Validation dataset sizes match!')

Training and Validation dataset sizes match!


In [10]:
PADDING_VALUE = train_vocab.pad_id

In [11]:
def collate_fn(batch, padding_value=PADDING_VALUE):
    # Batch is of the form List[Tuple(Features(tokenized_ids,...), Labels)]
    sequences = [torch.tensor(sample[0]['tokenized_ids'], dtype=torch.long, device=device) for sample in batch]
    padded_tokens = torch.nn.utils.rnn.pad_sequence(sequences=sequences,batch_first=True, padding_value=padding_value)
    attention_mask = (padded_tokens != padding_value).float()
    labels = torch.tensor([sample[1] for sample in batch])

    return padded_tokens, attention_mask, labels

In [12]:
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)

In [13]:
for tokens, attention_masks, labels in train_iterator:
    print(f'tokens: {tokens.shape}')
    print(f'atteniton masks: {attention_masks.shape}')
    print(f'labels: {labels.shape}')
    break

tokens: torch.Size([64, 199])
atteniton masks: torch.Size([64, 199])
labels: torch.Size([64])


# DistillBERT Model

In [23]:
import transformers
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
# from torchsummary import summary
from tqdm import tqdm

# add python path to include src directory
import sys
sys.path.insert(0, '../src')

# standard library imports
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple
import math

# related third party imports
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from transformers import BertTokenizer
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import time

In [27]:
class BertTrainer:
    """ A training and evaluation loop for PyTorch models with a BERT like architecture. """


    def __init__(
        self,
        model,
        train_dataloader,
        eval_dataloader=None,
        epochs=1,
        lr=5e-04,
        output_dir='./',
        output_filename='model_state_dict.pt',
        save=False,
        tabular=False,
    ):
        """
        Args:
            model: torch.nn.Module: = A PyTorch model with a BERT like architecture,
            tokenizer: = A BERT tokenizer for tokenizing text input,
            train_dataloader: torch.utils.data.DataLoader =
                A dataloader containing the training data with "text" and "label" keys (optionally a "tabular" key),
            eval_dataloader: torch.utils.data.DataLoader =
                A dataloader containing the evaluation data with "text" and "label" keys (optionally a "tabular" key),
            epochs: int = An integer representing the number epochs to train,
            lr: float = A float representing the learning rate for the optimizer,
            output_dir: str = A string representing the directory path to save the model,
            output_filename: string = A string representing the name of the file to save in the output directory,
            save: bool = A boolean representing whether or not to save the model,
            tabular: bool = A boolean representing whether or not the BERT model is modified to accept tabular data,
        """

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.loss_fn = nn.CrossEntropyLoss()
        self.output_dir = output_dir
        self.output_filename = output_filename
        self.save = save
        self.eval_loss = float('inf')  # tracks the lowest loss so as to only save the best model
        self.epochs = epochs
        self.epoch_best_model = 0  # tracks which epoch the lowest loss is in so as to only save the best model
        self.tabular = tabular

    def train(self, evaluate=False):
        """ Calls the batch iterator to train and optionally evaluate the model."""
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)

    def evaluate(self):
        """ Calls the batch iterator to evaluate the model."""
        epoch=0
        self.iteration(epoch, self.eval_dataloader, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """ Iterates through one epoch of training or evaluation"""

        # initialize variables
        loss_accumulated = 0.
        correct_accumulated = 0
        samples_accumulated = 0
        preds_all = []
        labels_all = []

        self.model.train() if train else self.model.eval()

        # progress bar
        mode = "train" if train else "eval"
        batch_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc=f"EP ({mode}) {epoch}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        total_comp_time = 0
        # iterate through batches of the dataset
        for i, batch in batch_iter:
            # print("Batch: ", batch)
            # print(len(batch))
            # tokenize data
            # batch_t = self.tokenizer(
            #     batch[0],
            #     padding='max_length',
            #     max_length=512,
            #     truncation=True,
            #     return_tensors='pt',
            # )
            # batch_t = {key: value.to(self.device) for key, value in batch_t.items()}
            # batch_t["input_labels"] = batch["label"].to(self.device)
            # batch_t["tabular_vectors"] = batch["tabular"].to(self.device)
            # batch = {key: value.to(self.device) for key, value in batch.items()}
            # token_types = torch.zeros((len(batch[0][0])), dtype=torch.long).to(self.device)
            # forward pass - include tabular data if it is a tabular model

            start = time.time()
            logits = self.model(
                input_ids=batch[0],
                attention_mask=batch[1],
            ).logits
            total_comp_time += time.time() - start

            # calculate loss
            labels = batch[2].to(self.device)
            loss = self.loss_fn(logits, labels)

            # compute gradient and and update weights
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # calculate the number of correct predictions
            preds = logits.argmax(dim=-1)
            correct = preds.eq(labels).sum().item()

            # accumulate batch metrics and outputs
            loss_accumulated += loss.item()
            correct_accumulated += correct
            samples_accumulated += len(batch[2])
            preds_all.append(preds.detach())
            labels_all.append(labels.detach())


        average_comp_time = total_comp_time / len(batch_iter)
        print("Average Comp Time: ", average_comp_time)

        # concatenate all batch tensors into one tensor and move to cpu for compatibility with sklearn metrics
        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()

        # metrics
        accuracy = accuracy_score(labels_all, preds_all)
        precision = precision_score(labels_all, preds_all, average='macro')
        recall = recall_score(labels_all, preds_all, average='macro')
        f1 = f1_score(labels_all, preds_all, average='macro')
        avg_loss_epoch = loss_accumulated / len(data_loader)

        # print metrics to console
        print(
            f"samples={samples_accumulated}, \
            correct={correct_accumulated}, \
            acc={round(accuracy, 4)}, \
            recall={round(recall, 4)}, \
            prec={round(precision,4)}, \
            f1={round(f1, 4)}, \
            loss={round(avg_loss_epoch, 4)}"
        )

        # save the model if the evaluation loss is lower than the previous best epoch
        if self.save and not train and avg_loss_epoch < self.eval_loss:

            # create directory and filepaths
            dir_path = Path(self.output_dir)
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / f"{self.output_filename}_epoch_{epoch}.pt"

            # delete previous best model from hard drive
            if epoch > 0:
                file_path_best_model = dir_path / f"{self.output_filename}_epoch_{self.epoch_best_model}.pt"
                !rm -f $file_path_best_model

            # save model
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict()
            }, file_path)

            # update the new best loss and epoch
            self.eval_loss = avg_loss_epoch
            self.epoch_best_model = epoch

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import time

# Load the DistilBERT tokenizer and model
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

In [None]:
trainer_distill_bert = BertTrainer(
    model,
    lr=0.001,
    epochs=10,
    train_dataloader=train_iterator,
    eval_dataloader=val_iterator,
    output_dir='../models/distill_bert',
    output_filename='distill_bert',
    save=True,
)

trainer_distill_bert.train(evaluate=True)

# Define a function to preprocess the input text
# def preprocess(text):
#     encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
#     return encoding

# Example usage
# text = "This movie was absolutely fantastic! I loved every minute of it."

# # Preprocess the input text
# encoding = preprocess(text)

# Pass the input through the DistilBERT model




# output = model(**encoding)

# # Get the predicted class label
# predicted_label = output.logits.argmax(-1).item()

# print(f"Predicted class label: {predicted_label}")

EP (train) 0: 100%|| 1417/1417 [38:10<00:00,  1.62s/it]


Average Comp Time:  0.5582960563893881
samples=45319,             correct=25637,             acc=0.5657,             recall=0.5,             prec=0.5,             f1=0.3736,             loss=0.6872


EP (eval) 0: 100%|| 355/355 [03:26<00:00,  1.72it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Average Comp Time:  0.47950864106836455
samples=11330,             correct=6441,             acc=0.5685,             recall=0.5,             prec=0.2842,             f1=0.3624,             loss=0.6853


EP (train) 1: 100%|| 1417/1417 [39:49<00:00,  1.69s/it]


Average Comp Time:  0.5635541629522051
samples=45319,             correct=25709,             acc=0.5673,             recall=0.5002,             prec=0.5124,             f1=0.3649,             loss=0.6844


EP (eval) 1: 100%|| 355/355 [03:31<00:00,  1.68it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Average Comp Time:  0.4942598383191606
samples=11330,             correct=6441,             acc=0.5685,             recall=0.5,             prec=0.2842,             f1=0.3624,             loss=0.6838


EP (train) 2: 100%|| 1417/1417 [42:25<00:00,  1.80s/it]


Average Comp Time:  0.5809059459387457
samples=45319,             correct=25704,             acc=0.5672,             recall=0.5001,             prec=0.5074,             f1=0.3653,             loss=0.6854


EP (eval) 2: 100%|| 355/355 [03:22<00:00,  1.76it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Average Comp Time:  0.47213033958220146
samples=11330,             correct=6441,             acc=0.5685,             recall=0.5,             prec=0.2842,             f1=0.3624,             loss=0.6838


EP (train) 3:  51%|| 721/1417 [19:00<20:04,  1.73s/it]

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import time

# Load the DistilBERT tokenizer and model
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

In [None]:
checkpoint = torch.load('distill_bert_epoch_1.pt')
model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
model.eval()

In [None]:
total_comp_time = 0
count = 0
for tokens, attention_masks, labels in val_iterator:
    print(f'tokens: {tokens.shape}')
    print(f'atteniton masks: {attention_masks.shape}')
    print(f'labels: {labels.shape}')
    start = time.time()
    output = model(tokens)
    total_comp_time += time.time() - start
    count += 1
print(count)
print(len(val_iterator))
print("Average Time: ", total_comp_time/len(val_iterator))