# Libraries Import

In [38]:
import random
import os
import time
import itertools

import pandas as pd
import numpy as np

## PyTorch
import torch
from torch import nn

from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.utils.class_weight import compute_class_weight

from torch.utils.tensorboard import SummaryWriter
import torcheval.metrics as tm

## Sklearn
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV

from sklearn import preprocessing, decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, ConfusionMatrixDisplay

## Saving, Loading and Plotting
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Var Selection

In [39]:
# Reproducibility seed and input dataset.
SEED = 42
FILENAME = "train_dataset.csv"

# Experiment switches.
clf = 'rf'          # classifier: 'rf', 'svm' or 'knn'
pre = 'std'         # pre-processing: 'pca', 'lda' or 'std'
overfit = True      # True or False; when True a test split is held out

# Derived flag, not meant to be edited by hand: True for the classical
# machine-learning classifiers, False for the deep-learning ones.
ml = clf not in ('ffnn', 'tabnet', 'tabtransf')

# The deep-learning path always works with a held-out test split.
if not ml:
    overfit = True

In [40]:
if not ml:
    # Prefer CUDA, then Apple MPS, and fall back to the CPU. The checks are
    # evaluated lazily, in this order.
    device = torch.device(
        'cuda' if torch.cuda.is_available()
        else 'mps' if torch.backends.mps.is_available()
        else 'cpu'
    )

    print("Device: {}".format(device))

In [41]:
if not ml:
    def fix_random(seed: int) -> None:
        """Seed every random number generator used by the notebook.

        Seeds Python's ``random`` module, NumPy's global generator and
        PyTorch (CPU and CUDA), then switches cuDNN to deterministic mode
        with benchmarking disabled.

        Args:
            seed: the seed to use.
        """
        # Python and NumPy generators.
        random.seed(seed)
        np.random.seed(seed)

        # PyTorch generators.
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        # cuDNN flags: deterministic mode on, benchmarking off.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Dataset

## Parsing

In [42]:
# Load the CSV, drop every row that contains a missing value and discard the
# "label" column (the prediction target used below is the "type" column).
df = pd.read_csv(FILENAME, sep=",", low_memory=False)
df = df.dropna()
df = df.drop(columns=["label"])

# "src_bytes" can contain the string "0.0.0.0"; treat it as missing, convert
# the column to float and impute the gaps with the column mean.
df["src_bytes"] = df["src_bytes"].replace("0.0.0.0", np.nan).astype(float)
mean_src_bytes = df["src_bytes"].mean()
df["src_bytes"] = df["src_bytes"].fillna(mean_src_bytes)

# NOTE(review): the converted frame is not assigned back and ``.dtypes`` is
# discarded, so this line leaves ``df`` unchanged (it only raises if a cast
# fails). Presumably a leftover dtype inspection -- confirm before assigning
# the result, since that would change the dtypes seen by the encoder below.
df.astype({'src_bytes': 'int64', 'ts': 'datetime64[ms]', 'dns_AA': 'bool', 'dns_RD': 'bool', 'dns_RA': 'bool', 'dns_rejected': 'bool', 'ssl_resumed': 'bool', 'ssl_established': 'bool', 'weird_notice': 'bool'}).dtypes

# Separate the target column from the features.
y = df["type"]
df = df.drop(columns=["type"])

# Ordinal-encode every object-typed feature column in place, then export the
# feature matrix as a NumPy array.
oe = preprocessing.OrdinalEncoder()
df_oe = oe.fit_transform(df.select_dtypes(include=['object']))
df.loc[:, df.select_dtypes(include=['object']).columns] = df_oe
X = df.to_numpy()

# Encode the class names as integer labels.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [43]:
if not ml:
    class MyDataset(Dataset):
        """In-memory dataset pairing a feature matrix with integer labels.

        Attributes:
            X: features stored as a float tensor.
            y: labels stored as a long tensor.
            num_features: size of the second dimension of the input matrix.
            num_classes: number of distinct values found in the labels.
        """

        def __init__(self, X, y):
            # Convert once up front so indexing later returns tensors directly.
            features = torch.FloatTensor(X)
            labels = torch.LongTensor(y)
            self.X, self.y = features, labels

            self.num_features = X.shape[1]
            self.num_classes = len(np.unique(y))

        def __len__(self):
            """Return the number of samples (rows of ``X``)."""
            return len(self.X)

        def __getitem__(self, idx):
            """Return the ``(features, label)`` pair stored at ``idx``."""
            sample = self.X[idx, :]
            target = self.y[idx]
            return sample, target

## Splitting

In [44]:
# Optionally hold out a stratified 10% test split; everything that follows
# then works on the remaining 90%.
if overfit:
    indeces = np.arange(len(X))
    train_idx, test_idx = train_test_split(
        indeces,
        test_size=0.1,
        stratify=y,
        random_state=SEED,
    )
    X_test, y_test = X[test_idx, :], y[test_idx]
    X, y = X[train_idx, :], y[train_idx]

# Stratified 80/20 train/validation split of the (remaining) data.
indeces = np.arange(len(X))
train_idx, val_idx = train_test_split(
    indeces,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Fold markers for PredefinedSplit: -1 keeps a sample in the training set of
# every split, 0 assigns it to the single validation fold. Validation rows
# keep the initial 0.
fold = np.zeros(len(X))
fold[train_idx] = -1

ps = PredefinedSplit(fold)
ps.get_n_splits()

X_train, y_train = X[train_idx, :], y[train_idx]
X_val, y_val = X[val_idx, :], y[val_idx]

## Pre-Processing

In [45]:
# Standardize every split with statistics estimated on the training split
# only, so no information from validation/test data leaks into the scaler.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# X (train + validation) is what the grid search is fitted on.
X = scaler.transform(X)

# The held-out test split must go through the same scaler as the training
# data; otherwise the models are evaluated on raw, unscaled features.
if overfit:
    X_test = scaler.transform(X_test)

# Optional projection, fitted on the training split only.
if pre in ('pca', 'lda'):
    # NOTE: ``pre`` is rebound from the option string to the fitted
    # transformer, so re-running this cell without re-running the
    # configuration cell skips this branch.
    if pre == 'pca':
        pre = decomposition.PCA(n_components='mle', svd_solver='full')
        pre.fit(X_train)
    else:
        pre = LinearDiscriminantAnalysis()
        pre.fit(X_train, y_train)

    X = pre.transform(X)

    X_train = pre.transform(X_train)
    X_val = pre.transform(X_val)

    if overfit:
        X_test = pre.transform(X_test)

In [46]:
if not ml:
    # Wrap the train, validation and test splits as tensor datasets.
    train_dataset, val_dataset, test_dataset = (
        MyDataset(features, labels)
        for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    )

    # 'balanced' class weights computed from the training labels, stored as
    # a {position: weight} mapping.
    class_weights = dict(
        enumerate(
            compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
        )
    )
    print(class_weights)

# Train

## Model Selection

### RF

In [47]:
if clf == 'rf':
    # Random-forest candidates: forest size crossed with split criterion.
    param_grid = {
        'n_estimators': [10, 20, 35, 50, 100, 200],
        'criterion': ['gini', 'entropy']
    }

    # Both metrics are reported; the refit model is chosen by balanced accuracy.
    scoring = ['balanced_accuracy', 'f1_weighted']

    # Seed the forest like every other random step in the notebook so that
    # candidate scores and the refit model are reproducible across runs.
    grid = GridSearchCV(
        RandomForestClassifier(random_state=SEED),
        param_grid,
        cv=ps,
        scoring=scoring,
        n_jobs=-1,
        verbose=10,
        refit='balanced_accuracy',
    )

### SVM

In [48]:
if clf == 'svm':
    # SVC candidates: regularization strength, kernel and gamma heuristic.
    param_grid = dict(
        C=[0.1, 1, 10, 100, 1000],
        kernel=['linear', 'rbf', 'poly', 'sigmoid'],
        gamma=['scale', 'auto'],
    )

    # Both metrics are reported; the refit model is chosen by balanced accuracy.
    scoring = ['balanced_accuracy', 'f1_weighted']

    grid = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=scoring,
        refit='balanced_accuracy',
        cv=ps,
        n_jobs=-1,
        verbose=10,
    )

### KNN

In [49]:
if clf == 'knn':
    # KNN candidates: neighbourhood size and Minkowski power parameter.
    param_grid = dict(
        n_neighbors=[10, 20, 50, 100, 500, 775, 900, 1000],
        p=[1, 2],
    )

    # Both metrics are reported; the refit model is chosen by balanced accuracy.
    scoring = ['balanced_accuracy', 'f1_weighted']

    grid = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        scoring=scoring,
        refit='balanced_accuracy',
        cv=ps,
        n_jobs=-1,
        verbose=10,
    )

### FFNN 

In [50]:
if not ml:
    # Architecture
    class FeedForwardPlus(nn.Module):
        """Fully connected classifier with a configurable number of hidden blocks.

        Layout: an input block (Linear -> [BatchNorm] -> ReLU), then ``depth``
        hidden blocks ([Dropout] -> Linear -> [BatchNorm] -> ReLU), then a
        linear output layer that produces one logit per class.

        Args:
            input_size: number of input features.
            num_classes: number of output logits.
            hidden_size: width of every hidden layer.
            depth: number of hidden blocks added after the input block.
            batch_norm: insert ``BatchNorm1d`` after each hidden linear layer.
            drop: dropout probability applied before each hidden linear
                layer; ``0`` disables dropout.

        Raises:
            ValueError: if ``drop`` is outside ``[0, 1]``.
        """

        def __init__(self, input_size, num_classes, hidden_size, depth=1, batch_norm=False, drop=0):
            super().__init__()

            if not 0 <= drop <= 1:
                raise ValueError(f"dropout probability has to be between 0 and 1, but got {drop}")

            layers = [nn.Linear(input_size, hidden_size)]
            if batch_norm:
                layers.append(nn.BatchNorm1d(hidden_size))
            layers.append(nn.ReLU())

            # Fresh modules are created on every iteration: appending the same
            # module instances repeatedly would make all hidden blocks share a
            # single set of weights. Batch norm and dropout can be combined.
            for _ in range(depth):
                if drop > 0:
                    layers.append(nn.Dropout(drop))
                layers.append(nn.Linear(hidden_size, hidden_size))
                if batch_norm:
                    layers.append(nn.BatchNorm1d(hidden_size))
                layers.append(nn.ReLU())

            self.model = nn.Sequential(*layers)

            self.output = nn.Linear(hidden_size, num_classes)

        def forward(self, x):
            """Return the class logits for the input batch ``x``."""
            h = self.model(x)
            out = self.output(h)
            return out


    # Train function (f1 score with torcheval)
    def train_model(model, criterion, optimizer, epoch, scheduler, train_loader, val_loader, device, writer, log_name="model"):
        """Train ``model`` and checkpoint the weights with the lowest validation loss.

        After every epoch the model is evaluated on ``val_loader``; the F1
        score is logged to ``writer`` under ``log_name`` and, whenever the
        validation loss improves, the state dict is saved to
        ``models/<log_name>``.

        Args:
            model: network to optimize (already moved to ``device``).
            criterion: loss applied to ``(logits, targets)``.
            optimizer: optimizer bound to ``model``'s parameters.
            epoch: number of epochs to run.
            scheduler: learning-rate scheduler, stepped once per epoch.
            train_loader: iterable of ``(data, targets)`` training batches.
            val_loader: iterable of ``(data, targets)`` validation batches.
            device: device the batches are moved to.
            writer: object exposing ``add_scalar(tag, value, step)``.
            log_name: scalar tag and checkpoint file name.

        Returns:
            ``(model, best_valid_loss)``. The returned model holds the weights
            of the LAST epoch; the best weights are the ones saved on disk.
        """
        best_valid_loss = float('inf')

        # A separate loop variable keeps the ``epoch`` argument intact.
        for current_epoch in range(epoch):
            model.train()

            for data, targets in train_loader:
                data, targets = data.to(device), targets.to(device)

                optimizer.zero_grad()

                # Forward pass
                y_pred = model(data)

                # Compute Loss
                loss = criterion(y_pred, targets)

                # Backward pass
                loss.backward()
                optimizer.step()

            # Validation needs no gradients: disabling autograd avoids
            # keeping the graph of the whole validation set in memory.
            with torch.no_grad():
                labels, _, y_pred = test_model(model, val_loader, device)
                loss_val = criterion(y_pred, labels)

            # Log the f1 score. The class count comes from the logit width so
            # it stays correct when the highest class is absent from the
            # validation labels.
            f1 = tm.MulticlassF1Score(num_classes=y_pred.shape[1])
            f1.update(y_pred, labels)
            writer.add_scalar(log_name, f1.compute().item(), current_epoch)

            # Save the best model (based on the validation loss)
            if loss_val.item() < best_valid_loss:
                best_valid_loss = loss_val.item()
                os.makedirs('models', exist_ok=True)
                torch.save(model.state_dict(), 'models/'+log_name)

            scheduler.step()

        return model, best_valid_loss


    # Evaluate the performance on validation and test sets
    def test_model(model, data_loader, device):
        """Run ``model`` over ``data_loader`` and collect targets and predictions.

        The model is switched to evaluation mode and inference runs with
        autograd disabled, so the returned tensors carry no gradient history.

        Args:
            model: network to evaluate; called on each batch of data.
            data_loader: iterable of ``(data, targets)`` batches.
            device: device the batches are moved to before the forward pass.

        Returns:
            ``(y_test, y_pred_c, y_pred)``: the targets concatenated over all
            batches, the predicted class indices (argmax over dimension 1 of
            the model outputs) and the raw model outputs concatenated along
            the batch dimension.
        """
        model.eval()
        batch_outputs = []
        batch_targets = []

        with torch.no_grad():
            for data, targets in data_loader:
                data, targets = data.to(device), targets.to(device)
                # Keep whole batches; extending a list with a tensor would
                # split it into one tiny tensor per sample.
                batch_outputs.append(model(data))
                batch_targets.append(targets)

        # Concatenating keeps the batch dimension even for a single sample,
        # so the argmax over dimension 1 below is always valid.
        y_test = torch.cat(batch_targets)
        y_pred = torch.cat(batch_outputs)
        y_pred_c = y_pred.argmax(dim=1)

        return y_test, y_pred_c, y_pred


    # Train settings
    batch_sizes = [256, 512]
    hidden_sizes = [16, 32, 64] # 64
    batch_norm_list = [False, True]
    drop = 0
    depths = [2, 4, 8, 16]
    num_epochs = 10
    learning_rate = 0.01
    gammas = [1, 0.5]
    # StepLR expects an integer number of epochs between decays. The float
    # num_epochs / 4 (= 2.5) only matched whole epochs at multiples of 5, so
    # the rate decayed far less often than intended.
    step_size = max(1, num_epochs // 4)

    # Materialize the grid: a bare itertools.product is a one-shot iterator,
    # so re-running the fitting cell would silently train nothing.
    hyperparameters = list(itertools.product(batch_sizes, hidden_sizes, depths, gammas, batch_norm_list))

    # Best-run bookkeeping, updated by the fitting cell.
    lowest_loss = float('inf')
    best_model_params = None
    best_model = None

## Fitting

In [None]:
if ml:
    # Classical ML path: ``grid`` evaluates every candidate on the predefined
    # train/validation split (``cv=ps``) and refits the best one on X.
    grid.fit(X, y)
    print("---------------------------------")
    print("Best hyper: ", grid.best_estimator_)
    print("Best score: ", grid.best_score_)
    best_clf = grid.best_estimator_
else:
    # Deep-learning path: train one network per hyperparameter combination
    # and remember the run with the lowest validation loss.
    for batch_size, hidden_size, depth, gamma, batch_norm in hyperparameters:
        # Re-seed before every run so the runs are comparable.
        fix_random(SEED)

        start = time.time()
        log_name = "B"+str(batch_size)+"-dim"+str(hidden_size)+"-dp"+str(depth)+"-ep"+str(num_epochs)+"-lr"+str(learning_rate)+"-steplr"+str(step_size)+"-gamma"+str(gamma)+"-BN"+str(batch_norm)+"-drop"+str(drop)
        print(log_name, end=", ")

        writer = SummaryWriter('runs/'+log_name)

        # Create Dataloaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Define Architecture
        model = FeedForwardPlus(train_dataset.num_features, train_dataset.num_classes, hidden_size, depth, batch_norm=batch_norm)
        model.to(device)

        # Define Loss and Optimizer
        criterion = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(list(class_weights.values())).to(device))
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

        # Train the model; train_model checkpoints the best-validation weights
        # of this run to 'models/<log_name>'.
        model, best_valid_loss = train_model(model, criterion, optimizer, num_epochs, scheduler, train_loader, val_loader, device, writer, log_name)

        writer.add_hparams({'hparam/bsize': batch_size, 'hparam/hidden size': hidden_size, 'hparam/depth':depth+2, 'hparam/scheduler': gamma,'hparam/batch norm': batch_norm}, {'best loss': best_valid_loss})
        writer.flush()
        # Close each run's writer; closing only after the loop would leave
        # every writer but the last one open.
        writer.close()

        if best_valid_loss < lowest_loss:
            best_model = model
            lowest_loss = best_valid_loss
            best_model_params = (batch_size, hidden_size, depth, gamma, batch_norm, log_name)

        # Log the elapsed time
        print("-- elpased time:", time.time() - start)

    if best_model_params is None:
        raise RuntimeError("No run produced a finite validation loss; there is no best model to save.")

    # Rebuild the network with the BEST run's hyperparameters (not those of
    # the last loop iteration) and restore the checkpoint with the lowest
    # validation loss that train_model wrote for that run.
    _, best_hidden_size, best_depth, _, best_batch_norm, best_log_name = best_model_params
    model = FeedForwardPlus(train_dataset.num_features, train_dataset.num_classes, best_hidden_size, best_depth, batch_norm=best_batch_norm)
    model.load_state_dict(torch.load('models/'+best_log_name, map_location=device))
    model.to(device)
    best_model = model

    # Save the best model and its hyperparameters
    torch.save(best_model.state_dict(), 'models/best_model')
    with open('best_model_params.pkl', 'wb') as f:
        pickle.dump(best_model_params, f)

Fitting 1 folds for each of 12 candidates, totalling 12 fits
[CV 1/1; 5/12] START criterion=gini, n_estimators=100...........................
[CV 1/1; 8/12] START criterion=entropy, n_estimators=20.........................
[CV 1/1; 2/12] START criterion=gini, n_estimators=20............................
[CV 1/1; 7/12] START criterion=entropy, n_estimators=10.........................
[CV 1/1; 1/12] START criterion=gini, n_estimators=10............................
[CV 1/1; 4/12] START criterion=gini, n_estimators=50............................
[CV 1/1; 6/12] START criterion=gini, n_estimators=200...........................
[CV 1/1; 3/12] START criterion=gini, n_estimators=35............................
[CV 1/1; 7/12] END criterion=entropy, n_estimators=10; balanced_accuracy: (test=0.999) f1_weighted: (test=1.000) total time=  20.7s
[CV 1/1; 1/12] END criterion=gini, n_estimators=10; balanced_accuracy: (test=0.999) f1_weighted: (test=1.000) total time=  21.2s
[CV 1/1; 9/12] START criterion

## Testing

In [None]:
if ml:
    if overfit:
        # A held-out test split exists: report the confusion matrix and the
        # per-class metrics on it, predicting only once.
        y_test_pred = best_clf.predict(X_test)
        cm = confusion_matrix(y_test, y_test_pred, labels=best_clf.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_clf.classes_)
        disp.plot()
        print(classification_report(y_test, y_test_pred))
    else:
        # No test split: show the confusion matrix on the validation split.
        cm = confusion_matrix(y_val, best_clf.predict(X_val), labels=best_clf.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_clf.classes_)
        disp.plot()
else:
    # Evaluate the best model on the test set
    test_loader = DataLoader(test_dataset, batch_size=batch_sizes[0])

    model.to(device)
    model.eval()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        y_test, y_pred_c, y_pred = test_model(model, test_loader, device)

    # One logit column per class.
    num_classes = y_pred.shape[1]

    accuracy = tm.MulticlassAccuracy().update(y_pred_c, y_test).compute().item()
    precision = tm.MulticlassPrecision(num_classes=num_classes).update(y_pred, y_test).compute().item()
    f1 = tm.MulticlassF1Score(num_classes=num_classes).update(y_pred, y_test).compute().item()
    balanced_accuracy = balanced_accuracy_score(y_test.cpu().numpy(), y_pred_c.cpu().numpy())

    # Print Accuracy, Precision, F1 Score and Balanced Accuracy. The values
    # are interpolated by name; bare "{.4}" fields in an f-string would print
    # the literal 0.4 instead of the metrics.
    print(f"Accuracy: {accuracy:.4} --\t Precision: {precision:.4} --\t F1: {f1:.4} --\t Balanced Accuracy: {balanced_accuracy:.4}")

    print(classification_report(y_test.cpu().numpy(), y_pred_c.cpu().numpy()))
    conf_matrix = tm.MulticlassConfusionMatrix(num_classes=num_classes)
    conf_matrix.update(y_pred_c, y_test)
    ConfusionMatrixDisplay(conf_matrix.compute().cpu().numpy()).plot()