In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

In [None]:
os.listdir("/kaggle/input/kdd-cup-1999-data")

In [None]:
!cat /kaggle/input/kdd-cup-1999-data/kddcup.names

In [None]:
# Column names for the data file, in file order: 41 feature columns followed by
# the target column 'label'.
col_names = """
    duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent
    hot num_failed_logins logged_in num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds is_host_login
    is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    label
""".split()

In [None]:
len(col_names)

In [None]:
# Read the .gz data file into a DataFrame, labelling its columns with `col_names`
df = pd.read_csv('/kaggle/input/kdd-cup-1999-data/kddcup.data.gz', names=col_names)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

In [None]:
df['label'].value_counts()

In [None]:
df['label'].nunique()

In [None]:
# Numeric (and boolean) columns, selected through the public dtype API rather
# than the private DataFrame._get_numeric_data helper.
num_cols = df.select_dtypes(include=['number', 'bool']).columns

# Everything else is categorical. A list comprehension keeps the DataFrame's
# column order, whereas a set difference yields an order that can change from
# one kernel run to the next.
cate_cols = [col for col in df.columns if col not in num_cols]

cate_cols

In [None]:
df[cate_cols]

In [None]:
# Make a copy of the initial dataframe so we can use different approach later
init_df = df.copy(deep=True)

In [None]:
for category in cate_cols:
    print(df[category].value_counts())

In [None]:
# Integer-encode every categorical column in place and remember, per column,
# which original value was assigned which integer code.
category_mapping = {}
for category in cate_cols:
    codes, uniques = pd.factorize(df[category])
    df[category] = codes
    category_mapping[category] = dict(zip(uniques, range(len(uniques))))

# Verify the category mappings
for category in category_mapping:
    print(f"Category: {category}")
    print(category_mapping[category])

In [None]:
category_mapping

In [None]:
for category in cate_cols:
    print(df[category].value_counts())

In [None]:
# Keep only the columns that take more than one distinct value
varying = df.nunique() > 1
df = df.loc[:, varying]

# Pairwise correlation of the remaining columns, drawn as a heatmap
corr = df.corr()

plt.figure(figsize=(8, 5))
sns.heatmap(corr, cmap='Blues')

In [None]:
# Feature matrix: every column except 'label', cast to float32
X = torch.tensor(df.drop('label',axis=1).values.astype(np.float32))
# Target vector: the integer-encoded 'label' column as torch.long
y = torch.tensor(df['label'].values, dtype=torch.long)
X,y

In [None]:
y.size()

In [None]:
X.shape

In [None]:
# Stratified hold-out split (33% held out); stratify=y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)
train_ds,valid_ds = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)
# Make the dataloaders
BATCH = 1024
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
# Validation only measures loss/accuracy, so it is not shuffled: shuffling costs
# time and makes the batch-averaged metrics vary between runs (they depend on
# which samples land in the final, smaller batch).
valid_dl = DataLoader(valid_ds, batch_size=BATCH, shuffle=False)

In [None]:
# Always build a torch.device so `device` has the same type on GPU and CPU
# machines (previously the CPU fallback was the bare string 'cpu').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def to_device(data, device):
    """Move `data` to `device`.

    Lists and tuples are handled recursively and always come back as lists;
    anything else is moved with ``data.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

In [None]:
class DeviceDL():
    """Wrap an iterable of batches so each batch is passed through `to_device` when iterated."""

    def __init__(self, dl, dev):
        # dl: the wrapped loader; dev: the target device handed to to_device
        self.dl, self.dev = dl, dev

    def __len__(self):
        """Number of batches, as reported by the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Lazily yield every batch of the wrapped loader after moving it to `self.dev`."""
        yield from (to_device(batch, self.dev) for batch in self.dl)

In [None]:
# Wrap both loaders so each batch is moved to `device` as it is iterated
train_dl = DeviceDL(train_dl, device)
valid_dl = DeviceDL(valid_dl, device)

In [None]:
def accuracy(label, preds):
    """Return the fraction of rows in `preds` whose arg-max column equals `label`.

    The result is a 0-dim tensor (not a Python float), so callers can stack it.
    """
    predicted = preds.argmax(dim=1)
    hits = predicted.eq(label).sum()
    return hits / predicted.shape[0]

In [None]:
class BaseNet(nn.Module):
    """Loss, validation and logging helpers shared by the classifiers below.

    Subclasses supply ``forward``; every method here reaches the network only
    through ``self(features)``.
    """

    def get_loss(self, batch, loss_fn):
        """Run a forward pass on `batch` (features, labels) and return ``loss_fn(preds, labels)``."""
        features, labels = batch
        preds = self(features)
        return loss_fn(preds, labels)

    def validate(self, batch, loss_fn):
        """Return ``{'valid_loss', 'valid_acc'}`` (both tensors) for one batch.

        The batch goes through the network once and the same predictions feed
        both metrics; previously the forward pass was executed twice per batch.
        """
        features, labels = batch
        preds = self(features)
        return {'valid_loss': loss_fn(preds, labels),
                'valid_acc': accuracy(labels, preds)}

    def average_validation(self, out):
        """Collapse a list of per-batch `validate` results into plain floats.

        Each metric is the unweighted mean of the per-batch values.
        """
        loss = torch.stack([entry['valid_loss'] for entry in out]).mean()
        acc = torch.stack([entry['valid_acc'] for entry in out]).mean()
        return {'valid_loss': loss.item(), 'valid_acc': acc.item()}

    def log_epoch(self, e, epoch, res):
        """Print one progress line for zero-based epoch `e` out of `epoch` epochs.

        `res` must contain 'train_loss', 'valid_loss' and 'valid_acc'.
        """
        template = ('[{} / {}] epoch/s, training loss is {:.4f} '
                    'validation loss is {:.4f}, validation accuracy is {:.4f} ')
        print(template.format(e + 1, epoch,
                              res['train_loss'],
                              res['valid_loss'],
                              res['valid_acc']))

In [None]:
class ClassifierMLP(BaseNet):
    """Fully connected classifier: three hidden linear layers plus a linear output layer.

    `activation` is applied after each hidden layer. The output layer is left
    un-activated, so `forward` returns raw scores (logits).
    """

    def __init__(self, activation, input_dim, hidden_1, hidden_2, hidden_3, out):
        super().__init__()
        # Layers are created input -> output; attribute names double as state_dict keys
        self.hidden_1 = nn.Linear(in_features=input_dim, out_features=hidden_1)
        self.hidden_2 = nn.Linear(in_features=hidden_1, out_features=hidden_2)
        self.hidden_3 = nn.Linear(in_features=hidden_2, out_features=hidden_3)
        self.out = nn.Linear(in_features=hidden_3, out_features=out)
        self.activation = activation

    def forward(self, x):
        """Return the un-normalised class scores for the batch `x`."""
        for layer in (self.hidden_1, self.hidden_2, self.hidden_3):
            x = self.activation(layer(x))
        # No softmax / activation on the final layer: the cross-entropy loss
        # applies it internally. Softmax is only needed when turning the scores
        # into human-readable probabilities (not even while testing).
        return self.out(x)
        
        

In [None]:
@torch.no_grad()
def valid(model, valid_dl, loss_fn):
    """Evaluate `model` on every batch of `valid_dl` with gradients disabled.

    Puts the model in eval mode, collects ``model.validate`` per batch and
    returns ``model.average_validation`` of those results.
    """
    model.eval()
    out = [model.validate(batch, loss_fn) for batch in valid_dl]
    return model.average_validation(out)

def fit(model, train_dl, valid_dl, loss_fn, opt, EPOCHS):
    """Train `model` for `EPOCHS` epochs, validating and logging after each one.

    Returns a list with one dict per epoch: the validation result from `valid`
    extended with 'train_loss', the mean of that epoch's per-batch training losses.
    """
    hist = []
    for e in range(EPOCHS):
        # valid() leaves the model in eval mode, so switch back every epoch
        model.train()
        train_loss = []
        for batch in train_dl:
            # Clear gradients BEFORE the backward pass so that any stale .grad
            # already on the parameters cannot leak into the first update.
            opt.zero_grad()
            loss = model.get_loss(batch, loss_fn)
            loss.backward()
            opt.step()
            # Store a detached copy: only the value is needed for reporting,
            # and this avoids keeping autograd history for every batch.
            train_loss.append(loss.detach())

        res = valid(model, valid_dl, loss_fn)
        res['train_loss'] = torch.stack(train_loss).mean().item()

        model.log_epoch(e, EPOCHS, res)

        hist.append(res)
    return hist

In [None]:
activation = nn.ReLU()
# One input unit per feature column of X
input_dim = X.shape[1]
# One output score per distinct value in the 'label' column
output_dim = df['label'].nunique()
hidden_1, hidden_2, hidden_3 = 256, 128, 128


model = ClassifierMLP(activation, input_dim, hidden_1, hidden_2, hidden_3, output_dim)
# Move the model to `device`; the returned value is shown as the cell output
to_device(model, device)

In [None]:
# Training configuration for the baseline model: unweighted cross-entropy
# loss, Adam optimiser over the model's parameters.
EPOCHS = 20
lr = 1e-6
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
history = fit(model, train_dl, valid_dl, loss_fn, opt, EPOCHS)

In [None]:
epochs = range(1,len(history)+1)
val_acc_values = [entry['valid_acc'] for entry in history]
train_loss_values = [entry['train_loss'] for entry in history]
val_loss_values = [entry['valid_loss'] for entry in history]

In [None]:
# Create the output folder; exist_ok makes the cell safe to re-run
os.makedirs("epoch_table", exist_ok=True)

In [None]:
# Per-epoch metrics of the baseline model, saved as CSV.
# The epoch index is derived from the number of recorded values rather than a
# hard-coded 1..20, so the table still builds when EPOCHS is changed.
epoch_table_base_model = {
    "epochs" : list(range(1, len(val_acc_values) + 1)),
    "Validation accuracy": val_acc_values,
    "Validation Loss": val_loss_values,
    "Train Loss" : train_loss_values
               }
pd.DataFrame(epoch_table_base_model).to_csv("epoch_table/EPOCH_DATA_BASE.csv", index = False)

In [None]:
# One 'balanced' weight per class id, in the order of np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y.numpy())

# Show each original label string next to its weight. The keys of
# category_mapping['label'] were inserted in code order (0, 1, 2, ...), which
# lines up with the ascending class ids the weights were computed for.
print(dict(zip(category_mapping['label'].keys(),class_weights)))
print(init_df['label'].value_counts())

# Convert to a float32 tensor on `device` so it can be passed to the loss function
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [None]:
model_weighted = ClassifierMLP(activation, input_dim, hidden_1, hidden_2, hidden_3, output_dim)
to_device(model_weighted, device)

In [None]:
loss_fn = nn.CrossEntropyLoss(class_weights)
lr = 0.000001
opt = torch.optim.Adam(model_weighted.parameters(), lr=lr)

In [None]:
history_weighted = fit(model_weighted, train_dl, valid_dl, loss_fn, opt, EPOCHS)

In [None]:
val_acc_weighted_values = [entry['valid_acc'] for entry in history_weighted]
train_loss_weighted_values = [entry['train_loss'] for entry in history_weighted]
val_loss_weighted_values = [entry['valid_loss'] for entry in history_weighted]

In [None]:
# Per-epoch metrics of the class-weighted model, saved as CSV.
# The epoch index is derived from the number of recorded values rather than a
# hard-coded 1..20, so the table still builds when EPOCHS is changed.
epoch_table_weighted_model = {
    "epochs" : list(range(1, len(val_acc_weighted_values) + 1)),
    "Validation accuracy": val_acc_weighted_values,
    "Validation Loss": val_loss_weighted_values,
    "Train Loss" : train_loss_weighted_values
               }
pd.DataFrame(epoch_table_weighted_model).to_csv("epoch_table/EPOCH_DATA_WEIGHTED.csv", index = False)

In [None]:
# Create the folder for saved weights; exist_ok makes the cell safe to re-run
os.makedirs("models", exist_ok=True)

In [None]:
torch.save(model.state_dict(), 'models/model.pth')
torch.save(model_weighted.state_dict(), 'models/model_weighted.pth')

In [None]:
# free-up some ram
del df
del model
del model_weighted
del train_dl
del valid_dl
del X_train, X_test, y_train, y_test 
del train_ds, valid_ds

In [None]:
init_df['label'].value_counts()

In [None]:
# Collapse the individual label names into four integer classes:
#   0 = 'normal.', 1 / 2 / 3 = groups of attack names
#   (presumably DoS, probing, and remote-to-local + user-to-root respectively —
#   TODO confirm against the dataset documentation).
label_mapping = {
    'back.': 1, 'land.': 1, 'neptune.': 1, 'pod.': 1, 'smurf.': 1, 'teardrop.': 1, 'apache2.': 1,
    'udpstorm.': 1, 'processtable.': 1, 'worm.': 1,
    
    'satan.': 2, 'ipsweep.': 2, 'nmap.': 2, 'portsweep.': 2, 'mscan.': 2, 'saint.': 2,
    
    'guess_passwd.': 3, 'ftp_write.': 3, 'imap.': 3, 'phf.': 3, 'multihop.': 3, 'warezmaster.': 3,
    'warezclient.': 3, 'spy.': 3, 'xlock.': 3, 'xsnoop.': 3, 'snmpguess.': 3, 'snmpgetattack.': 3,
    'httptunnel.': 3, 'sendmail.': 3, 'named.': 3, 'mailbomb.': 3,
    
    'buffer_overflow.': 3, 'loadmodule.': 3, 'rootkit.': 3, 'perl.': 3, 'sqlattack.': 3,
    'xterm.': 3, 'ps.': 3,
    
    'normal.': 0
}

encoded_labels = init_df['label'].replace(label_mapping)

# Any value that is still a string had no entry in label_mapping. Fail here,
# naming the offenders, instead of much later when the column is turned into a tensor.
unmapped_labels = [value for value in encoded_labels.unique() if isinstance(value, str)]
if unmapped_labels:
    raise ValueError(f"Labels missing from label_mapping: {unmapped_labels}")

# Pin the dtype explicitly rather than relying on replace() to downcast the
# object column to integers.
init_df['label'] = encoded_labels.astype('int64')


In [None]:
init_df['label'].value_counts()

In [None]:
# Drop the target from the categorical feature list. Filtering (rather than
# list.remove) keeps the cell re-runnable: remove() raises ValueError once
# 'label' is already gone.
cate_cols = [col for col in cate_cols if col != 'label']

In [None]:
# Integer-encode the remaining categorical columns of init_df in place and
# record the value -> code mapping for each of them.
category_mapping_truncated = {}
for category in cate_cols:
    codes, uniques = pd.factorize(init_df[category])
    init_df[category] = codes
    category_mapping_truncated[category] = dict(zip(uniques, range(len(uniques))))

In [None]:
# Rebuild features/targets from init_df, whose 'label' column now holds the collapsed classes
X = torch.tensor(init_df.drop('label',axis=1).values.astype(np.float32))
y = torch.tensor(init_df['label'].values, dtype=torch.long)

# Stratified hold-out split (33% held out), same settings as for the first models
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)
train_ds,valid_ds = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)
# Make the dataloaders
BATCH = 1024
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
# Validation only measures loss/accuracy, so it is not shuffled: shuffling costs
# time and makes the batch-averaged metrics vary between runs.
valid_dl = DataLoader(valid_ds, batch_size=BATCH, shuffle=False)

# Move each batch to `device` as it is iterated
train_dl = DeviceDL(train_dl, device)
valid_dl = DeviceDL(valid_dl, device)

In [None]:
class TruncatedClassifierMLP(BaseNet):
    """Smaller fully connected classifier: two hidden linear layers plus a linear output layer.

    `activation` is applied after each hidden layer; the output layer is left
    un-activated, so `forward` returns raw scores (logits).
    """

    def __init__(self, activation, input_dim, hidden_1, hidden_2, out):
        super().__init__()
        # Layers are created input -> output; attribute names double as state_dict keys
        self.hidden_1 = nn.Linear(in_features=input_dim, out_features=hidden_1)
        self.hidden_2 = nn.Linear(in_features=hidden_1, out_features=hidden_2)
        self.out = nn.Linear(in_features=hidden_2, out_features=out)
        self.activation = activation

    def forward(self, x):
        """Return the un-normalised class scores for the batch `x`."""
        for layer in (self.hidden_1, self.hidden_2):
            x = self.activation(layer(x))
        return self.out(x)

In [None]:
activation = nn.ReLU()
# One input unit per feature column of the rebuilt X
input_dim = X.shape[1]
# One output score per distinct value in the collapsed 'label' column
output_dim = init_df['label'].nunique()
hidden_1, hidden_2 = 128, 64


truncated_model = TruncatedClassifierMLP(activation, input_dim, hidden_1, hidden_2, output_dim)
# Move the model to `device`; the returned value is shown as the cell output
to_device(truncated_model, device)

In [None]:
loss_fn = nn.CrossEntropyLoss()
lr = 0.000001
opt = torch.optim.Adam(truncated_model.parameters(), lr=lr)

In [None]:
history_truncated = fit(truncated_model, train_dl, valid_dl, loss_fn, opt, EPOCHS)

In [None]:
val_acc_values = [entry['valid_acc'] for entry in history_truncated]
train_loss_values = [entry['train_loss'] for entry in history_truncated]
val_loss_values = [entry['valid_loss'] for entry in history_truncated]

In [None]:
# Per-epoch metrics of the truncated model, saved as CSV.
# The epoch index is derived from the number of recorded values rather than a
# hard-coded 1..20, so the table still builds when EPOCHS is changed.
epoch_table_truncated_model = {
    "epochs" : list(range(1, len(val_acc_values) + 1)),
    "Validation accuracy": val_acc_values,
    "Validation Loss": val_loss_values,
    "Train Loss" : train_loss_values
               }
pd.DataFrame(epoch_table_truncated_model).to_csv("epoch_table/EPOCH_DATA_TRUNCATED.csv", index = False)

In [None]:
torch.save(truncated_model.state_dict(), 'models/model_truncated_four_class.pth')
del truncated_model

In [None]:
truncated_weighted_model = TruncatedClassifierMLP(activation, input_dim, hidden_1, hidden_2, output_dim)
to_device(truncated_weighted_model, device)

In [None]:
# np.unique returns the class ids sorted ascending, and the weights are
# computed for exactly that sequence of classes.
weight_classes = np.unique(y.numpy())
class_weights = compute_class_weight(class_weight='balanced', classes=weight_classes, y=y.numpy())

# Pair every weight with the class id it was computed for. The previous key
# source, init_df['label'].unique(), is in order of first appearance, so it
# could print weights against the wrong class.
print(dict(zip(weight_classes, class_weights)))
print(init_df['label'].value_counts())

# Convert to a float32 tensor on `device` so it can be passed to the loss function
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [None]:
loss_fn = nn.CrossEntropyLoss(class_weights)
lr = 0.000001
opt = torch.optim.Adam(truncated_weighted_model.parameters(), lr=lr)

In [None]:
history_weighted_truncated = fit(truncated_weighted_model, train_dl, valid_dl, loss_fn, opt, EPOCHS)

In [None]:
val_acc_weighted_values = [entry['valid_acc'] for entry in history_weighted_truncated]
train_loss_weighted_values = [entry['train_loss'] for entry in history_weighted_truncated]
val_loss_weighted_values = [entry['valid_loss'] for entry in history_weighted_truncated]

In [None]:
# Per-epoch metrics of the class-weighted truncated model, saved as CSV.
# The epoch index is derived from the number of recorded values rather than a
# hard-coded 1..20, so the table still builds when EPOCHS is changed.
epoch_table_truncated_weighted_model = {
    "epochs" : list(range(1, len(val_acc_weighted_values) + 1)),
    "Validation accuracy": val_acc_weighted_values,
    "Validation Loss": val_loss_weighted_values,
    "Train Loss" : train_loss_weighted_values
               }
pd.DataFrame(epoch_table_truncated_weighted_model).to_csv("epoch_table/EPOCH_DATA_WEIGHTED_TRUNCATED.csv", index = False)

In [None]:
torch.save(truncated_weighted_model.state_dict(), 'models/model_truncated_weighted_four_class.pth')

In [None]:
# All testing is done in the following notebook:
# https://www.kaggle.com/ashimdahal/testing-for-cyber-innovations-lab
# The graphs built from the epoch tables can be found in the following notebook:
# https://github.com/ashimdahal/zero_day_ATTACK_detection/blob/main/Testing%20graphs%20for%20conference.ipynb

In [None]:
!tar czf output_folders.tar.gz *