In [None]:
import matplotlib.pyplot as plt
import numpy as np
import cv2
import pandas as pd
import os

In [None]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls '/content/drive/My Drive/hw2_aml/task2'

UNSW_NB15_testing-set.csv  UNSW_NB15_training-set.csv


In [None]:
path = '/content/drive/My Drive/hw2_aml/task2'

In [None]:
train = pd.read_csv((os.path.join(path,'UNSW_NB15_training-set.csv')))
test =pd.read_csv((os.path.join(path,'UNSW_NB15_testing-set.csv')))

In [None]:
# Remove the `label` and `id` columns; `attack_cat` is the target extracted below.
# Reassigning (instead of mutating in place) together with errors='ignore'
# keeps this cell safe to execute more than once.
train = train.drop(columns=['label', 'id'], errors='ignore')
test = test.drop(columns=['label', 'id'], errors='ignore')

In [None]:
print(f'shape is train csv is {train.shape}')
print(f'shape is test csv is {test.shape}')

shape is train csv is (82332, 43)
shape is test csv is (175341, 43)


In [None]:
X_train = train.loc[:, train.columns != 'attack_cat']
y_train = train['attack_cat']
X_test = test.loc[:, test.columns != 'attack_cat']
y_test = test['attack_cat']

In [None]:
catfeats = X_train.select_dtypes(include=['bool','object','datetime64']).columns.tolist()

In [None]:
cols = X_train.columns.tolist()
#cols

In [None]:
print(f' Number of missing values in training set is {X_train.isnull().sum().sum()}')
print(f' Number of missing values in testing set is {X_test.isnull().sum().sum()}')

 Number of missing values in training set is 0
 Number of missing values in testing set is 0


In [None]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
enc.fit(X_train[catfeats])

OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [None]:
from sklearn.preprocessing import LabelEncoder
enc_y = LabelEncoder()
enc_y.fit(y_train)

LabelEncoder()

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

In [None]:
def transform_data(df, enc, scaler, feats, cols, fit=None):
    """Ordinal-encode the categorical columns of ``df`` and scale every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; must contain every column named in ``cols``.
    enc : object with ``transform``
        Already-fitted encoder. ``enc.transform(df[feats])`` must return a 2-D
        numeric array with one column per entry of ``feats``.
    scaler : object with ``fit`` and ``transform``
        Scaler applied to the full (numeric + encoded) frame.
    feats : list of str
        Names of the categorical columns to encode.
    cols : list of str
        Column order of the returned frame.
    fit : bool or None, default None
        ``True`` refits ``scaler`` on this frame, ``False`` never fits it.
        ``None`` fits only when the scaler has no ``scale_`` attribute yet, so
        the first call (training split) fits and later calls (test split)
        reuse the training statistics instead of refitting on test data.

    Returns
    -------
    pandas.DataFrame
        Scaled values with a fresh ``RangeIndex`` and columns in ``cols`` order.
    """
    # Build the encoded columns on df's own index so the concat below aligns
    # row-for-row even when df does not carry a default RangeIndex.
    encoded = pd.DataFrame(
        enc.transform(df[feats]), columns=feats, index=df.index
    ).astype(int)
    merged = pd.concat([df.drop(columns=feats), encoded], axis=1)

    # concat puts the encoded columns last; restore the requested order so the
    # labels attached to the scaled array describe the right data.
    ordered_cols = list(cols)
    merged = merged[ordered_cols]

    if fit is None:
        fit = not hasattr(scaler, "scale_")
    if fit:
        scaler.fit(merged)

    scaled = scaler.transform(merged)
    return pd.DataFrame(scaled, columns=ordered_cols)

In [None]:
def transform_y(df, enc):
    """Encode the target with ``enc`` and return it as a one-column frame.

    ``enc.transform(df)`` is wrapped in a DataFrame and its first column
    (label ``0``) is renamed to ``'attack_cat'``.
    """
    encoded = enc.transform(df)
    return pd.DataFrame(encoded).rename(columns={0: 'attack_cat'})

In [None]:
new_X_train = transform_data(X_train,enc,scaler,catfeats,cols)
new_X_test = transform_data(X_test,enc,scaler,catfeats,cols)

In [None]:
print(f"Shape of new train is {new_X_train.shape}")
print(f"Shape of new test is {new_X_test.shape}")

Shape of new train is (82332, 42)
Shape of new test is (175341, 42)


In [None]:
y_train_f = transform_y(y_train,enc_y)
y_test_f = transform_y(y_test,enc_y)

In [None]:
def drop_useless(df, threshold=99.0):
    """Return ``df`` without its near-constant columns.

    A column is dropped when a single value accounts for at least
    ``threshold`` percent of its (non-null) entries.

    Parameters
    ----------
    df : pandas.DataFrame
    threshold : float, default 99.0
        Percentage (0-100) at or above which a column counts as useless.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified.
    """
    useless_cols = [
        col for col in df.columns
        if (df[col].value_counts(normalize=True) * 100 >= threshold).any()
    ]
    return df.drop(columns=useless_cols)

In [None]:
# Decide which columns to discard on the training split only, then keep exactly
# those same columns for the test split. Filtering each split independently
# leaves them with different widths (40 vs 42 in the shapes printed below).
X_train_f = drop_useless(new_X_train)
X_test_f = new_X_test[X_train_f.columns]

In [None]:
print(f"Shape of new train is {X_train_f.shape}")
print(f"Shape of new test is {X_test_f.shape}")

Shape of new train is (82332, 40)
Shape of new test is (175341, 42)


Defining the cGAN model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from torch.utils.data import TensorDataset, DataLoader

In [None]:
#dataset class was created to use dataloader
class MyDataset(torch.utils.data.Dataset):
    """Expose a feature frame and a label frame as paired float32 tensors.

    ``X`` and ``Y`` are read positionally through ``.iloc``; the resulting
    tensors are stored on ``x_train`` and ``y_train``.
    """

    def __init__(self, X, Y):
        features = X.iloc[:, :].to_numpy()
        labels = Y.iloc[:].to_numpy()
        self.x_train = torch.tensor(features, dtype=torch.float32)
        self.y_train = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # Number of rows in the feature tensor.
        return self.x_train.shape[0]

    def __getitem__(self, idx):
        # (features, label) pair at the requested position.
        return self.x_train[idx], self.y_train[idx]

In [None]:
trainset = MyDataset(X_train_f,y_train_f)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=64)

In [None]:
testset = MyDataset(X_test_f,y_test_f)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, batch_size=64)

In [None]:
im,l = next(iter(trainloader))
print(l.shape)

torch.Size([64, 1])


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Available device : {device}")

Available device : cuda:0


In [None]:
######## To print layer outputs ########
class PrintLayer(nn.Module):
    """Pass-through module that prints its input; drop it into a model to debug."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        print(x)  # side effect only: the tensor is returned untouched
        return x

########################################


In [None]:
class Generator(nn.Module):
    """Conditional generator: turns (noise, class label) into a synthetic feature row.

    The noise is projected to 64 features, the label is embedded, and the
    concatenation of the two goes through an MLP that ends in ``Tanh``, so
    every output value lies in [-1, 1].

    Parameters
    ----------
    output_dim : int
        Number of features in a generated row.
    noise_dim : int, default 32
        Width of the noise vector.
    num_classes : int, default 10
        Number of distinct labels the embedding can represent; label values
        passed to ``forward`` must be in ``[0, num_classes)``.
    emb_dim : int, default 5
        Width of the label embedding.
    """

    def __init__(self, output_dim, noise_dim=32, num_classes=10, emb_dim=5):
        super(Generator, self).__init__()
        input_out_feats = 64

        self.input_layer = nn.Linear(in_features=noise_dim, out_features=input_out_feats)
        # Flatten collapses a (batch, 1, emb_dim) embedding of (batch, 1) labels
        # down to (batch, emb_dim) so it can be concatenated with the noise features.
        self.emb_y = nn.Sequential(
            nn.Embedding(num_embeddings=num_classes, embedding_dim=emb_dim),
            nn.Flatten(),
        )
        self.model = nn.Sequential(
            nn.Linear(input_out_feats + emb_dim, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, output_dim), nn.Tanh(),
        )

    def forward(self, x, y):
        """Generate rows from noise ``x`` (batch, noise_dim) and integer labels ``y``."""
        x = self.input_layer(x)
        y = self.emb_y(y)
        return self.model(torch.cat((x, y), dim=1))
        
        
class Discriminator(nn.Module):
    """Conditional discriminator: scores a (feature row, class label) pair.

    The label is embedded, concatenated with the feature row, and passed
    through an MLP ending in ``Sigmoid``, giving one value in [0, 1] per row.

    Parameters
    ----------
    input_size : int
        Number of features in a row.
    num_classes : int, default 10
        Number of distinct labels the embedding can represent; label values
        passed to ``forward`` must be in ``[0, num_classes)``.
    emb_dim : int, default 5
        Width of the label embedding.
    """

    def __init__(self, input_size, num_classes=10, emb_dim=5):
        super(Discriminator, self).__init__()

        # Flatten collapses a (batch, 1, emb_dim) embedding of (batch, 1) labels
        # down to (batch, emb_dim) so it can be concatenated with the features.
        self.emb_y = nn.Sequential(
            nn.Embedding(num_embeddings=num_classes, embedding_dim=emb_dim),
            nn.Flatten(),
        )
        self.model = nn.Sequential(
            nn.Linear(input_size + emb_dim, 512), nn.LeakyReLU(),
            nn.Linear(512, 256), nn.LeakyReLU(),
            nn.Linear(256, 128), nn.LeakyReLU(),
            nn.Linear(128, 1), nn.Sigmoid(),
        )

    def forward(self, x, y):
        """Return a (batch, 1) score for rows ``x`` conditioned on integer labels ``y``."""
        y = self.emb_y(y)
        return self.model(torch.cat((x, y), dim=1))

In [None]:
def gen_noise(BATCH_SIZE, z_noise):
    """Draw a ``(BATCH_SIZE, z_noise)`` tensor of uniform noise between 0 and 1.

    The samples come from NumPy, so the returned tensor is float64.
    """
    samples = np.random.uniform(low=0., high=1., size=(BATCH_SIZE, z_noise))
    return torch.tensor(samples)

gen_noise(64,32).shape

torch.Size([64, 32])

In [None]:
from torch.nn import init
def init_weights(m):
    """Initialiser meant for ``Module.apply``.

    ``nn.Linear``: weights ~ N(0, 1), bias filled with 1.
    ``nn.BatchNorm1d``: weights ~ N(0, 1), bias zeroed.
    Any other module is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, 0, 1)
        nn.init.constant_(m.bias, 1)
        return
    if isinstance(m, nn.BatchNorm1d):
        nn.init.normal_(m.weight, 0, 1)
        nn.init.zeros_(m.bias)

In [None]:
# label-smoothing
REAL = 0.9
FAKE = 0.1
BATCH_SIZE = 64
Z_DIM = 32
learning_rate = 0.0005
n_epochs = 10

In [None]:
# BUILD The NETWORK

def Build_network(Z_DIM, n_features=40):
    """Create, initialise and print the discriminator / generator pair.

    Parameters
    ----------
    Z_DIM : int
        Width of the generator's noise input.
    n_features : int, default 40
        Number of features per data row: the discriminator's input size and
        the generator's output size. The default keeps the width that was
        previously hard-coded here.

    Returns
    -------
    (D, G) : tuple
        Both models moved to ``device``, cast to float32 and passed through
        ``init_weights``.
    """
    D = Discriminator(input_size=n_features).to(device).float()
    G = Generator(output_dim=n_features, noise_dim=Z_DIM).to(device).float()

    D.apply(init_weights)
    G.apply(init_weights)

    print("Generator model is, ", G)
    print("Discriminator model is ", D)

    return D, G

In [None]:
D, G = Build_network(Z_DIM)

Generator model is,  Generator(
  (input_layer): Linear(in_features=32, out_features=64, bias=True)
  (emb_y): Sequential(
    (0): Embedding(10, 5)
    (1): Flatten(start_dim=1, end_dim=-1)
  )
  (model): Sequential(
    (0): Linear(in_features=69, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=40, bias=True)
    (7): Tanh()
  )
)
Discriminator model is  Discriminator(
  (emb_y): Sequential(
    (0): Embedding(10, 5)
    (1): Flatten(start_dim=1, end_dim=-1)
  )
  (model): Sequential(
    (0): Linear(in_features=45, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    

In [None]:
loss_module = nn.BCELoss()

D_optimizer = optim.SGD(D.parameters(), learning_rate)
G_optimizer = optim.SGD(G.parameters(), learning_rate)

In [None]:
def gen_noise(batch_size=None, z_dim=None):
    """Draw a ``(batch_size, z_dim)`` float64 tensor of uniform noise between 0 and 1.

    Both arguments are optional so the function serves every call form used in
    this notebook: ``gen_noise()`` falls back to the global ``BATCH_SIZE`` and
    ``Z_DIM``, while ``gen_noise(64, 32)`` (the signature of the earlier
    definition that this one replaces) keeps working as well.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if z_dim is None:
        z_dim = Z_DIM
    return torch.tensor(np.random.uniform(0., 1., size=[batch_size, z_dim]))

def g_training_step(x, y):
    """Run one generator update and return the generator loss as a float.

    Parameters
    ----------
    x : torch.Tensor
        Real feature batch. Not used here; accepted so both training steps
        share one signature.
    y : torch.Tensor
        Integer class labels for the batch; its first dimension sets the
        number of fake rows generated.
    """
    G_optimizer.zero_grad()
    y = y.to(device)
    # Size the noise and the targets from the batch actually received. The
    # DataLoader's last batch can be smaller than BATCH_SIZE, and a fixed-size
    # noise/target tensor then no longer matches the labels / D's output.
    batch_size = y.shape[0]
    z = torch.rand(batch_size, Z_DIM, device=device)  # uniform noise, float32
    # get fake samples from generator
    x_fake = G(z, y)
    # We are training G, so the fakes are labelled as REAL to fool D.
    y_fake = torch.full((batch_size, 1), REAL, device=device)
    # get D's verdict
    D_verdict = D(x_fake, y)
    # calculate loss and update the model
    G_loss = loss_module(D_verdict, y_fake)
    G_loss.backward()
    G_optimizer.step()
    return G_loss.item()

def d_training_step(x, y):
    """Run one discriminator update and return the discriminator loss as a float.

    The loss is the sum of the BCE on the real batch (target ``REAL``) and the
    BCE on a freshly generated fake batch (target ``FAKE``).

    Parameters
    ----------
    x : torch.Tensor
        Real feature batch.
    y : torch.Tensor
        Integer class labels for the batch.
    """
    D_optimizer.zero_grad()
    x_real = x.to(device)
    y = y.to(device)
    # Size the targets and the noise from the batch actually received. The
    # DataLoader's last batch can be smaller than BATCH_SIZE, and BCELoss
    # raises ValueError when its target shape differs from D's output shape.
    batch_size = x_real.shape[0]

    # First the real data, labelled REAL.
    y_real = torch.full((batch_size, 1), REAL, device=device)
    D_real_loss = loss_module(D(x_real, y), y_real)

    # Then generated data, labelled FAKE. detach() keeps this step from
    # building or back-propagating through the generator's graph.
    z = torch.rand(batch_size, Z_DIM, device=device)  # uniform noise, float32
    x_fake = G(z, y).detach()
    y_fake = torch.full((batch_size, 1), FAKE, device=device)
    D_fake_loss = loss_module(D(x_fake, y), y_fake)

    # The final loss is the sum of the two losses.
    D_loss = D_real_loss + D_fake_loss
    D_loss.backward()
    D_optimizer.step()
    return D_loss.item()

In [None]:
gen_noise().dtype

torch.float64

In [None]:
print_every = 1
def freeze_model(model):
    """Turn off gradient tracking for every parameter of ``model``."""
    model.requires_grad_(False)

def unfreeze_model(model):
    """Turn gradient tracking back on for every parameter of ``model``."""
    model.requires_grad_(True)

def train(n_epochs,dataloader):
    """Alternate discriminator and generator updates for ``n_epochs`` epochs.

    For each batch the discriminator is updated with the generator frozen,
    then the generator with the discriminator frozen. The mean losses of an
    epoch are printed when the epoch is a multiple of ``print_every`` or is
    the first or last epoch.

    NOTE(review): defining this function rebinds the name ``train``, which
    earlier in the notebook refers to the training DataFrame.
    """
    for epoch in range(1, n_epochs + 1):
        d_history = []
        g_history = []
        for x, y in dataloader:
            labels = y.int()

            # discriminator phase: G frozen, D trainable
            freeze_model(G)
            unfreeze_model(D)
            d_history.append(d_training_step(x, labels))

            # generator phase: D frozen, G trainable
            freeze_model(D)
            unfreeze_model(G)
            g_history.append(g_training_step(x, labels))

        should_report = epoch % print_every == 0 or epoch in (1, n_epochs)
        if should_report:
            print(f'{epoch}:\tloss_d: {round(np.mean(d_history), 4)}\tloss_g: {round(np.mean(g_history), 4)}')

In [None]:
train(2000,trainloader)

ValueError: ignored

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier()
# `x_train` is not defined anywhere in this notebook. Fit on the encoded,
# scaled and filtered training features built above; the raw `X_train` still
# holds the un-encoded categorical columns.
clf.fit(X_train_f, y_train)

"\nimport torch\n\n# Check for a GPU\ntrain_on_gpu = torch.cuda.is_available()\nif not train_on_gpu:\n    print('No GPU found. Please use a GPU to train your neural network.')\nelse:\n    print('Training on GPU!')\n"

Explainable Boosting

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

In [None]:
# `seed` was never defined in this notebook; set it here so the fit is reproducible.
seed = 42
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)

In [None]:
# Score the EBM fitted above (not the random forest `clf`) on the test split,
# using the same raw feature frame layout it was trained on.
y_predebm = ebm.predict(X_test)
accuracy_score(y_test, y_predebm)

Neural Net

In [None]:
import torch.nn as nn


class SimpleNN(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    inp : int
        Number of input features.
    H : int
        Hidden layer width.
    output : int
        Number of output units.
    """

    def __init__(self, inp, H, output):
        super(SimpleNN, self).__init__()
        self.linear1 = nn.Linear(inp, H)
        self.linear2 = nn.Linear(H, output)

    def forward(self, x):
        """Return sigmoid activations of shape ``(..., output)``."""
        # Without a non-linearity here the two Linear layers compose into a
        # single affine map and the hidden layer adds no capacity.
        x = torch.relu(self.linear1(x))
        x = torch.sigmoid(self.linear2(x))
        return x