In [1]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn import metrics
import scipy.spatial as sp
from torch.autograd import Variable
import argparse
import scipy.io
import pandas as pd 
import seaborn as sns

In [3]:
import torch
torch.cuda.empty_cache()

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: NVIDIA A100-SXM-80GB, n_gpu: 1


NVIDIA A100-SXM-80GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA A100-SXM-80GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [4]:
import torch

# check if PyTorch supports sm_80
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    if torch.cuda.get_device_capability(device)[0] < 8:
        print("NVIDIA A100-SXM-80GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.")
        print("The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.")
        print("Please check the instructions at https://pytorch.org/get-started/locally/ to install a compatible version.")
    else:
        print("PyTorch is compatible with sm_80.")
else:
    print("CUDA is not available on this system.")

PyTorch is compatible with sm_80.


In [5]:
df = pd.read_csv('kdd.txt',delimiter = ',',header = None)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,0,181,5450,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
1,0,239,486,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
2,0,235,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
3,0,219,1337,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0,217,2032,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0


In [7]:
df_label = pd.read_csv('kddlabel.txt',delimiter = ',',header = None)

In [8]:
df_label.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Columns: 121 entries, 0 to 120
dtypes: float64(15), int64(106)
memory usage: 456.1 MB


In [10]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   0       494021 non-null  int64
dtypes: int64(1)
memory usage: 3.8 MB


In [11]:
label_counts = df_label[0].value_counts()

In [12]:
label_counts_dict = label_counts.to_dict()
label_counts_dict

{1: 396743, 0: 97278}

In [13]:
corr_matrix = df.corr().abs()
corr_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,1.000000,0.004258,0.005440,0.003235,0.003786,0.013213,0.005239,0.058095,0.021340,0.055853,...,0.002990,0.021548,0.000998,0.000452,0.000452,0.017265,0.017265,,0.023424,0.023424
1,0.004258,1.000000,0.000002,0.000139,0.000005,0.004483,0.000027,0.000119,0.000022,0.000010,...,0.002325,0.001725,0.000045,0.000020,0.000020,0.001701,0.001701,,0.000082,0.000082
2,0.005440,0.000002,1.000000,0.001254,0.016288,0.004365,0.049330,0.023298,0.031680,0.075656,...,0.000474,0.014036,0.000387,0.000175,0.000175,0.047814,0.047814,,0.001289,0.001289
3,0.003235,0.000139,0.001254,1.000000,0.000123,0.002106,0.000467,0.000271,0.000504,0.000223,...,0.000215,0.026372,0.000702,0.000318,0.000318,0.019908,0.019908,,0.001778,0.001778
4,0.003786,0.000005,0.016288,0.000123,1.000000,0.000356,0.141996,0.014285,0.034790,0.000012,...,0.000012,0.001421,0.000038,0.000017,0.000017,0.006164,0.006164,,0.000096,0.000096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,0.017265,0.001701,0.047814,0.019908,0.006164,0.105305,0.001145,0.013612,0.025293,0.011207,...,0.006986,0.227275,0.006140,0.002784,0.002784,1.000000,1.000000,,0.089318,0.089318
117,0.017265,0.001701,0.047814,0.019908,0.006164,0.105305,0.001145,0.013612,0.025293,0.011207,...,0.006986,0.227275,0.006140,0.002784,0.002784,1.000000,1.000000,,0.089318,0.089318
118,,,,,,,,,,,...,,,,,,,,,,
119,0.023424,0.000082,0.001289,0.001778,0.000096,0.843572,0.000365,0.000212,0.000393,0.000174,...,0.000168,0.020336,0.000548,0.000249,0.000249,0.089318,0.089318,,1.000000,1.000000


In [14]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [15]:
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop

[10, 18, 20, 26, 30, 31, 32, 33, 34, 51, 104, 108, 112, 115, 117, 120]

In [16]:
df.drop(to_drop, axis=1, inplace=True)

In [17]:
df = df.sample(frac=1).reset_index(drop=True)

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()

X_trn = scaler.fit_transform(df)

In [20]:
X_trn.shape

(494021, 105)

In [23]:
import pandas as pd

total = len(X_trn)

num_train = int(total * .8)
num_val = int(total * .1)
num_test = total - num_train - num_val

X_trn_df = pd.DataFrame(X_trn)

train_set = X_trn_df.iloc[:num_train]
val_set = X_trn_df.iloc[num_train:num_train+num_val]
test_set = X_trn_df.iloc[num_train+num_val:]

train_label = df_label.iloc[:num_train]
val_label = df_label.iloc[num_train:num_train+num_val]
test_label = df_label.iloc[num_train+num_val:]


In [24]:
print(len(train_label))
print(len(val_label))
print(len(test_label))

395216
49402
49403


In [28]:
class MemStream(nn.Module):
    def __init__(self, in_dim, params):
        super(MemStream, self).__init__()
        self.params = params
        self.in_dim = in_dim
        self.out_dim = in_dim*2
        self.memory_len = params['memory_len']
        self.max_thres = torch.tensor(params['beta']).to(device)
        self.memory = torch.randn(self.memory_len, self.out_dim).to(device)
        self.mem_data = torch.randn(self.memory_len, self.in_dim).to(device)
        self.memory.requires_grad = False
        self.mem_data.requires_grad = False
        self.batch_size = params['memory_len']
        self.num_mem_update = 0
        self.encoder = nn.Sequential(
            nn.Linear(self.in_dim, self.out_dim),
            nn.Tanh(),
        ).to(device)
        self.decoder = nn.Sequential(
            nn.Linear(self.out_dim, self.in_dim)
        ).to(device)
        self.clock = 0
        self.last_update = -1
        self.optimizer = torch.optim.Adam(self.parameters(), lr=params['lr'])
        self.loss_fn = nn.MSELoss()
        self.count = 0
        
    def train_autoencoder(self, data, epochs, val_set):
        self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
        new = (data - self.mean) / self.std
        new[:, self.std == 0] = 0
        new = Variable(new)

        best_val_loss = float('inf')

        for epoch in range(epochs):
            self.optimizer.zero_grad()
            output = self.decoder(self.encoder(new + 0.001*torch.randn_like(new).to(device)))
            loss = self.loss_fn(output, new)
            loss.backward()
            self.optimizer.step()
            total_train_loss += loss.item()

            if (epoch + 1) % validation_steps == 0:
                val_loss = 0
                for i in range(0, len(val_set), batch_size):
                    batch = val_set[i:i + batch_size]
                    x_val = torch.stack([data[0] for data in batch])
                    x_val = (x_val - self.mean) / self.std
                    x_val[:, self.std == 0] = 0
                    x_val = Variable(x_val).to(device)
                    with torch.no_grad():
                        val_output = self.decoder(self.encoder(x_val))
                        val_loss += self.loss_fn(val_output, x_val).item()
                val_loss /= len(val_set) // batch_size + 1
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_checkpoint()

            if loss.item() < 0.0001:
                break

        return total_train_loss / len(data)



    def update_memory(self, output_loss, encoder_output, data):
        if output_loss <= self.max_thres:
            least_used_pos = self.count%self.memory_len
            self.memory[least_used_pos] = encoder_output
            self.mem_data[least_used_pos] = data
            self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
            self.count += 1
            return 1
        return 0

    def initialize_memory(self, x):
        mean, std = model.mem_data.mean(0), model.mem_data.std(0)
        new = (x - mean) / std
        new[:, std == 0] = 0
        self.memory = self.encoder(new)
        self.memory.requires_grad = False
        self.mem_data = x

    def forward(self, x):
        new = (x - self.mean) / self.std
        new[:, self.std == 0] = 0
        encoder_output = self.encoder(new)
        loss_values = torch.norm(self.memory - encoder_output, dim=1, p=1).min()
        self.update_memory(loss_values, encoder_output, x)
        return loss_values

In [34]:
import random
import torch
from torch.utils.data import DataLoader

# training loop

params = {
          'beta': 1, 'memory_len': 256, 'batch_size':1, 'lr':0.01
         }

model = MemStream(X_trn[0].shape[0],params).to(device)

# For each epoch...
for epoch_i in range(0, 100):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, 100))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    num_batches = len(X_trn) // batch_size + 1

    for i in range(num_batches):
        end_index = min(batch_size * (i+1), len(X_trn))

        batch = numeric[i*batch_size:end_index]

        if len(batch) == 0: 
            continue

        input_tensors = torch.stack([torch.Tensor(data).long() for data in batch])

        # Move tensors to the GPU
        input_tensors = input_tensors.to(device)

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(input_tensors)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        # Clear the previously calculated gradient
        model.zero_grad()     

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set. Implement this function in the cell above.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    
print("")
print("Training complete!")

# Initialize the DataLoader
batch_size = args.batch_size
data_loader = DataLoader(numeric, batch_size=batch_size)

# Set the model to evaluation mode
model.eval()

# Initialize the memory with the first N instances from the 0 label class
N = args.memlen
init_data = X_trn[labels == 0][:N].to(device)
model.initialize_memory(init_data)

# Disable gradient calculation for memory retrieval
torch.set_grad_enabled(False)

# Compute the reconstruction loss on the entire dataset
recon_losses = []
for data in data_loader:
    output = model(data.to(device))
    loss = output.loss.item()
    recon_losses.append(loss)
    print("Reconstruction loss:", loss)

# Calculate the area under the ROC curve using reconstruction losses as scores
scores = torch.Tensor(recon_losses)
auc = metrics.roc_auc_score(labels, scores)
print("ROC-AUC", auc)



Training...


NameError: name 'numeric' is not defined