In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
# Read the combined IoT-23 CSV from the Kaggle input directory and preview
# the first rows as the cell output.
raw_csv_path = "/kaggle/input/iot23preprocesseddata/iot23_combined_new.csv"
df1 = pd.read_csv(raw_csv_path)
df1.head()

In [None]:
# Work on a deep copy so the freshly loaded frame `df1` is never modified.
df = df1.copy(deep=True)

In [None]:
# Keep only the first 1,000,000 rows. The explicit .copy() detaches the result
# from the frame it was sliced from, so the column assignments performed in the
# following cells operate on an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning / ambiguous write-through).
df = df.iloc[:1_000_000, :].copy()

In [None]:
# Columns treated as categorical. The '-' placeholder is turned into an
# explicit 'unknown' category so it gets its own one-hot column later.
cat_vars = ['uid', 'id.orig_h', 'id.resp_h', 'proto', 'service', 'conn_state', 'local_orig', 'local_resp', 'history']
df[cat_vars] = df[cat_vars].replace('-', 'unknown')

# Measurement columns: a '-' placeholder counts as 0. After the replacement the
# columns may still be object dtype (numeric strings mixed with the integer 0),
# so they are converted to a real numeric dtype here; a value that is not
# numeric raises immediately instead of surfacing later inside the scaler.
columns_to_clean = ['duration', 'orig_bytes', 'resp_bytes']
df[columns_to_clean] = df[columns_to_clean].replace('-', 0).apply(pd.to_numeric)

In [None]:
# Number of anomalous rows to keep, expressed as a fraction of the number of
# 'Benign' rows (consumed by reduce_anomalies).
pct_anomalies = 0.05

# Fit the encoder on the labels of the full frame, i.e. before any rows are
# dropped by the anomaly down-sampling.
le = LabelEncoder()
le.fit(df['label'])

def reduce_anomalies(df, pct_anomalies=.01):
    """Down-sample the non-'Benign' rows of ``df``.

    Every row whose ``label`` is 'Benign' is kept. From the remaining rows a
    random subset (drawn without replacement from NumPy's global RNG) is kept
    so that its size is ``int(pct_anomalies * number_of_benign_rows)``, capped
    at the number of anomalous rows actually available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column. Rows are selected by index label, so
        the index is assumed to be unique.
    pct_anomalies : float, default 0.01
        Target number of anomalous rows as a fraction of the benign rows.

    Returns
    -------
    pandas.DataFrame
        A new frame: all benign rows followed by the sampled anomalous rows.
        The input frame is not modified.
    """
    is_anomaly = df['label'] != 'Benign'
    num_normal = int((~is_anomaly).sum())

    # Index *labels* of the anomalous rows.
    anomaly_index = df.index[is_anomaly.to_numpy()]

    # Never ask for more rows than exist: sampling without replacement would
    # otherwise raise ValueError.
    num_anomalies = min(int(pct_anomalies * num_normal), len(anomaly_index))
    anomalies_to_keep = np.random.choice(anomaly_index, size=num_anomalies, replace=False)

    # The sampled values are index labels, so they must be looked up with
    # .loc; .iloc would treat them as positions and pick the wrong rows (or
    # fail) whenever the index is not the default 0..n-1 range.
    anomalous_data = df.loc[anomalies_to_keep].copy()
    normal_data = df[~is_anomaly].copy()
    return pd.concat([normal_data, anomalous_data], axis=0)

# reduce_anomalies samples rows with NumPy's global RNG; seed it right before
# the call so the same anomalous rows are kept on every fresh run.
np.random.seed(123)
df = reduce_anomalies(df, pct_anomalies=pct_anomalies)

In [None]:
# One-hot encode the categorical columns.
# NOTE(review): cat_vars contains 'uid', 'id.orig_h', 'id.resp_h' and 'history';
# if these are high-cardinality in this dataset the encoded frame becomes very
# wide -- confirm that encoding them is intended.
cat_data = pd.get_dummies(df[cat_vars])

# Every column that is neither categorical nor the label is used as a numeric
# feature. The list is built by walking df.columns (instead of a set
# difference) so the feature order is deterministic and identical on every run.
numeric_vars = [col for col in df.columns if col not in cat_vars and col != 'label']
numeric_data = df[numeric_vars].copy()

# Final feature matrix: numeric columns first, then the one-hot columns.
numeric_cat_data = pd.concat([numeric_data, cat_data], axis=1)

# Integer-encode the labels with the encoder fitted earlier.
labels = df['label'].copy()
integer_labels = le.transform(labels)

# Preview only the first rows rather than rendering the whole frame.
numeric_cat_data.head()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
split_parts = train_test_split(
    numeric_cat_data,
    integer_labels,
    test_size=0.25,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

# Show which encoded classes ended up in the training labels.
set(y_train)

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Image
import os, datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_curve, auc, confusion_matrix
from sklearn.cluster import KMeans

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pickle
import random

# Seed Python's and PyTorch's random number generators with the same value.
SEED = 123
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Fit the min-max scaler on the training split only, apply the same scaling to
# the test split, and store both as float32 arrays.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train).astype(np.float32)
x_test = scaler.transform(x_test).astype(np.float32)

In [None]:
# Mini-batch size used by the data loaders.
batch_size = 512

# Size of the encoder's output vector.
latent_dim = 16

# One input/output unit per feature column of the scaled training matrix.
input_dim = x_train.shape[1]

In [None]:
class Encoder(nn.Module):
    """Fully connected encoder: input_dim -> 96 -> 64 -> 48 -> 16 -> latent_dim.

    Each of the four hidden stages applies Linear -> BatchNorm1d -> ReLU ->
    Dropout(0.2). The last layer is a plain Linear projection to the latent
    vector (no normalisation, activation or dropout).
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()

        # Stage 1: input_dim -> 96
        self.fc1 = nn.Linear(in_features=input_dim, out_features=96)
        self.bn1 = nn.BatchNorm1d(num_features=96)
        self.dropout1 = nn.Dropout(p=0.2)

        # Stage 2: 96 -> 64
        self.fc2 = nn.Linear(in_features=96, out_features=64)
        self.bn2 = nn.BatchNorm1d(num_features=64)
        self.dropout2 = nn.Dropout(p=0.2)

        # Stage 3: 64 -> 48
        self.fc3 = nn.Linear(in_features=64, out_features=48)
        self.bn3 = nn.BatchNorm1d(num_features=48)
        self.dropout3 = nn.Dropout(p=0.2)

        # Stage 4: 48 -> 16
        self.fc4 = nn.Linear(in_features=48, out_features=16)
        self.bn4 = nn.BatchNorm1d(num_features=16)
        self.dropout4 = nn.Dropout(p=0.2)

        # Projection to the latent space.
        self.fc_latent = nn.Linear(in_features=16, out_features=latent_dim)

        # Shared non-linearity for all hidden stages.
        self.activation = nn.ReLU()

    def forward(self, x):
        """Map a batch of feature vectors to their latent encodings."""
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
            (self.fc4, self.bn4, self.dropout4),
        )
        for linear, batch_norm, dropout in hidden_stages:
            x = dropout(self.activation(batch_norm(linear(x))))

        return self.fc_latent(x)

In [None]:
class Decoder(nn.Module):
    """Fully connected decoder: latent_dim -> 16 -> 48 -> 64 -> 96 -> input_dim.

    Each of the four hidden stages applies Linear -> BatchNorm1d -> ReLU ->
    Dropout(0.2). The output layer is a plain Linear projection back to
    ``input_dim`` features.

    The output always has shape ``(batch, input_dim)``. The result is
    deliberately not squeezed: squeezing would drop the batch dimension for a
    single-sample batch and the feature dimension when ``input_dim == 1``.
    """

    def __init__(self, latent_dim, input_dim):
        super(Decoder, self).__init__()

        self.fc1 = nn.Linear(latent_dim, 16)
        self.bn1 = nn.BatchNorm1d(16)
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(16, 48)
        self.bn2 = nn.BatchNorm1d(48)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(48, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.2)

        self.fc4 = nn.Linear(64, 96)
        self.bn4 = nn.BatchNorm1d(96)
        self.dropout4 = nn.Dropout(0.2)

        self.fc_output = nn.Linear(96, input_dim)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Reconstruct feature vectors from a batch of latent encodings."""
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
            (self.fc4, self.bn4, self.dropout4),
        )
        for linear, batch_norm, dropout in hidden_stages:
            x = linear(x)
            x = batch_norm(x)
            x = self.activation(x)
            x = dropout(x)

        # Keep the (batch, input_dim) shape exactly as produced by the layer.
        reconstructed_data = self.fc_output(x)
        return reconstructed_data

In [None]:
class AutoEncoder(nn.Module):
    """Autoencoder that chains an ``Encoder`` and a ``Decoder``.

    ``forward`` returns a reconstruction with exactly the same shape as its
    input.
    """

    def __init__(self, input_dim, latent_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = Encoder(input_dim, latent_dim)
        self.decoder = Decoder(latent_dim, input_dim)

    def forward(self, x):
        """Encode ``x`` to the latent space and decode it back."""
        latent_encoding = self.encoder(x)
        reconstructed_data = self.decoder(latent_encoding)
        # Restore the input's shape instead of squeezing: a blanket squeeze()
        # collapses the batch dimension for a single sample (and the feature
        # dimension for a single feature), which breaks per-row comparisons
        # against the input.
        return reconstructed_data.reshape(x.shape)

In [None]:
# Instantiate the autoencoder for the current feature width and latent size.
model = AutoEncoder(input_dim=input_dim, latent_dim=latent_dim)

In [None]:
# Convert the four split arrays to float32 tensors in one pass.
x_train, x_test, y_train, y_test = (
    torch.tensor(split_array, dtype=torch.float32)
    for split_array in (x_train, x_test, y_train, y_test)
)

In [None]:
# Pair every feature row with its label so both travel together in a batch.
train_dataset, test_dataset = (
    TensorDataset(x_train, y_train),
    TensorDataset(x_test, y_test),
)

In [None]:
# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

In [None]:
# Move the model to the target device first, then build the optimizer over its
# parameters.
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Mean absolute error between the reconstruction and the original input.
criterion = nn.L1Loss()

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # The class labels delivered by the loader are deliberately ignored: an
    # autoencoder is trained to reproduce its own input. Using the (broadcast)
    # labels as the regression target would leak the label into the
    # reconstruction and make the reconstruction error meaningless as an
    # anomaly score.
    for inputs, _ in train_loader:
        inputs = inputs.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        # reshape keeps prediction and target aligned even if the model
        # squeezed a singleton dimension.
        loss = criterion(outputs.reshape(inputs.shape), inputs)

        loss.backward()
        optimizer.step()

        # Accumulate the per-batch mean loss.
        running_loss += loss.item()

    # Report the average of the per-batch losses for this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

In [None]:
import torch.nn.functional as F

# Inference mode: dropout is disabled and batch-norm uses its running stats.
model.eval()

# as_tensor accepts both a tensor (first run) and a NumPy array (re-run of this
# cell, after x_test has been rebound below) without the copy-construct
# warning that torch.tensor(tensor) emits.
x_test_tensor = torch.as_tensor(x_test, dtype=torch.float32).to(device)

# From here on x_test is a NumPy array so it can be compared element-wise with
# the reconstruction.
x_test = x_test_tensor.cpu().numpy()

with torch.no_grad():
    x_test_recon = model(x_test_tensor).cpu().numpy()

# Per-row mean squared reconstruction error, used as the anomaly score.
reconstruction_scores = np.mean((x_test - x_test_recon) ** 2, axis=1)

anomaly_data = pd.DataFrame({'recon_score': reconstruction_scores})

print(anomaly_data.describe())

# Histogram of the scores, restricted to a fixed range on the x axis.
plt.xlabel('Reconstruction Score')
anomaly_data['recon_score'].plot(kind='hist',bins=20, range=[-0.001, 0.005])

In [None]:
def convert_label_to_binary(label_encoder, labels):
    """Collapse integer-encoded class labels to 0 (normal) / 1 (anomalous).

    Parameters
    ----------
    label_encoder : object with a ``classes_`` array
        Fitted encoder; the position of 'Benign' in ``classes_`` is the code
        of the normal class. Raises ``IndexError`` if 'Benign' is absent.
    labels : torch.Tensor or array-like
        Encoded labels. The input is not modified.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``labels``: 0 where the
        label equals the 'Benign' code, 1 everywhere else.
    """
    normal_idx = np.where(label_encoder.classes_ == 'Benign')[0][0]

    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    encoded = np.asarray(labels)

    # Build the result from a single comparison. Rewriting the array in two
    # in-place steps ("!= normal -> 1", then "== normal -> 0") wipes out every
    # anomaly whenever the 'Benign' code itself is 1.
    return (encoded != normal_idx).astype(encoded.dtype)


# Binary ground truth for the test split: 0 = normal, 1 = anomalous.
binary_labels = convert_label_to_binary(label_encoder=le, labels=y_test)

# Store the ground truth next to the reconstruction scores.
anomaly_data['binary_labels'] = binary_labels

# Summary statistics of the scores, separately for each class.
per_class_summary = anomaly_data.groupby('binary_labels').describe()
print(per_class_summary)

In [None]:
# ROC curve of the reconstruction score against the binary ground truth.
fpr, tpr, thresholds = roc_curve(binary_labels, reconstruction_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(fpr, tpr, lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='lime', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Pick the ROC point that maximises (true positive rate - false positive rate).
optimal_threshold_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_threshold_idx]
print("Optimal Threshold:", optimal_threshold)

# Use the optimal threshold for predictions. roc_curve counts a sample as
# positive when score >= threshold, so the same inclusive comparison is used
# here; a strict '>' would drop the samples sitting exactly on the threshold
# and yield a different operating point than the one selected above.
pred_labels = (reconstruction_scores >= optimal_threshold).astype(int)

# Alternative, purely statistical cut-off (mean + 5 standard deviations of the
# scores). It is only reported; the predictions above do not use it.
recon_mean = np.mean(reconstruction_scores)
recon_stddev = np.std(reconstruction_scores)
stats_threshold = recon_mean + 5 * recon_stddev
print("Statistical Threshold:", stats_threshold)

results = confusion_matrix(binary_labels, pred_labels)

In [None]:
def plot_confusion_matrix(cm, target_names, title='Confusion Matrix', cmap=plt.cm.Greens):
    """Draw a confusion matrix as a heat map on the current matplotlib figure.

    cm           : 2-D array of counts, indexed as cm[true][predicted].
    target_names : tick labels, used for both axes.
    title        : figure title.
    cmap         : colour map passed to imshow.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # The same class names label the columns (rotated) and the rows.
    tick_positions = np.arange(len(target_names))
    plt.xticks(tick_positions, target_names, rotation=45)
    plt.yticks(tick_positions, target_names)
    plt.tight_layout()

    # Write every count in the centre of its cell; xy is (column, row).
    n_rows, n_cols = cm.shape
    for row, col in np.ndindex(n_rows, n_cols):
        plt.annotate(
            str(cm[row][col]),
            xy=(col, row),
            horizontalalignment='center',
            verticalalignment='center',
        )

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

# Render the confusion matrix, then show the raw counts as the cell output.
plot_confusion_matrix(results, target_names=['Normal', 'Anomaly'])
results

In [None]:
# Serialise the whole model object (not just its state_dict) to model.pth.
torch.save(obj=model, f="model.pth")