Network Activity Anomaly Detection, by Consulting and Analytics Club, IIT Guwahati
    By- Aadeep Aggarwal

Step 1: Importing Libraries

    Importing all necessary libraries for data processing, model building, and evaluation.


In [1]:
# Importing all necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten


Step 2: Loading the Data

    Loading the training and testing datasets and displaying the first few rows of the training dataset.


In [2]:
# Load the training and testing datasets
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

# Display the first few rows of the training dataset
train_data.head()


Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag,attack
0,0,tcp,netbios_dgm,REJ,0,0,0,0,0,0,...,0.06,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21,neptune
1,0,tcp,smtp,SF,1239,400,0,0,0,0,...,0.45,0.04,0.0,0.0,0.11,0.0,0.02,0.0,18,normal
2,0,tcp,http,SF,222,945,0,0,0,0,...,1.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,21,normal
3,0,tcp,http,SF,235,1380,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,normal
4,0,tcp,uucp_path,REJ,0,0,0,0,0,0,...,0.01,0.08,0.0,0.0,0.0,0.0,1.0,1.0,19,neptune


Step 3: Preprocessing the Data

    One-Hot Encoding

        Performing one-hot encoding on categorical columns to convert text data into numerical data and aligning the test dataset columns to match the training dataset.


In [3]:
# Perform one-hot encoding on categorical columns in the training set
train_data = pd.get_dummies(train_data, columns=['protocoltype', 'service', 'flag', 'lastflag'])

# Perform one-hot encoding on categorical columns in the test set
test_data = pd.get_dummies(test_data, columns=['protocoltype', 'service', 'flag', 'lastflag'])

# Align the columns of the test set to match the training set
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)


    Separating Features and Labels

        Separating the features and labels for the training data and preparing the test data for prediction.


In [5]:
# Separate the features and labels for training
X_train = train_data.drop(columns=['attack'])
y_train = train_data['attack'].apply(lambda x: 1 if x == 'neptune' else 0)

# Separate the features for testing
X_test = test_data.drop(columns=['attack'])


In [4]:
# Separate the features and labels for training in bool type
X_train = train_data.drop(columns=['attack'])
y_train = train_data['attack'].apply(lambda x: True if x == 'neptune' else False)

# Separate the features for testing
X_test = test_data.drop(columns=['attack'])


Step 4: Engineering Features

 Scaling Features

    Normalizing the features for better model performance.


In [5]:
# Normalize the features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Reducing Dimensionality 

    Using PCA for dimensionality reduction to reduce the number of features.


In [6]:
# Use PCA for dimensionality reduction
pca = PCA(n_components=30)  # Number of components can be tuned
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


Step 5: Training the Model

    Using Convolutional Neural Network

        Defining and training a Convolutional Neural Network (CNN) for binary classification.


In [7]:
# Reshape data for Conv1D
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Define the model architecture
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step - accuracy: 0.9953 - loss: 0.0228 - val_accuracy: 0.9997 - val_loss: 0.0018
Epoch 2/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0023 - val_accuracy: 0.9999 - val_loss: 0.0015
Epoch 3/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.9997 - loss: 0.0014 - val_accuracy: 0.9999 - val_loss: 0.0012
Epoch 4/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 1.0000 - loss: 1.8663e-04 - val_accuracy: 0.9998 - val_loss: 0.0034
Epoch 5/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.9998 - loss: 0.0014 - val_accuracy: 0.9998 - val_loss: 0.0017
Epoch 6/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 14ms/step - accuracy: 1.0000 - loss: 6.6977e-05 - val_accuracy: 0.9998 - val_loss: 0.0022
E

<keras.src.callbacks.history.History at 0x2aa49ae00b0>

Step 6: Performing Cross-Validation 

    Setting up and performing K-Fold cross-validation to evaluate model performance.


In [12]:
# Set up K-Fold cross-validation for integer type
kf = KFold(n_splits=5)
cv_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train_reshaped[train_index], X_train_reshaped[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    # Train the model on each fold
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32)
    
    # Make predictions on the validation fold
    y_val_pred_prob = model.predict(X_val_fold)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)
    
    # Calculate and store the accuracy score
    cv_scores.append(accuracy_score(y_val_fold, y_val_pred))

# Print the cross-validation scores
print(f'Cross-validation scores: {cv_scores}')


Epoch 1/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.9999 - loss: 6.8681e-04
Epoch 2/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 14ms/step - accuracy: 1.0000 - loss: 1.5589e-06
Epoch 3/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 15ms/step - accuracy: 1.0000 - loss: 1.8757e-07
Epoch 4/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 15ms/step - accuracy: 1.0000 - loss: 4.2408e-08
Epoch 5/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 14ms/step - accuracy: 1.0000 - loss: 2.5855e-08
Epoch 6/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 15ms/step - accuracy: 1.0000 - loss: 1.7224e-08
Epoch 7/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 13ms/step - accuracy: 1.0000 - loss: 7.9447e-09
Epoch 8/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 12ms/step - accuracy: 1.0

In [8]:
# Set up K-Fold cross-validation
kf = KFold(n_splits=5)
cv_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train_reshaped[train_index], X_train_reshaped[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    # Train the model on each fold
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32)
    
    # Make predictions on the validation fold
    y_val_pred_prob = model.predict(X_val_fold)
    y_val_pred = (y_val_pred_prob > 0.5).astype(bool)
    
    # Calculate and store the accuracy score
    cv_scores.append(accuracy_score(y_val_fold, y_val_pred))

# Print the cross-validation scores
print(f'Cross-validation scores: {cv_scores}')


Epoch 1/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 1.0000 - loss: 7.6805e-04
Epoch 2/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0021
Epoch 3/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.9999 - loss: 4.1147e-04
Epoch 4/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 1.0000 - loss: 2.5193e-05
Epoch 5/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.9999 - loss: 7.0693e-04
Epoch 6/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.9998 - loss: 8.2106e-04
Epoch 7/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 7ms/step - accuracy: 0.9999 - loss: 4.3547e-04
Epoch 8/10
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 1.0000 - loss:

Step 7: Making Predictions

    Using the trained model to make predictions on the test set and saving the results to a CSV file.


In [9]:
# Make predictions on the test set
y_test_pred = model.predict(X_test_reshaped)
y_test_pred = [1 if pred == 1 else 0 for pred in y_test_pred]

# Save predictions to a CSV file
submission_aadeep = pd.DataFrame({'attack': y_test_pred})
submission_aadeep.to_csv('submission_aadeep.csv', index=False)


[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
