In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/coordinated-attack-dataset-for-cids-nf/CoAt_NF-UQ-NIDS-V2.parquet


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Load dataset
df = pd.read_parquet('/kaggle/input/coordinated-attack-dataset-for-cids-nf/CoAt_NF-UQ-NIDS-V2.parquet', engine='pyarrow')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43642589 entries, 2 to 62672010
Data columns (total 43 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   L4_SRC_PORT                  int16  
 1   L4_DST_PORT                  int16  
 2   PROTOCOL                     int16  
 3   L7_PROTO                     float32
 4   IN_BYTES                     int32  
 5   IN_PKTS                      int32  
 6   OUT_BYTES                    int32  
 7   OUT_PKTS                     int32  
 8   TCP_FLAGS                    int16  
 9   CLIENT_TCP_FLAGS             int16  
 10  SERVER_TCP_FLAGS             int16  
 11  FLOW_DURATION_MILLISECONDS   int32  
 12  DURATION_IN                  int32  
 13  DURATION_OUT                 int32  
 14  MIN_TTL                      int16  
 15  MAX_TTL                      int16  
 16  LONGEST_FLOW_PKT             int32  
 17  SHORTEST_FLOW_PKT            int16  
 18  MIN_IP_PKT_LEN               int16  
 19  MAX

In [5]:
# Separate features and target
y = df['Attack'].values
df = df.drop(['Label', 'Attack'], axis=1).values

In [6]:
# Split the dataset
df, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=42, stratify=y)

In [7]:
# Standardize the data
scaler = QuantileTransformer(output_distribution='normal')
df = scaler.fit_transform(df)
X_test = scaler.transform(X_test)

In [8]:
# Build a neural network for binary classification
model = Sequential([
    Dense(64, input_dim=df.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# Train the model
history = model.fit(df, y_train, epochs=10, validation_split=0.2, batch_size=32)

Epoch 1/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1251s[0m 2ms/step - accuracy: 0.9667 - loss: 0.0863 - val_accuracy: 0.9759 - val_loss: 0.0940
Epoch 2/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1241s[0m 2ms/step - accuracy: 0.9734 - loss: 0.0702 - val_accuracy: 0.9766 - val_loss: 0.0733
Epoch 3/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1248s[0m 2ms/step - accuracy: 0.9740 - loss: 0.0693 - val_accuracy: 0.9767 - val_loss: 0.0769
Epoch 4/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1245s[0m 2ms/step - accuracy: 0.9739 - loss: 0.0696 - val_accuracy: 0.9756 - val_loss: 0.0839
Epoch 5/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1239s[0m 2ms/step - accuracy: 0.9738 - loss: 0.0695 - val_accuracy: 0.9766 - val_loss: 0.0744
Epoch 6/10
[1m763746/763746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1253s[0m 2ms/step - accuracy: 0.9747 - loss: 0.0684 - val_accuracy: 

In [10]:
# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Metrics and confusion matrix
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

[1m409150/409150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m411s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.98      0.97      0.97   6223147
           1       0.97      0.98      0.98   6869630

    accuracy                           0.98  13092777
   macro avg       0.98      0.98      0.98  13092777
weighted avg       0.98      0.98      0.98  13092777

Confusion Matrix:
[[6016277  206870]
 [ 104623 6765007]]
