In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CIC-ToN-IoT-V2.parquet
/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CIC-IDS2017-V2.parquet
/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CIC-UNSW-NB15_Feeded-V2.parquet
/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CSE-CIC-IDS2018_Feeded.parquet
/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CIC-BoT-IoT-V2.parquet


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Load the CSE-CIC-IDS2018 parquet file from the Kaggle input directory.
DATASET_PATH = '/kaggle/input/coordinated-attack-dataset-for-cids/CoAt_CSE-CIC-IDS2018_Feeded.parquet'
df = pd.read_parquet(DATASET_PATH, engine='pyarrow')

In [4]:
# Print a summary of the loaded frame: index range, column names and dtypes.
# Note: in the captured output for this 6,369,664-row frame, only the column
# name and dtype are listed per column (no non-null counts are shown).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6369664 entries, 0 to 361854
Data columns (total 79 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Protocol                  int8   
 1   Flow Duration             int64  
 2   Total Fwd Packets         int32  
 3   Total Backward Packets    int32  
 4   Fwd Packets Length Total  float64
 5   Bwd Packets Length Total  float64
 6   Fwd Packet Length Max     float64
 7   Fwd Packet Length Min     float32
 8   Fwd Packet Length Mean    float32
 9   Fwd Packet Length Std     float32
 10  Bwd Packet Length Max     float64
 11  Bwd Packet Length Min     float32
 12  Bwd Packet Length Mean    float32
 13  Bwd Packet Length Std     float32
 14  Flow Bytes/s              float64
 15  Flow Packets/s            float64
 16  Flow IAT Mean             float32
 17  Flow IAT Std              float32
 18  Flow IAT Max              float64
 19  Flow IAT Min              float64
 20  Fwd IAT Total             floa

In [5]:
# Feature matrix: every column except the two target columns.
target_columns = ['Label', 'Attack']
X = df.drop(columns=target_columns).values
# Target vector: the 'Attack' column.
y = df['Attack'].values

In [6]:
# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded so it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Build a feed-forward neural network for binary classification.
#
# The input shape is declared with an explicit Input layer instead of passing
# `input_dim` to the first Dense layer: Keras 3 emits a UserWarning for
# `input_dim`/`input_shape` on layers inside a Sequential model and recommends
# an Input(shape) object as the first entry.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(64, activation='relu'),
    Dropout(0.5),                               # drop half the units while training
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),             # single sigmoid output unit
])

# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# Fit the network for 10 epochs in mini-batches of 32, with 20% of the
# training data held out for per-epoch validation. The returned History
# object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 2ms/step - accuracy: 0.9701 - loss: 0.1075 - val_accuracy: 0.9769 - val_loss: 0.0847
Epoch 2/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 2ms/step - accuracy: 0.9759 - loss: 0.0907 - val_accuracy: 0.9768 - val_loss: 0.0852
Epoch 3/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 2ms/step - accuracy: 0.9763 - loss: 0.0881 - val_accuracy: 0.9782 - val_loss: 0.0813
Epoch 4/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 2ms/step - accuracy: 0.9766 - loss: 0.0886 - val_accuracy: 0.9784 - val_loss: 0.0808
Epoch 5/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 2ms/step - accuracy: 0.9770 - loss: 0.0865 - val_accuracy: 0.9797 - val_loss: 0.0802
Epoch 6/10
[1m111470/111470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 2ms/step - accuracy: 0.9777 - loss: 0.0873 - val_accuracy: 0.9794

In [10]:
# Predict on the held-out test split, then turn the sigmoid outputs into
# hard 0/1 labels by thresholding at 0.5.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob.ravel() > 0.5).astype(int)

# Per-class precision/recall/F1, followed by the confusion matrix.
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

[1m59716/59716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.97      1.00      0.99   1598703
           1       1.00      0.85      0.92    312197

    accuracy                           0.98   1910900
   macro avg       0.99      0.92      0.95   1910900
weighted avg       0.98      0.98      0.97   1910900

Confusion Matrix:
[[1598423     280]
 [  46898  265299]]
