In [1]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Step 2: Load the Dataset
data = pd.read_csv("/Users/supak/Desktop/DDoS 2018/02-15-2018.csv")
print(data.head())
# Randomly sample 1/10 of the data
data = data.sample(frac=0.1, random_state=42)  # frac=0.1 means 10%, random_state ensures reproducibility
# Preview the sampled data
print(data.head())

   Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
0         0         0  15/02/2018 08:25:18      112641158             3   
1        22         6  15/02/2018 08:29:05       37366762            14   
2     47514         6  15/02/2018 08:29:42            543             2   
3         0         0  15/02/2018 08:28:07      112640703             3   
4         0         0  15/02/2018 08:30:56      112640874             3   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0             0                0                0                0   
1            12             2168             2993              712   
2             0               64                0               64   
3             0                0                0                0   
4             0                0                0                0   

   Fwd Pkt Len Min  ...  Fwd Seg Size Min  Active Mean     Active Std  \
0                0  ...                 0          0.0 

In [3]:
# Step 3: Preprocess the Data
data.columns = data.columns.str.strip()
data = data.dropna()
# Encode the target column ('Label')
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])
# Select only numeric columns for scaling
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns.remove('Label')  # Exclude the target column
# Check for infinite or extremely large values
data[numeric_columns] = data[numeric_columns].replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=numeric_columns)
# Scale the numeric feature columns
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
X = data[numeric_columns]
y = data['Label']

In [4]:
# Step 4: Split the Dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [5]:
# Step 5: Initialize Random Forest (can add more parameters)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
# Step 6: Train the Model
rf_model.fit(X_train, y_train)


In [7]:
# Step 7: Validate the Model
y_val_pred = rf_model.predict(X_val)
print("Validation Results")
print(classification_report(y_val, y_val_pred))

Validation Results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14828
           1       1.00      1.00      1.00       602
           2       1.00      1.00      1.00       176

    accuracy                           1.00     15606
   macro avg       1.00      1.00      1.00     15606
weighted avg       1.00      1.00      1.00     15606



In [8]:
# Step 8: Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}


In [9]:
# Step 9: Test the Model
y_test_pred = rf_model.predict(X_test)
print("Test Results")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("Accuracy:", accuracy_score(y_test, y_test_pred))

Test Results
[[14828     1     0]
 [    0   618     0]
 [    0     0   160]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14829
           1       1.00      1.00      1.00       618
           2       1.00      1.00      1.00       160

    accuracy                           1.00     15607
   macro avg       1.00      1.00      1.00     15607
weighted avg       1.00      1.00      1.00     15607

Accuracy: 0.9999359261869674
