In [1]:
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

Benign-Monday-no-metadata.parquet → Normal traffic (no attack)

Botnet-Friday-no-metadata.parquet → Botnet attacks

Bruteforce-Tuesday-no-metadata.parquet → Brute force attacks (usually SSH/FTP brute force)

DDoS-Friday-no-metadata.parquet → Distributed Denial of Service

DoS-Wednesday-no-metadata.parquet → Denial of Service (single source)

Infiltration-Thursday-no-metadata.parquet → Infiltration attacks (malware/backdoors)

Portscan-Friday-no-metadata.parquet → Port scanning attacks

WebAttacks-Thursday-no-metadata.parquet → Web attacks (SQL injection, XSS, Command injection, etc.)

In [2]:
print("Loading dataset...")

data_path = "C:/Users/vipul/Downloads/intrudtion"  # update this with actual folder path

# List of parquet files
parquet_files = [
    "Benign-Monday-no-metadata.parquet",
    "Botnet-Friday-no-metadata.parquet",
    "Bruteforce-Tuesday-no-metadata.parquet",
    "DDoS-Friday-no-metadata.parquet",
    "DoS-Wednesday-no-metadata.parquet",
    "Infiltration-Thursday-no-metadata.parquet",
    "Portscan-Friday-no-metadata.parquet",
    "WebAttacks-Thursday-no-metadata.parquet"
]

# Load and label datasets
dataframes = []
for file in parquet_files:
    file_path = os.path.join(data_path, file)
    df = pd.read_parquet(file_path)

    # Extract label from file name
    label = file.split("-")[0]  # e.g. "Benign", "Botnet", "DDoS"...
    df["label"] = label

    dataframes.append(df)

# Combine all datasets
full_df = pd.concat(dataframes, ignore_index=True)
print("Combined dataset shape:", full_df.shape)

Loading dataset...
Combined dataset shape: (2313810, 79)


In [3]:
print("Preprocessing data...")

# Drop duplicate rows
full_df.drop_duplicates(inplace=True)

# Handle missing values
print("Missing values per column:\n", full_df.isnull().sum().sort_values(ascending=False).head())
data = full_df.dropna()

Preprocessing data...
Missing values per column:
 Protocol                    0
Flow Duration               0
Total Fwd Packets           0
Total Backward Packets      0
Fwd Packets Length Total    0
dtype: int64


In [4]:
data.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,label
0,6,4,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,Benign
1,6,1,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,Benign
2,6,3,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,Benign
3,6,1,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,Benign
4,6,609,7,4,484,414,233,0,69.14286,111.967896,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,Benign


In [5]:
data['Label'].value_counts()

Label
Benign                        1977318
DoS Hulk                       172846
DDoS                           128014
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
PortScan                         1956
Web Attack � Brute Force         1470
Bot                              1437
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [7]:
data['Label'] = data['Label'].str.lower()
data['Label'] = data['Label'].str.strip()

In [8]:
data['Label'].value_counts()

Label
benign                        1977318
dos hulk                       172846
ddos                           128014
dos goldeneye                   10286
ftp-patator                      5931
dos slowloris                    5385
dos slowhttptest                 5228
ssh-patator                      3219
portscan                         1956
web attack � brute force         1470
bot                              1437
web attack � xss                  652
infiltration                       36
web attack � sql injection         21
heartbleed                         11
Name: count, dtype: int64

In [12]:
data['Label'] = data['Label'].apply(lambda x: x if x in ['benign' , 'dos hulk' , 'ddos' , 'dos goldeneye'] else 'other_attacks')

In [13]:
data['Label'].value_counts()

Label
benign           1977318
dos hulk          172846
ddos              128014
other_attacks      25346
dos goldeneye      10286
Name: count, dtype: int64

In [14]:
data.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Fla

In [16]:
# Separate features and label
#target_column = "label"  # Adjust if your dataset has a different target column
X = data.drop(columns=["Label", "label"])
y = data["Label"]

# Encode categorical labels
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
X.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Fla

In [18]:
X.head(2)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Act Data Packets,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,6,4,2,0,12,0,6,6,6.0,0.0,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0
1,6,1,2,0,12,0,6,6,6.0,0.0,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0


In [19]:
# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =============================
# 3. Train-Test Split
# =============================
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)
print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

Train shape: (1619667, 77)  Test shape: (694143, 77)


In [20]:
print("Training Logistic Regression...")
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=500, n_jobs=-1)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

'''
print("Training Random Forest...")
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
'''

Training Logistic Regression...


'\nprint("Training Random Forest...")\nfrom sklearn.ensemble import RandomForestClassifier\nrf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)\nrf.fit(X_train, y_train)\ny_pred_rf = rf.predict(X_test)\n'

In [22]:
print("\nModel Performance:")
print("Accuracy:", accuracy_score(y_pred_lr, y_test))
print("Classification Report:\n", classification_report(y_pred_lr, y_test))


Model Performance:
Accuracy: 0.9852393526982193
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    599260
           1       0.97      1.00      0.98     37467
           2       0.85      0.97      0.90      2697
           3       0.94      0.97      0.96     50342
           4       0.52      0.91      0.66      4377

    accuracy                           0.99    694143
   macro avg       0.86      0.97      0.90    694143
weighted avg       0.99      0.99      0.99    694143

