In [2]:
# Cell 1: Install imbalanced-learn (for SMOTE, ADASYN etc.)
!pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\sharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# Cell 1: Import necessary libraries & set display options

import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns

# For balancing later
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Show more columns while exploring
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)

# Path to datasets folder
DATASET_PATH = "datasets"

# List all CSV files (sorted by date)
csv_files = sorted(glob.glob(os.path.join(DATASET_PATH, "*.csv")))

print("Found", len(csv_files), "files")
print(csv_files[:5])  # preview first 5 files


Found 9 files
['datasets\\02-14-2018.csv', 'datasets\\02-15-2018.csv', 'datasets\\02-16-2018.csv', 'datasets\\02-21-2018.csv', 'datasets\\02-22-2018.csv']


In [4]:
# Cell 2: Load a sample CSV to explore structure and memory usage

sample_file = csv_files[0]  # pick the first file
print("Loading sample file:", sample_file)

# Load with low_memory and no dtype guessing issues
df_sample = pd.read_csv(sample_file, low_memory=False)

print("Shape:", df_sample.shape)
print("\nColumns:", df_sample.columns.tolist())
print("\nData Types:\n", df_sample.dtypes)
print("\nMemory Usage (MB):", round(df_sample.memory_usage(deep=True).sum() / 1024**2, 2))

# Preview top rows
df_sample.head()


Loading sample file: datasets\02-14-2018.csv
Shape: (1048575, 80)

Columns: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Se

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320860.0,139.300036,56320958,56320761,112641719,56320860.0,139.300036,56320958,56320761,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320730.0,114.551299,56320814,56320652,112641466,56320730.0,114.551299,56320814,56320652,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026634,56319310.0,301.934596,56319525,56319098,112638623,56319310.0,301.934596,56319525,56319098,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026634,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,82.6,196.741237,976,0,227.3,371.677892,544.161528,3.873587,268915.2,247443.778966,673900,22,6453966,460997.6,123109.423588,673900,229740,5637902,626433.555556,455082.214224,1167293,554,0,0,0,0,488,328,2.324152,1.549435,0,976,135.076923,277.83476,77192.153846,0,0,0,1,0,0,0,0,0,140.48,82.6,227.3,0,0,0,0,0,0,15,1239,10,2273,65535,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,81.642857,203.745545,976,0,200.818182,362.249864,380.733175,2.839597,366836.1,511356.609733,1928102,21,8804066,677235.8,532416.970959,1928102,246924,7715481,771548.1,755543.082717,2174893,90,0,0,0,0,456,360,1.590174,1.249423,0,976,128.923077,279.763032,78267.353846,0,0,0,1,0,0,0,0,0,134.08,81.642857,200.818182,0,0,0,0,0,0,14,1143,11,2209,5808,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [5]:
# Cell 3: Efficient loading of all CSVs with dtype optimization

def optimize_dtypes(df):
    """
    Downcast numerical columns to save memory.
    """
    for col in df.select_dtypes(include=["int64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in df.select_dtypes(include=["float64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    return df

# Load and optimize all datasets
all_data = []
for file in csv_files:
    print(f"Loading {file} ...")
    chunk = pd.read_csv(file, low_memory=False)
    chunk = optimize_dtypes(chunk)
    all_data.append(chunk)

# Concatenate into one DataFrame
df = pd.concat(all_data, ignore_index=True)

print("\nFinal Shape:", df.shape)
print("Memory Usage (MB):", round(df.memory_usage(deep=True).sum() / 1024**2, 2))
df.head()


Loading datasets\02-14-2018.csv ...
Loading datasets\02-15-2018.csv ...
Loading datasets\02-16-2018.csv ...
Loading datasets\02-21-2018.csv ...
Loading datasets\02-22-2018.csv ...
Loading datasets\02-23-2018.csv ...
Loading datasets\02-28-2018.csv ...
Loading datasets\03-01-2018.csv ...
Loading datasets\03-02-2018.csv ...

Final Shape: (8284254, 80)
Memory Usage (MB): 26198.08


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320859.5,139.300036,56320958,56320761,112641719,56320859.5,139.300036,56320958,56320761,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320733.0,114.551299,56320814,56320652,112641466,56320733.0,114.551299,56320814,56320652,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026634,56319311.5,301.934596,56319525,56319098,112638623,56319311.5,301.934596,56319525,56319098,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026634,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,82.599998,196.741241,976,0,227.300003,371.677887,544.161528,3.873587,268915.25,247443.778966,673900,22,6453966,460997.571429,123109.423588,673900,229740,5637902,626433.555556,455082.214224,1167293,554,0,0,0,0,488,328,2.324152,1.549435,0,976,135.07692,277.834747,77192.153846,0,0,0,1,0,0,0,0,0,140.479996,82.599998,227.300003,0,0,0,0,0,0,15,1239,10,2273,65535,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,81.64286,203.745544,976,0,200.818176,362.249878,380.733175,2.839597,366836.083333,511356.609733,1928102,21,8804066,677235.846154,532416.970959,1928102,246924,7715481,771548.1,755543.082717,2174893,90,0,0,0,0,456,360,1.590174,1.249423,0,976,128.92308,279.763031,78267.353846,0,0,0,1,0,0,0,0,0,134.080002,81.64286,200.818176,0,0,0,0,0,0,14,1143,11,2209,5808,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [6]:
# Cell 4: Basic preprocessing (NaN handling, Timestamp parsing, Label encoding)

# 1. Check missing values
missing_summary = df.isnull().sum()
print("Missing values per column (top 10):\n", missing_summary.sort_values(ascending=False).head(10))

# 2. Parse Timestamp to datetime + extract time-based features
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

df["Year"] = df["Timestamp"].dt.year
df["Month"] = df["Timestamp"].dt.month
df["Day"] = df["Timestamp"].dt.day
df["Hour"] = df["Timestamp"].dt.hour
df["Minute"] = df["Timestamp"].dt.minute
df["Second"] = df["Timestamp"].dt.second

# 3. Encode Label (Normal vs Attack types etc.)
df["Label"] = df["Label"].astype("category")
df["Label_Code"] = df["Label"].cat.codes

print("\nUnique Labels:", df["Label"].unique())
print("Label Mapping:", dict(enumerate(df["Label"].cat.categories)))

# 4. Final memory check
print("\nMemory Usage (MB):", round(df.memory_usage(deep=True).sum() / 1024**2, 2))

# Preview after preprocessing
df.head()


Missing values per column (top 10):
 Flow Byts/s        22954
Dst Port               0
Timestamp              0
Protocol               0
Tot Fwd Pkts           0
Tot Bwd Pkts           0
TotLen Fwd Pkts        0
Flow Duration          0
Fwd Pkt Len Max        0
Fwd Pkt Len Min        0
dtype: int64

Unique Labels: ['Benign', 'FTP-BruteForce', 'SSH-Bruteforce', 'DoS attacks-GoldenEye', 'DoS attacks-Slowloris', ..., 'Brute Force -Web', 'Brute Force -XSS', 'SQL Injection', 'Infilteration', 'Bot']
Length: 15
Categories (15, object): ['Benign', 'Bot', 'Brute Force -Web', 'Brute Force -XSS', ..., 'Infilteration', 'Label', 'SQL Injection', 'SSH-Bruteforce']
Label Mapping: {0: 'Benign', 1: 'Bot', 2: 'Brute Force -Web', 3: 'Brute Force -XSS', 4: 'DDOS attack-HOIC', 5: 'DDOS attack-LOIC-UDP', 6: 'DoS attacks-GoldenEye', 7: 'DoS attacks-Hulk', 8: 'DoS attacks-SlowHTTPTest', 9: 'DoS attacks-Slowloris', 10: 'FTP-BruteForce', 11: 'Infilteration', 12: 'Label', 13: 'SQL Injection', 14: 'SSH-Bruteforce

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Year,Month,Day,Hour,Minute,Second,Label_Code
0,0,0,2018-02-14 08:31:01,112641719,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320859.5,139.300036,56320958,56320761,112641719,56320859.5,139.300036,56320958,56320761,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign,2018.0,2.0,14.0,8.0,31.0,1.0,0
1,0,0,2018-02-14 08:33:50,112641466,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026633,56320733.0,114.551299,56320814,56320652,112641466,56320733.0,114.551299,56320814,56320652,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026633,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign,2018.0,2.0,14.0,8.0,33.0,50.0,0
2,0,0,2018-02-14 08:36:39,112638623,3,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.026634,56319311.5,301.934596,56319525,56319098,112638623,56319311.5,301.934596,56319525,56319098,0,0.0,0.0,0,0,0,0,0,0,0,0,0.026634,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign,2018.0,2.0,14.0,8.0,36.0,39.0,0
3,22,6,2018-02-14 08:40:13,6453966,15,10,1239,2273,744,0,82.599998,196.741241,976,0,227.300003,371.677887,544.161528,3.873587,268915.25,247443.778966,673900,22,6453966,460997.571429,123109.423588,673900,229740,5637902,626433.555556,455082.214224,1167293,554,0,0,0,0,488,328,2.324152,1.549435,0,976,135.07692,277.834747,77192.153846,0,0,0,1,0,0,0,0,0,140.479996,82.599998,227.300003,0,0,0,0,0,0,15,1239,10,2273,65535,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign,2018.0,2.0,14.0,8.0,40.0,13.0,0
4,22,6,2018-02-14 08:40:23,8804066,14,11,1143,2209,744,0,81.64286,203.745544,976,0,200.818176,362.249878,380.733175,2.839597,366836.083333,511356.609733,1928102,21,8804066,677235.846154,532416.970959,1928102,246924,7715481,771548.1,755543.082717,2174893,90,0,0,0,0,456,360,1.590174,1.249423,0,976,128.92308,279.763031,78267.353846,0,0,0,1,0,0,0,0,0,134.080002,81.64286,200.818176,0,0,0,0,0,0,14,1143,11,2209,5808,233,6,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign,2018.0,2.0,14.0,8.0,40.0,23.0,0


In [8]:
# Cell 5 (Fix): Ensure numeric columns before feature engineering

# List of columns we need for interaction features
num_cols = ["Tot Fwd Pkts", "Tot Bwd Pkts", "TotLen Fwd Pkts", "TotLen Bwd Pkts", 
            "Flow Duration", "Flow Byts/s"]

# Convert to numeric (force errors to NaN, then fill with 0)
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

# 1. Fill missing values (already handled but double-check Flow Byts/s)
df["Flow Byts/s"].fillna(0, inplace=True)

# 2. Interaction Features (ratios and differences)
df["Pkt_Ratio_Fwd_Bwd"] = (df["Tot Fwd Pkts"] + 1) / (df["Tot Bwd Pkts"] + 1)
df["Len_Ratio_Fwd_Bwd"] = (df["TotLen Fwd Pkts"] + 1) / (df["TotLen Bwd Pkts"] + 1)

df["Pkt_Diff_Fwd_Bwd"] = df["Tot Fwd Pkts"] - df["Tot Bwd Pkts"]
df["Len_Diff_Fwd_Bwd"] = df["TotLen Fwd Pkts"] - df["TotLen Bwd Pkts"]

# 3. Rolling Features (group by Hour to capture traffic bursts)
df = df.sort_values("Timestamp")

df["Fwd_Pkts_RollMean"] = df.groupby("Hour")["Tot Fwd Pkts"].transform(lambda x: x.rolling(1000, min_periods=1).mean())
df["Bwd_Pkts_RollMean"] = df.groupby("Hour")["Tot Bwd Pkts"].transform(lambda x: x.rolling(1000, min_periods=1).mean())
df["Flow_Duration_RollStd"] = df.groupby("Hour")["Flow Duration"].transform(lambda x: x.rolling(1000, min_periods=1).std())

print("✅ Feature engineering successful. New columns added.")
df.head()


✅ Feature engineering successful. New columns added.


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Year,Month,Day,Hour,Minute,Second,Label_Code,Pkt_Ratio_Fwd_Bwd,Len_Ratio_Fwd_Bwd,Pkt_Diff_Fwd_Bwd,Len_Diff_Fwd_Bwd,Fwd_Pkts_RollMean,Bwd_Pkts_RollMean,Flow_Duration_RollStd
410956,0,0,1970-01-05 03:01:17,-11873000000.0,2.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,-0.000168,-11873000000.0,0.0,-11873000000,-11873000000,-11873000000,-11873000000.0,0.0,-11873000000,-11873000000,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,2,0,0,0,-1,-1,0,0,0.0,0.0,0,0,0.0,0.0,0,0,Benign,1970.0,1.0,5.0,3.0,1.0,17.0,0,3.0,1.0,2.0,0.0,2.0,0.0,
410957,0,0,1970-01-08 07:32:33,-681402000000.0,3.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,-4e-06,-340701000000.0,256344592956.434,-159438000000,-521964000000,-681402000000,-340701000000.0,256344592956.434,-159438000000,-521964000000,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0.0,0.0,0,0,0.0,0.0,0,0,Benign,1970.0,1.0,8.0,7.0,32.0,33.0,0,4.0,1.0,3.0,0.0,3.0,0.0,
4440733,0,0,1970-01-10 03:04:26,-188505000000.0,9.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,-4.8e-05,-23563125000.0,279482181562.075,324807000000,-449709000000,-188505000000,-23563125000.0,279482181562.075,324807000000,-449709000000,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,9,0,0,0,-1,-1,0,0,0.0,0.0,0,0,197474750000.0,141251680160.804,324807000000,21756000000,Benign,1970.0,1.0,10.0,3.0,4.0,26.0,0,10.0,1.0,9.0,0.0,5.5,0.0,124897700000.0
4440736,0,0,1970-01-11 03:51:32,-828220000000.0,2.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,-2e-06,-828220000000.0,0.0,-828220000000,-828220000000,-828220000000,-828220000000.0,0.0,-828220000000,-828220000000,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,2,0,0,0,-1,-1,0,0,0.0,0.0,0,0,0.0,0.0,0,0,Benign,1970.0,1.0,11.0,3.0,51.0,32.0,0,3.0,1.0,2.0,0.0,4.333333,0.0,429506800000.0
4440735,0,0,1970-01-11 05:12:30,-4834000000.0,43.0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,-0.008895,-115095238.09524,445870232660.056,846458000000,-935931000000,-4834000000,-115095238.09524,445870232660.056,846458000000,-935931000000,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,43,0,0,0,-1,-1,0,0,0.0,0.0,0,0,375976047619.048,262247866338.59897,846458000000,21291000000,Benign,1970.0,1.0,11.0,5.0,12.0,30.0,0,44.0,1.0,43.0,0.0,43.0,0.0,


In [10]:
# Cell 6: Clean features & Apply ADASYN

from collections import Counter
from imblearn.over_sampling import ADASYN

# Drop non-numeric columns (like Timestamp, Ports, IPs, etc.)
X = df.drop(columns=["Label", "Label_Code", "Timestamp"])
X = X.select_dtypes(include=["int64", "float64"])  # keep only numeric features
y = df["Label_Code"]

print("Original Class Distribution:", Counter(y))
print("Shape before balancing:", X.shape)

# Convert to float32 for memory efficiency
X = X.astype("float32")

# Apply ADASYN (adaptive synthetic sampling)
adasyn = ADASYN(sampling_strategy="minority", n_neighbors=5, random_state=42, n_jobs=-1)

X_resampled, y_resampled = adasyn.fit_resample(X, y)

print("Shape after balancing:", X_resampled.shape)
print("Resampled Class Distribution:", Counter(y_resampled))


Original Class Distribution: Counter({0: 6112151, 4: 686012, 7: 461912, 1: 286191, 10: 193360, 14: 187589, 11: 161934, 8: 139890, 6: 41508, 9: 10990, 5: 1730, 2: 611, 3: 230, 13: 87, 12: 59})
Shape before balancing: (8284254, 19)


TypeError: ADASYN.__init__() got an unexpected keyword argument 'n_jobs'

In [12]:
import pandas as pd
import numpy as np

# Remove rows with NaN
X = pd.DataFrame(X).dropna()
y = y[X.index]  # keep y aligned

X = X.values.astype("float32")


In [13]:
print("Any NaN left in X? ", pd.DataFrame(X).isnull().sum().sum())


Any NaN left in X?  0


In [15]:
import numpy as np

print("Any +inf in X? ", np.isinf(X).sum().sum())
print("Any -inf in X? ", np.isneginf(X).sum().sum())


Any +inf in X?  13353
Any -inf in X?  0


In [16]:
X = np.where(np.isinf(X), np.nan, X)   # replace inf with NaN
X = np.nan_to_num(X, nan=0.0)          # replace NaN with 0 (or mean later)


In [17]:
adasyn = ADASYN(sampling_strategy="minority", n_neighbors=5, random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

print("Shape after balancing:", X_resampled.shape)
print("Resampled class distribution:", Counter(y_resampled))


Shape after balancing: (14396206, 19)
Resampled class distribution: Counter({0: 6112139, 13: 6112110, 4: 686012, 7: 461912, 1: 286191, 10: 193360, 14: 187589, 11: 161934, 8: 139890, 6: 41508, 9: 10990, 5: 1730, 2: 611, 3: 230})


In [18]:
adasyn = ADASYN(sampling_strategy={2: 5000, 3: 5000, 5: 5000}, random_state=42)


In [5]:
import glob

# List all parquet files in your data_parquet folder
parquet_files = glob.glob("data_parquet/*.parquet")
print("Found parquet files:", parquet_files)


Found parquet files: []


In [8]:
import pandas as pd
import glob
import os

# Path where your daily CSVs are stored
DATASET_PATH = "datasets"

# Get all CSV files
csv_files = sorted(glob.glob(os.path.join(DATASET_PATH, "*.csv")))
print("Found CSV files:", len(csv_files))

# Load and concatenate
dfs = []
for file in csv_files:
    print(f"Loading {file} ...")
    df = pd.read_csv(file)
    dfs.append(df)

# Combine into one DataFrame
data = pd.concat(dfs, ignore_index=True)

print("Final shape:", data.shape)
print(data.head())


Found CSV files: 9
Loading datasets\02-14-2018.csv ...
Loading datasets\02-15-2018.csv ...
Loading datasets\02-16-2018.csv ...


  df = pd.read_csv(file)


Loading datasets\02-21-2018.csv ...
Loading datasets\02-22-2018.csv ...
Loading datasets\02-23-2018.csv ...
Loading datasets\02-28-2018.csv ...


  df = pd.read_csv(file)


Loading datasets\03-01-2018.csv ...


  df = pd.read_csv(file)


Loading datasets\03-02-2018.csv ...
Final shape: (8284254, 80)
  Dst Port Protocol            Timestamp Flow Duration Tot Fwd Pkts  \
0        0        0  14/02/2018 08:31:01     112641719            3   
1        0        0  14/02/2018 08:33:50     112641466            3   
2        0        0  14/02/2018 08:36:39     112638623            3   
3       22        6  14/02/2018 08:40:13       6453966           15   
4       22        6  14/02/2018 08:40:23       8804066           14   

  Tot Bwd Pkts TotLen Fwd Pkts TotLen Bwd Pkts Fwd Pkt Len Max  \
0            0               0               0               0   
1            0               0               0               0   
2            0               0               0               0   
3           10            1239            2273             744   
4           11            1143            2209             744   

  Fwd Pkt Len Min  ... Fwd Seg Size Min Active Mean Active Std Active Max  \
0               0  ...              

In [12]:
import pandas as pd
import os

CSV_PATH = "datasets"
PARQUET_PATH = "data_parquet"
os.makedirs(PARQUET_PATH, exist_ok=True)

# Loop over CSV files and convert to Parquet
for file in os.listdir(CSV_PATH):
    if file.endswith(".csv"):
        csv_file = os.path.join(CSV_PATH, file)
        parquet_file = os.path.join(PARQUET_PATH, file.replace(".csv", ".parquet"))
        
        print(f"Converting {file} -> {parquet_file}")
        
        # Read CSV as strings to avoid dtype errors
        df = pd.read_csv(csv_file, dtype=str, low_memory=False)
        
        # Save to parquet
        df.to_parquet(parquet_file, engine="fastparquet", index=False)

print("✅ Conversion done. All CSVs saved as Parquet in data_parquet/")


Converting 02-14-2018.csv -> data_parquet\02-14-2018.parquet
Converting 02-15-2018.csv -> data_parquet\02-15-2018.parquet
Converting 02-16-2018.csv -> data_parquet\02-16-2018.parquet
Converting 02-21-2018.csv -> data_parquet\02-21-2018.parquet
Converting 02-22-2018.csv -> data_parquet\02-22-2018.parquet
Converting 02-23-2018.csv -> data_parquet\02-23-2018.parquet
Converting 02-28-2018.csv -> data_parquet\02-28-2018.parquet
Converting 03-01-2018.csv -> data_parquet\03-01-2018.parquet
Converting 03-02-2018.csv -> data_parquet\03-02-2018.parquet
✅ Conversion done. All CSVs saved as Parquet in data_parquet/


In [13]:
import pandas as pd
import glob
import os

# Path to your parquet files
PARQUET_PATH = "data_parquet"

# Get all parquet files
parquet_files = sorted(glob.glob(os.path.join(PARQUET_PATH, "*.parquet")))
print("Found parquet files:", parquet_files)

# Load and concatenate
dfs = []
for file in parquet_files:
    print(f"Loading {file} ...")
    df = pd.read_parquet(file)
    dfs.append(df)

# Merge into single DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

print("Merged shape:", merged_df.shape)
print("Columns:", merged_df.columns.tolist())

# Save merged file
output_file = os.path.join(PARQUET_PATH, "merged_dataset.parquet")
merged_df.to_parquet(output_file, index=False)

print(f"Merged parquet saved to {output_file}")


Found parquet files: ['data_parquet\\02-14-2018.parquet', 'data_parquet\\02-15-2018.parquet', 'data_parquet\\02-16-2018.parquet', 'data_parquet\\02-21-2018.parquet', 'data_parquet\\02-22-2018.parquet', 'data_parquet\\02-23-2018.parquet', 'data_parquet\\02-28-2018.parquet', 'data_parquet\\03-01-2018.parquet', 'data_parquet\\03-02-2018.parquet']
Loading data_parquet\02-14-2018.parquet ...
Loading data_parquet\02-15-2018.parquet ...
Loading data_parquet\02-16-2018.parquet ...
Loading data_parquet\02-21-2018.parquet ...
Loading data_parquet\02-22-2018.parquet ...
Loading data_parquet\02-23-2018.parquet ...
Loading data_parquet\02-28-2018.parquet ...
Loading data_parquet\03-01-2018.parquet ...
Loading data_parquet\03-02-2018.parquet ...
Merged shape: (8284254, 80)
Columns: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 

: 

In [2]:
pip install polars


Collecting polars
  Downloading polars-1.32.3-cp39-abi3-win_amd64.whl.metadata (15 kB)
Downloading polars-1.32.3-cp39-abi3-win_amd64.whl (38.0 MB)
   ---------------------------------------- 0.0/38.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/38.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/38.0 MB ? eta -:--:--
    --------------------------------------- 0.5/38.0 MB 985.5 kB/s eta 0:00:39
    --------------------------------------- 0.8/38.0 MB 987.4 kB/s eta 0:00:38
    --------------------------------------- 0.8/38.0 MB 987.4 kB/s eta 0:00:38
   - -------------------------------------- 1.0/38.0 MB 968.5 kB/s eta 0:00:39
   - -------------------------------------- 1.3/38.0 MB 958.5 kB/s eta 0:00:39
   - -------------------------------------- 1.6/38.0 MB 964.5 kB/s eta 0:00:38
   - -------------------------------------- 1.6/38.0 MB 964.5 kB/s eta 0:00:38
   - -------------------------------------- 1.8/38.0 MB 958.5 kB/s eta 0:00:38
   -- -----


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\sharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import polars as pl
import glob

parquet_files = sorted(glob.glob("data_parquet/*.parquet"))

# Scan (lazy) and concatenate
dfs = [pl.scan_parquet(file) for file in parquet_files]
df = pl.concat(dfs)

# Save final merged parquet
df.sink_parquet("merged_polars.parquet")


In [4]:
!pip install imbalanced-learn





[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\sharm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [7]:
df_small = df.sample(frac=0.1, random_state=42)  # 10% data


In [10]:
print(df.columns.tolist())


['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd By

In [11]:
df_small["Label"].value_counts()


Label
Benign                      61041
DDOS attack-HOIC             6809
DoS attacks-Hulk             4725
Bot                          2852
FTP-BruteForce               1923
SSH-Bruteforce               1909
Infilteration                1642
DoS attacks-SlowHTTPTest     1373
DoS attacks-GoldenEye         431
DoS attacks-Slowloris         106
DDOS attack-LOIC-UDP           18
Brute Force -Web               11
Brute Force -XSS                2
SQL Injection                   1
Name: count, dtype: int64

In [17]:
# Ultra Memory-Efficient SMOTE for 38GB Dataset
# This approach avoids creating copies of the large dataframe

import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import gc

def ultra_efficient_smote(df, sample_size=30000, random_state=42):
    """
    Memory-efficient SMOTE that works with minimal memory overhead
    """
    
    print(f"🎯 Dataset shape: {df.shape}")
    print(f"🔍 Analyzing class distribution...")
    
    # Step 1: Get class distribution without copying data
    label_counts = df['Label'].value_counts()
    print("Class distribution:")
    for label, count in label_counts.items():
        print(f"  {label}: {count:,}")
    
    # Step 2: Create label encoder
    print(f"\n🔤 Creating label encoder...")
    label_encoder = LabelEncoder()
    unique_labels = df['Label'].unique()
    label_encoder.fit(unique_labels)
    
    print("Label mapping:")
    for i, label in enumerate(label_encoder.classes_):
        print(f"  {i}: {label}")
    
    # Step 3: Sample indices directly (most memory efficient)
    print(f"\n📉 Sampling {sample_size:,} indices...")
    
    # Get sample indices without creating intermediate arrays
    total_rows = len(df)
    np.random.seed(random_state)
    
    # Try stratified sampling with minimal memory
    try:
        # Calculate samples per class proportionally
        min_samples_per_class = 50  # Minimum samples per class
        samples_per_class = {}
        
        for label in unique_labels:
            class_count = label_counts[label]
            if class_count < min_samples_per_class:
                samples_per_class[label] = class_count
            else:
                proportion = class_count / total_rows
                samples_per_class[label] = max(min_samples_per_class, 
                                             int(sample_size * proportion))
        
        # Adjust if total exceeds sample_size
        total_samples = sum(samples_per_class.values())
        if total_samples > sample_size:
            ratio = sample_size / total_samples
            samples_per_class = {k: max(1, int(v * ratio)) 
                               for k, v in samples_per_class.items()}
        
        print("Samples per class:", samples_per_class)
        
        # Collect indices for each class
        sample_indices = []
        for label, n_samples in samples_per_class.items():
            if n_samples > 0:
                class_indices = df[df['Label'] == label].index.tolist()
                if len(class_indices) >= n_samples:
                    selected = np.random.choice(class_indices, n_samples, replace=False)
                else:
                    selected = class_indices
                sample_indices.extend(selected)
        
        print(f"Selected {len(sample_indices)} samples")
        
    except Exception as e:
        print(f"⚠️ Stratified sampling failed: {e}")
        print("Using random sampling...")
        sample_indices = np.random.choice(total_rows, sample_size, replace=False)
    
    # Step 4: Extract sample data efficiently
    print(f"\n📊 Extracting sample data...")
    
    # Get sample using .iloc with the indices
    df_sample = df.iloc[sample_indices].copy()
    
    # Encode labels
    y_sample = label_encoder.transform(df_sample['Label'])
    
    # Prepare features (drop non-feature columns)
    feature_cols = [col for col in df_sample.columns 
                   if col not in ['Label', 'Timestamp']]
    X_sample = df_sample[feature_cols].copy()
    
    # Handle categorical columns if any
    categorical_cols = X_sample.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"Converting categorical columns: {list(categorical_cols)}")
        for col in categorical_cols:
            le = LabelEncoder()
            X_sample[col] = le.fit_transform(X_sample[col].astype(str))
    
    # Ensure all numeric
    X_sample = X_sample.select_dtypes(include=[np.number])
    
    print(f"Sample features shape: {X_sample.shape}")
    print(f"Sample class distribution: {dict(Counter(y_sample))}")
    
    # Clear memory
    del df_sample
    gc.collect()
    
    # Step 5: Apply SMOTE
    print(f"\n🔄 Applying SMOTE...")
    
    # Calculate k_neighbors
    class_counts = Counter(y_sample)
    min_class_count = min(class_counts.values())
    k_neighbors = min(5, max(1, min_class_count - 1))
    
    print(f"Using k_neighbors = {k_neighbors}")
    
    try:
        smote = SMOTE(
            sampling_strategy='auto',
            random_state=random_state,
            k_neighbors=k_neighbors,
            n_jobs=1  # Use single core to reduce memory pressure
        )
        
        X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)
        
    except Exception as e:
        print(f"⚠️ SMOTE failed: {e}")
        
        try:
            print("🔄 Trying BorderlineSMOTE...")
            smote = BorderlineSMOTE(
                random_state=random_state,
                k_neighbors=max(1, k_neighbors//2),
                n_jobs=1
            )
            X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)
            
        except Exception as e2:
            print(f"⚠️ BorderlineSMOTE failed: {e2}")
            
            # Last resort: just return the sample without SMOTE
            print("📋 Returning original sample without SMOTE...")
            X_resampled = X_sample.values
            y_resampled = y_sample
    
    print("✅ Resampling completed!")
    print(f"Final shape: {X_resampled.shape}")
    print(f"Final class distribution: {dict(Counter(y_resampled))}")
    
    # Convert to DataFrame
    X_resampled_df = pd.DataFrame(X_resampled, columns=X_sample.columns)
    
    return X_resampled_df, y_resampled, label_encoder

# Alternative: Even more memory-efficient approach using chunking
def chunk_based_sample(df, sample_size=20000, chunk_size=100000, random_state=42):
    """
    Sample from dataframe using chunks to minimize memory usage
    """
    print(f"🔄 Processing dataset in chunks of {chunk_size:,}...")
    
    np.random.seed(random_state)
    samples_collected = 0
    sample_data = []
    
    # Calculate how many samples to take from each chunk
    total_chunks = (len(df) + chunk_size - 1) // chunk_size
    samples_per_chunk = sample_size // total_chunks
    
    print(f"Taking ~{samples_per_chunk} samples from each of {total_chunks} chunks")
    
    for i in range(0, len(df), chunk_size):
        if samples_collected >= sample_size:
            break
            
        print(f"Processing chunk {i//chunk_size + 1}/{total_chunks}...")
        
        # Get chunk
        chunk = df.iloc[i:i+chunk_size]
        
        # Sample from chunk
        remaining_samples = sample_size - samples_collected
        chunk_sample_size = min(samples_per_chunk, remaining_samples, len(chunk))
        
        if chunk_sample_size > 0:
            chunk_indices = np.random.choice(len(chunk), chunk_sample_size, replace=False)
            chunk_sample = chunk.iloc[chunk_indices]
            sample_data.append(chunk_sample)
            samples_collected += len(chunk_sample)
        
        # Clear chunk from memory
        del chunk
        gc.collect()
    
    # Combine all samples
    print("🔗 Combining samples...")
    df_sample = pd.concat(sample_data, ignore_index=True)
    del sample_data
    gc.collect()
    
    print(f"✅ Collected {len(df_sample)} samples")
    return df_sample

# Main execution with multiple fallback strategies
print("🚀 Starting Ultra Memory-Efficient SMOTE...")
print("="*60)

# Try the most efficient approach first
try:
    print("🎯 Attempt 1: Direct sampling approach...")
    X_resampled, y_resampled, label_encoder = ultra_efficient_smote(
        df, 
        sample_size=20000,  # Start smaller
        random_state=42
    )
    
    success = True
    
except MemoryError as e:
    print(f"❌ Memory error in direct approach: {e}")
    success = False
    
    try:
        print("\n🎯 Attempt 2: Chunk-based sampling...")
        df_sample = chunk_based_sample(df, sample_size=15000, chunk_size=50000)
        
        # Apply SMOTE to the chunk-based sample
        print("🔄 Applying SMOTE to chunk-based sample...")
        
        # Encode labels
        label_encoder = LabelEncoder()
        y_sample = label_encoder.fit_transform(df_sample['Label'])
        
        # Prepare features
        X_sample = df_sample.drop(columns=['Label', 'Timestamp'], errors='ignore')
        X_sample = X_sample.select_dtypes(include=[np.number])
        
        # Apply SMOTE
        k_neighbors = min(3, max(1, min(Counter(y_sample).values()) - 1))
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors, n_jobs=1)
        X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)
        
        success = True
        
    except Exception as e2:
        print(f"❌ Chunk-based approach also failed: {e2}")
        success = False

if success:
    print("\n🎉 SUCCESS! SMOTE completed successfully!")
    print(f"Resampled shape: {X_resampled.shape}")
    print(f"Memory usage: {X_resampled.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # Create final balanced dataset
    df_balanced = pd.DataFrame(X_resampled, columns=X_sample.columns)
    df_balanced['Label_Encoded'] = y_resampled
    df_balanced['Label'] = label_encoder.inverse_transform(y_resampled)
    
    print(f"\nFinal balanced dataset:")
    print(f"Shape: {df_balanced.shape}")
    print(f"Class distribution:")
    print(df_balanced['Label'].value_counts())
    
else:
    print("\n❌ All approaches failed due to memory constraints.")
    print("💡 Recommendations:")
    print("1. Use a machine with more RAM (64GB+)")
    print("2. Work with a smaller subset of your data")
    print("3. Use cloud computing with high-memory instances")
    print("4. Consider using algorithms that can handle imbalanced data without SMOTE")

🚀 Starting Ultra Memory-Efficient SMOTE...
🎯 Attempt 1: Direct sampling approach...
🎯 Dataset shape: (8284254, 80)
🔍 Analyzing class distribution...
Class distribution:
  Benign: 6,112,151
  DDOS attack-HOIC: 686,012
  DoS attacks-Hulk: 461,912
  Bot: 286,191
  FTP-BruteForce: 193,360
  SSH-Bruteforce: 187,589
  Infilteration: 161,934
  DoS attacks-SlowHTTPTest: 139,890
  DoS attacks-GoldenEye: 41,508
  DoS attacks-Slowloris: 10,990
  DDOS attack-LOIC-UDP: 1,730
  Brute Force -Web: 611
  Brute Force -XSS: 230
  SQL Injection: 87
  Label: 59

🔤 Creating label encoder...
Label mapping:
  0: Benign
  1: Bot
  2: Brute Force -Web
  3: Brute Force -XSS
  4: DDOS attack-HOIC
  5: DDOS attack-LOIC-UDP
  6: DoS attacks-GoldenEye
  7: DoS attacks-Hulk
  8: DoS attacks-SlowHTTPTest
  9: DoS attacks-Slowloris
  10: FTP-BruteForce
  11: Infilteration
  12: Label
  13: SQL Injection
  14: SSH-Bruteforce

📉 Sampling 20,000 indices...
Samples per class: {'Benign': 14565, 'FTP-BruteForce': 459, 'SSH-B

NameError: name 'X_sample' is not defined

In [18]:


# Get feature column names (exclude Label and Timestamp)
feature_columns = [col for col in df.columns if col not in ['Label', 'Timestamp']]

# Create final balanced dataset
df_balanced = pd.DataFrame(X_resampled, columns=feature_columns)
df_balanced['Label_Encoded'] = y_resampled
df_balanced['Label'] = label_encoder.inverse_transform(y_resampled)

print(f"\n📊 Final Balanced Dataset:")
print(f"Shape: {df_balanced.shape}")
print(f"Columns: {len(df_balanced.columns)} ({len(feature_columns)} features + 2 label columns)")

print(f"\n📈 Class Distribution After SMOTE:")
class_dist = df_balanced['Label'].value_counts()
print(class_dist)

print(f"\n✅ Your balanced dataset is ready!")
print("Available variables:")
print("- X_resampled: Feature matrix (numpy array/DataFrame)")
print("- y_resampled: Target labels (encoded)")
print("- df_balanced: Complete balanced dataframe with original labels")
print("- label_encoder: For encoding/decoding labels")

print(f"\n🔍 Dataset Info:")
print(f"Features shape: {df_balanced[feature_columns].shape}")
print(f"Total samples: {len(df_balanced):,}")
print(f"Number of classes: {len(df_balanced['Label'].unique())}")

# Show first few rows
print(f"\n👀 Sample of balanced data:")
print(df_balanced[['Label', 'Label_Encoded'] + feature_columns[:5]].head())

print(f"\n🎯 Ready for Model Training!")


📊 Final Balanced Dataset:
Shape: (20000, 80)
Columns: 80 (78 features + 2 label columns)

📈 Class Distribution After SMOTE:
Label
Benign                      14795
DDOS attack-HOIC             1599
DoS attacks-Hulk             1134
Bot                           672
SSH-Bruteforce                476
FTP-BruteForce                474
Infilteration                 388
DoS attacks-SlowHTTPTest      330
DoS attacks-GoldenEye         104
DoS attacks-Slowloris          20
DDOS attack-LOIC-UDP            4
Brute Force -Web                2
Brute Force -XSS                2
Name: count, dtype: int64

✅ Your balanced dataset is ready!
Available variables:
- X_resampled: Feature matrix (numpy array/DataFrame)
- y_resampled: Target labels (encoded)
- df_balanced: Complete balanced dataframe with original labels
- label_encoder: For encoding/decoding labels

🔍 Dataset Info:
Features shape: (20000, 78)
Total samples: 20,000
Number of classes: 13

👀 Sample of balanced data:
              Label  Labe