In [7]:
# config
# === CONFIG (edit these if you like) ===
KAGGLE_URL = "https://www.kaggle.com/datasets/rodrigorosasilva/cic-ddos2019-30gb-full-dataset-csv-files"

# how much to read (keeps it light)
PER_FILE_ROWS = 150_000   # rows to read from each CSV (Colab-friendly)
MAX_ROWS_TOTAL = 600_000  # stop once we reach this many rows total

# what we’ll try as the target column (CICDDoS variants differ a bit)
TARGET_CANDIDATES = ["Label", "label", "Attack", "attack", "Class", "class", "Target", "target"]

# output
OUTPUT_CSV = "cleaned_cicddos2019_sample.csv"

In [8]:
# import libraries
!pip -q install opendatasets pandas numpy scikit-learn matplotlib seaborn

import os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od

from sklearn.preprocessing import LabelEncoder, StandardScaler

sns.set_theme()
pd.set_option("display.max_columns", 100)

In [9]:
#Download dataset
DATA_DIR = "/content/data"
od.download(KAGGLE_URL, data_dir=DATA_DIR)

csv_files = glob.glob(f"{DATA_DIR}/**/*.csv", recursive=True)
if not csv_files:
    csv_files = glob.glob(f"{DATA_DIR}/*.csv")

print(f"Found {len(csv_files)} CSV file(s). Showing a few:")
for p in csv_files[:5]:
    print(" •", p)

Skipping, found downloaded files in "/content/data\cic-ddos2019-30gb-full-dataset-csv-files" (use force=True to force download)
Found 13 CSV file(s). Showing a few:
 • /content/data\cic-ddos2019-30gb-full-dataset-csv-files\01-12\DrDoS_DNS.csv
 • /content/data\cic-ddos2019-30gb-full-dataset-csv-files\01-12\DrDoS_LDAP.csv
 • /content/data\cic-ddos2019-30gb-full-dataset-csv-files\01-12\DrDoS_MSSQL.csv
 • /content/data\cic-ddos2019-30gb-full-dataset-csv-files\01-12\DrDoS_NetBIOS.csv
 • /content/data\cic-ddos2019-30gb-full-dataset-csv-files\01-12\DrDoS_NTP.csv


In [10]:
#load a small,simple sample(straightforward concatenation)

frames = []
rows_loaded = 0

for p in csv_files:
    if rows_loaded >= MAX_ROWS_TOTAL:
        break
    # read just a slice from each file to keep memory small
    take = min(PER_FILE_ROWS, MAX_ROWS_TOTAL - rows_loaded)
    try:
        part = pd.read_csv(p, nrows=take, low_memory=False)
        frames.append(part)
        rows_loaded += len(part)
        print(f"Loaded {len(part):,} from {os.path.basename(p)}  (total: {rows_loaded:,})")
    except Exception as e:
        print("Skip (read error):", p, e)

if not frames:
    raise RuntimeError("No CSVs could be read. Check dataset structure or increase PER_FILE_ROWS.")

df = pd.concat(frames, ignore_index=True)
print("Combined shape:", df.shape)
df.head()

Loaded 150,000 from DrDoS_DNS.csv  (total: 150,000)
Loaded 150,000 from DrDoS_LDAP.csv  (total: 300,000)
Loaded 150,000 from DrDoS_MSSQL.csv  (total: 450,000)
Loaded 150,000 from DrDoS_NetBIOS.csv  (total: 600,000)
Combined shape: (600000, 88)


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,425,172.16.0.5-192.168.50.1-634-60495-17,172.16.0.5,634,192.168.50.1,60495,17,2018-12-01 10:51:39.813448,28415,97,0,42680.0,0.0,440.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,1502024.0,3413.689952,295.989583,500.959301,3596.0,1.0,28415.0,295.989583,500.959301,3596.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-97,0,3413.689952,0.0,440.0,440.0,440.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,444.536082,440.0,0.0,-97,0,0,0,0,0,0,97,42680,0,0,-1,-1,96,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
1,430,172.16.0.5-192.168.50.1-60495-634-17,192.168.50.1,634,172.16.0.5,60495,17,2018-12-01 10:51:39.820842,2,2,0,880.0,0.0,440.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,440000000.0,1000000.0,2.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-2,0,1000000.0,0.0,440.0,440.0,440.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,660.0,440.0,0.0,-2,0,0,0,0,0,0,2,880,0,0,-1,-1,1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,DrDoS_DNS
2,1654,172.16.0.5-192.168.50.1-634-46391-17,172.16.0.5,634,192.168.50.1,46391,17,2018-12-01 10:51:39.852499,48549,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,1812602.0,4119.549321,243.964824,578.101371,5418.0,1.0,48549.0,243.964824,578.101371,5418.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-200,0,4119.549321,0.0,440.0,440.0,440.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,442.2,440.0,0.0,-200,0,0,0,0,0,0,200,88000,0,0,-1,-1,199,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
3,2927,172.16.0.5-192.168.50.1-634-11894-17,172.16.0.5,634,192.168.50.1,11894,17,2018-12-01 10:51:39.890213,48337,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,1820552.0,4137.617146,242.899497,485.292695,3337.0,1.0,48337.0,242.899497,485.292695,3337.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-200,0,4137.617146,0.0,440.0,440.0,440.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,442.2,440.0,0.0,-200,0,0,0,0,0,0,200,88000,0,0,-1,-1,199,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
4,694,172.16.0.5-192.168.50.1-634-27878-17,172.16.0.5,634,192.168.50.1,27878,17,2018-12-01 10:51:39.941151,32026,200,0,88000.0,0.0,440.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,2747767.0,6244.925998,160.934673,196.891271,1236.0,0.0,32026.0,160.934673,196.891271,1236.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-200,0,6244.925998,0.0,440.0,440.0,440.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,442.2,440.0,0.0,-200,0,0,0,0,0,0,200,88000,0,0,-1,-1,199,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS


In [11]:
#EDA
print(df.info())
print("\nMissing (top 15):\n", df.isna().sum().sort_values(ascending=False).head(15))

# try to detect target column
target_col = None
for cand in TARGET_CANDIDATES:
    for c in df.columns:
        if c.lower() == cand.lower():
            target_col = c
            break
    if target_col:
        break

print("\nDetected target column:", target_col)

if target_col:
    print("\nClass counts:\n", df[target_col].value_counts(dropna=False))
    sns.countplot(x=df[target_col])
    plt.title("Class distribution")
    plt.xticks(rotation=45)
    plt.show()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 88 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    600000 non-null  int64  
 1   Flow ID                       600000 non-null  object 
 2    Source IP                    600000 non-null  object 
 3    Source Port                  600000 non-null  int64  
 4    Destination IP               600000 non-null  object 
 5    Destination Port             600000 non-null  int64  
 6    Protocol                     600000 non-null  int64  
 7    Timestamp                    600000 non-null  object 
 8    Flow Duration                600000 non-null  int64  
 9    Total Fwd Packets            600000 non-null  int64  
 10   Total Backward Packets       600000 non-null  int64  
 11  Total Length of Fwd Packets   600000 non-null  float64
 12   Total Length of Bwd Packets  600000 non-nul

In [12]:
#basic cleaning
before = len(df)
df = df.drop_duplicates()
print(f"Dropped duplicates: {before - len(df)}")

# numeric → fill NaN with 0; categorical → fill NaN with "unknown"
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in df.columns if c not in num_cols]

df[num_cols] = df[num_cols].fillna(0)
for c in cat_cols:
    df[c] = df[c].astype(str).fillna("unknown").replace({"nan": "unknown"})

print("Remaining missing values:", int(df.isna().sum().sum()))

Dropped duplicates: 0
Remaining missing values: 0


In [13]:
#encode target + encode other categoricals

# target first
if target_col is not None and df[target_col].dtype == object:
    le_y = LabelEncoder()
    df[target_col] = le_y.fit_transform(df[target_col])
    print(f"Encoded target '{target_col}':", dict(zip(le_y.classes_, le_y.transform(le_y.classes_))))
else:
    print("Target already numeric or not found; no target encoding done.")

# other categorical predictors
cat_predictors = [c for c in df.columns if df[c].dtype == object and c != target_col]
for c in cat_predictors:
    df[c] = LabelEncoder().fit_transform(df[c].astype(str))

print(f"Encoded {len(cat_predictors)} categorical predictor columns.")


Target already numeric or not found; no target encoding done.
Encoded 6 categorical predictor columns.


In [15]:
#scale numeric features
scaler = StandardScaler()

# First define X and y
if target_col:
    X = df.drop(columns=[target_col]).copy()
    y = df[target_col].copy()
else:
    X = df.copy()
    y = None

# Replace infinities with NaN
X = X.replace([np.inf, -np.inf], np.nan)

# Drop rows with NaN (or fill them with 0/mean if you prefer)
X = X.dropna()

# Now get numeric columns and scale them
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
X[num_cols] = scaler.fit_transform(X[num_cols])

df_clean = X.copy()
if target_col:
    df_clean[target_col] = y

print("Cleaned shape:", df_clean.shape)
df_clean.head()

Cleaned shape: (581613, 88)


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,-0.81878,-0.022612,-0.084344,-0.599931,0.010236,1.46153,0.069283,-1.737906,-0.018015,0.22367,-0.027334,3.439851,-0.012137,-0.950231,-0.941322,-0.948191,-0.043051,-0.030924,-0.039058,-0.033481,-0.028521,-1.343403,-2.295521,-0.019654,-0.024833,-0.02482,-0.004979,-0.017067,-0.022439,-0.024722,-0.023853,-0.00492,-0.023911,-0.019751,-0.021141,-0.021584,-0.030684,-0.027795,0.0,0.0,0.0,0.062187,0.002622,-2.295272,-0.008046,-0.941295,-0.950482,-0.948911,-0.042613,-0.023269,0.0,-0.008894,-0.027795,0.0,-0.037391,-0.05731,-0.044394,0.0,-0.05716,-1.202411,-0.948191,-0.033481,0.062187,0.0,0.0,0.0,0.0,0.0,0.0,0.22367,3.439851,-0.027334,-0.012137,-0.035548,-0.025175,3.494683,0.275565,-0.011704,-0.004716,-0.008445,-0.011364,-0.024276,-0.008069,-0.024366,-0.023907,-0.014272,0.081031,-1.327339
1,-0.818706,-0.569552,10.814726,-0.599931,-14.299329,1.46153,0.069283,-1.7379,-0.029238,-0.013848,-0.027334,-0.231981,-0.012137,-0.950231,-0.941322,-0.948191,-0.043051,-0.030924,-0.039058,-0.033481,-0.028521,-0.953329,-0.843019,-0.021189,-0.026297,-0.027713,-0.004971,-0.028399,-0.023823,-0.026001,-0.026781,-0.004911,-0.023911,-0.019751,-0.021141,-0.021584,-0.030684,-0.027795,0.0,0.0,0.0,0.062187,0.002622,-0.842878,-0.008046,-0.941295,-0.950482,-0.948911,-0.042613,-0.023269,0.0,-0.008894,-0.027795,0.0,-0.037391,-0.05731,-0.044394,0.0,-0.05716,-0.937525,-0.948191,-0.033481,0.062187,0.0,0.0,0.0,0.0,0.0,0.0,-0.013848,-0.231981,-0.027334,-0.012137,-0.035548,-0.025175,-0.140838,0.275565,-0.011704,-0.004716,-0.008445,-0.011364,-0.024276,-0.008069,-0.024366,-0.023907,-0.014272,-12.340912,-1.327339
2,-0.800518,-0.037917,-0.084344,-0.599931,0.010236,0.719963,0.069283,-1.737894,-0.010061,0.481189,-0.027334,7.42089,-0.012137,-0.950231,-0.941322,-0.948191,-0.043051,-0.030924,-0.039058,-0.033481,-0.028521,-1.343127,-2.294493,-0.019926,-0.024608,-0.023353,-0.004979,-0.009037,-0.022684,-0.024526,-0.022369,-0.00492,-0.023911,-0.019751,-0.021141,-0.021584,-0.030684,-0.027795,0.0,0.0,0.0,0.062187,0.002622,-2.294243,-0.008046,-0.941295,-0.950482,-0.948911,-0.042613,-0.023269,0.0,-0.008894,-0.027795,0.0,-0.037391,-0.05731,-0.044394,0.0,-0.05716,-1.205283,-0.948191,-0.033481,0.062187,0.0,0.0,0.0,0.0,0.0,0.0,0.481189,7.42089,-0.027334,-0.012137,-0.035548,-0.025175,7.436353,0.275565,-0.011704,-0.004716,-0.008445,-0.011364,-0.024276,-0.008069,-0.024366,-0.023907,-0.014272,0.081031,-1.327339
3,-0.781601,-0.075173,-0.084344,-0.599931,0.010236,-1.093838,0.069283,-1.737888,-0.010145,0.481189,-0.027334,7.42089,-0.012137,-0.950231,-0.941322,-0.948191,-0.043051,-0.030924,-0.039058,-0.033481,-0.028521,-1.34312,-2.294466,-0.019931,-0.024879,-0.025028,-0.004979,-0.009122,-0.022689,-0.024762,-0.024064,-0.00492,-0.023911,-0.019751,-0.021141,-0.021584,-0.030684,-0.027795,0.0,0.0,0.0,0.062187,0.002622,-2.294217,-0.008046,-0.941295,-0.950482,-0.948911,-0.042613,-0.023269,0.0,-0.008894,-0.027795,0.0,-0.037391,-0.05731,-0.044394,0.0,-0.05716,-1.205283,-0.948191,-0.033481,0.062187,0.0,0.0,0.0,0.0,0.0,0.0,0.481189,7.42089,-0.027334,-0.012137,-0.035548,-0.025175,7.436353,0.275565,-0.011704,-0.004716,-0.008445,-0.011364,-0.024276,-0.008069,-0.024366,-0.023907,-0.014272,0.081031,-1.327339
4,-0.814783,-0.058125,-0.084344,-0.599931,0.010236,-0.253423,0.069283,-1.737882,-0.016588,0.481189,-0.027334,7.42089,-0.012137,-0.950231,-0.941322,-0.948191,-0.043051,-0.030924,-0.039058,-0.033481,-0.028521,-1.342295,-2.291395,-0.020359,-0.025722,-0.026719,-0.004988,-0.015627,-0.023075,-0.025498,-0.025776,-0.004928,-0.023911,-0.019751,-0.021141,-0.021584,-0.030684,-0.027795,0.0,0.0,0.0,0.062187,0.002622,-2.291146,-0.008046,-0.941295,-0.950482,-0.948911,-0.042613,-0.023269,0.0,-0.008894,-0.027795,0.0,-0.037391,-0.05731,-0.044394,0.0,-0.05716,-1.205283,-0.948191,-0.033481,0.062187,0.0,0.0,0.0,0.0,0.0,0.0,0.481189,7.42089,-0.027334,-0.012137,-0.035548,-0.025175,7.436353,0.275565,-0.011704,-0.004716,-0.008445,-0.011364,-0.024276,-0.008069,-0.024366,-0.023907,-0.014272,0.081031,-1.327339


In [17]:
#save and download the cleaned csv
df_clean.to_csv(OUTPUT_CSV, index=False)
print("Saved:", OUTPUT_CSV)

#from google.colab import files
files.download(OUTPUT_CSV)


KeyboardInterrupt: 

In [None]:
!git config --global user.email "ann.wangari@strathmore.edu"
!git config --global user.name "annkimani-ICS"


In [None]:
!git clone https://github.com/annKimani-ICS/Random-Forest-Based-IDPS.git
%cd Random-Forest-Based-IDPS

Cloning into 'Random-Forest-Based-IDPS'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 12 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (12/12), 4.74 KiB | 4.74 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/Random-Forest-Based-IDPS


In [None]:
!git fetch origin
!git checkout -b feat/s1-data-cleaning origin/feat/s1-data-cleaning

Branch 'feat/s1-data-cleaning' set up to track remote branch 'feat/s1-data-cleaning' from 'origin'.
Switched to a new branch 'feat/s1-data-cleaning'
