In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
# Folder that holds the raw CSV exports; "." is the notebook's working directory.
DATA_DIR = "."

# Short day key -> raw CSV file name (joined with DATA_DIR when building paths).
FILES = dict(
    monday="Monday-WorkingHours.pcap_ISCX.csv",
    tuesday="Tuesday-WorkingHours.pcap_ISCX.csv",
    friday="Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
)

def load_csv(path: str) -> pd.DataFrame:
    """Read one CSV file into a DataFrame.

    Args:
        path: Location of the CSV file (relative or absolute).

    Returns:
        The parsed DataFrame, read with ``low_memory=True``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``; the message
            contains the absolute form of the path to ease debugging.
    """
    if os.path.exists(path):
        return pd.read_csv(path, low_memory=True)
    raise FileNotFoundError(f"File not found: {os.path.abspath(path)}")

# Resolve each day's file name against DATA_DIR.
paths = {day: os.path.join(DATA_DIR, filename) for day, filename in FILES.items()}

# Load the three raw captures in Monday -> Tuesday -> Friday order.
df_mon_raw, df_tue_raw, df_fri_raw = (
    load_csv(paths[day]) for day in ("monday", "tuesday", "friday")
)

# Quick shape check: (rows, columns) per day.
print(df_mon_raw.shape, df_tue_raw.shape, df_fri_raw.shape)


(529918, 79) (445909, 79) (225745, 79)


In [3]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels have surrounding whitespace removed.

    Args:
        df: Frame whose column labels are strings.

    Returns:
        A new DataFrame with stripped column names; ``df`` itself is not modified.
    """
    stripped_labels = [label.strip() for label in df.columns]
    renamed = df.copy()
    renamed.columns = stripped_labels
    return renamed

# Strip header whitespace on every day; the *_raw frames are left untouched.
df_mon, df_tue, df_fri = (
    clean_column_names(raw) for raw in (df_mon_raw, df_tue_raw, df_fri_raw)
)

print("Example columns (after strip):")
print(df_mon.columns[:10])


Example columns (after strip):
Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std'],
      dtype='object')


In [4]:
# All three days must expose identical column names in identical order.
assert all(
    list(other.columns) == list(df_mon.columns) for other in (df_tue, df_fri)
), "Columns do not match across files."

# The target column must be present once the headers have been stripped.
assert "Label" in df_mon.columns, "Label column not found after cleaning column names."

print("Columns consistent across all three:", len(df_mon.columns))
print("Label column found ✅")


Columns consistent across all three: 79
Label column found ✅


In [5]:
def replace_inf_and_coerce_numeric(df: pd.DataFrame, label_col: str = "Label") -> pd.DataFrame:
    """Coerce every non-label column to numeric and turn +/-inf into NaN.

    The coercion runs first and the infinity replacement second. Text values
    such as "inf" in an object column are not matched by a replace on
    ``np.inf``, but ``pd.to_numeric`` parses them into real infinities, so
    replacing before coercing would let those infinities survive.

    Args:
        df: Input frame; it is not modified.
        label_col: Column to leave untouched (neither coerced nor scanned for inf).

    Returns:
        A new DataFrame in which every column other than ``label_col`` is
        numeric, unparseable values are NaN, and +/-inf are NaN.
    """
    df = df.copy()

    feature_cols = [c for c in df.columns if c != label_col]
    for c in feature_cols:
        # Already-numeric columns pass through; object columns are parsed and
        # anything unparseable becomes NaN.
        numeric = pd.to_numeric(df[c], errors="coerce")
        # Replace AFTER coercion so infinities created by parsing text are caught too.
        df[c] = numeric.replace([np.inf, -np.inf], np.nan)

    return df

# Run the numeric coercion / inf handling on each day, in Monday -> Tuesday -> Friday order.
df_mon, df_tue, df_fri = map(replace_inf_and_coerce_numeric, (df_mon, df_tue, df_fri))

print("Done: inf handled and numeric coercion attempted.")


Done: inf handled and numeric coercion attempted.


In [6]:
def missing_report(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column.

    Args:
        df: Frame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with two columns:
        ``missing_count`` (number of NaN cells) and ``missing_pct`` (share of
        NaN cells in percent, rounded to 3 decimals), sorted by
        ``missing_pct`` in descending order.
    """
    is_missing = df.isna()  # computed once and reused for both statistics
    counts = is_missing.sum()
    percentages = (is_missing.mean() * 100).round(3)
    report = pd.DataFrame({"missing_count": counts, "missing_pct": percentages})
    return report.sort_values("missing_pct", ascending=False)

# One missing-value report per day.
miss_mon, miss_tue, miss_fri = map(missing_report, (df_mon, df_tue, df_fri))

print("Top missing columns (Monday):")
display(miss_mon.head(15))


Top missing columns (Monday):


Unnamed: 0,missing_count,missing_pct
Flow Packets/s,437,0.082
Flow Bytes/s,437,0.082
Destination Port,0,0.0
Average Packet Size,0,0.0
Fwd Avg Bulk Rate,0,0.0
Fwd Avg Packets/Bulk,0,0.0
Fwd Avg Bytes/Bulk,0,0.0
Fwd Header Length.1,0,0.0
Avg Bwd Segment Size,0,0.0
Avg Fwd Segment Size,0,0.0


In [7]:
def drop_all_nan_and_constant(df: pd.DataFrame, label_col: str = "Label"):
    """Remove feature columns that carry no information.

    A feature column is removed when it is entirely NaN or when it holds at
    most one distinct non-NaN value. ``label_col`` is never removed.

    Args:
        df: Input frame; it is not modified.
        label_col: Column excluded from the check.

    Returns:
        A tuple ``(cleaned, drop_cols)``: a new DataFrame without the removed
        columns, and the sorted list of removed column names.
    """
    uninformative = set()
    for name in df.columns:
        if name == label_col:
            continue
        column = df[name]
        # Entirely missing, or at most one distinct value once NaNs are ignored.
        if column.isna().all() or column.nunique(dropna=True) <= 1:
            uninformative.add(name)

    drop_cols = sorted(uninformative)
    # Copy after dropping so the result never shares data with the input.
    cleaned = df.drop(columns=drop_cols).copy()

    return cleaned, drop_cols

# The drop list is derived from Monday only, then the same columns are removed
# from Tuesday and Friday so all three frames keep an identical schema.
df_mon, dropped_cols = drop_all_nan_and_constant(df_mon)
df_tue, df_fri = (other.drop(columns=dropped_cols) for other in (df_tue, df_fri))

print("Dropped columns count:", len(dropped_cols))
print("Dropped columns (first 20):", dropped_cols[:20])
print("New column count:", len(df_mon.columns))


Dropped columns count: 10
Dropped columns (first 20): ['Bwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd PSH Flags', 'Bwd URG Flags', 'CWE Flag Count', 'Fwd Avg Bulk Rate', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd URG Flags']
New column count: 69


In [8]:
def fill_nans_with_median(df: pd.DataFrame, label_col: str = "Label") -> pd.DataFrame:
    """Impute missing feature values with each column's own median.

    Medians are computed from ``df`` itself (numeric columns only). Any NaN
    that remains afterwards, e.g. in a column with no median, is set to 0.

    Args:
        df: Input frame; it is not modified.
        label_col: Column excluded from imputation.

    Returns:
        A new DataFrame whose non-label columns contain no NaN.
    """
    filled = df.copy()
    features = [name for name in filled.columns if name != label_col]

    feature_block = filled[features]
    column_medians = feature_block.median(numeric_only=True)

    # Median first, then 0 as a fallback for anything still missing.
    filled[features] = feature_block.fillna(column_medians).fillna(0)

    return filled

# Each day is imputed with medians computed from that same day.
df_mon, df_tue, df_fri = map(fill_nans_with_median, (df_mon, df_tue, df_fri))

# Total NaN cells left per day, over all columns.
print("NaNs after filling (Monday):", int(df_mon.isna().to_numpy().sum()))
print("NaNs after filling (Tuesday):", int(df_tue.isna().to_numpy().sum()))
print("NaNs after filling (Friday):", int(df_fri.isna().to_numpy().sum()))


NaNs after filling (Monday): 0
NaNs after filling (Tuesday): 0
NaNs after filling (Friday): 0


In [9]:
def clean_labels(df: pd.DataFrame, label_col: str = "Label") -> pd.DataFrame:
    """Normalise the label column and add a binary ``is_attack`` flag.

    Labels are cast to string and stripped of surrounding whitespace.
    ``is_attack`` is 0 when the stripped label equals "BENIGN"
    (case-insensitive) and 1 for every other value.

    Args:
        df: Input frame; it is not modified.
        label_col: Name of the column holding the class label.

    Returns:
        A new DataFrame with the cleaned ``label_col`` and an added integer
        ``is_attack`` column.
    """
    labelled = df.copy()
    normalized = labelled[label_col].astype(str).str.strip()
    labelled[label_col] = normalized
    # Anything that is not BENIGN (ignoring case) counts as an attack.
    labelled["is_attack"] = normalized.str.upper().ne("BENIGN").astype(int)
    return labelled

# Normalise labels and attach the is_attack flag on every day.
df_mon, df_tue, df_fri = map(clean_labels, (df_mon, df_tue, df_fri))

# Share of rows flagged as attack, in percent.
print("Monday is_attack %:", round(df_mon["is_attack"].mean() * 100, 3))
print("Tuesday is_attack %:", round(df_tue["is_attack"].mean() * 100, 3))
print("Friday is_attack %:", round(df_fri["is_attack"].mean() * 100, 3))


Monday is_attack %: 0.0
Tuesday is_attack %: 3.103
Friday is_attack %: 56.713


In [10]:
def sanity_check(df: pd.DataFrame, label_col: str = "Label"):
    """Count NaN and infinite cells among the feature columns.

    Feature columns are all columns except ``label_col`` and ``"is_attack"``.

    Args:
        df: Frame to inspect.
        label_col: Name of the label column to exclude.

    Returns:
        A tuple ``(total_nans, total_infs, n_features)``.
    """
    excluded = (label_col, "is_attack")
    feature_cols = [name for name in df.columns if name not in excluded]
    features = df[feature_cols]

    total_nans = int(features.isna().to_numpy().sum())
    total_infs = int(np.isinf(features.to_numpy()).sum())

    return total_nans, total_infs, len(feature_cols)

# Report feature count and leftover NaN / inf cells for each day.
for name, d in zip(("Monday", "Tuesday", "Friday"), (df_mon, df_tue, df_fri)):
    nans, infs, nfeat = sanity_check(d)
    print(name, "=> features:", nfeat, "| NaNs:", nans, "| Infs:", infs)


Monday => features: 68 | NaNs: 0 | Infs: 0
Tuesday => features: 68 | NaNs: 0 | Infs: 0
Friday => features: 68 | NaNs: 0 | Infs: 0


In [11]:
# Output folder for the cleaned datasets; created on first run.
PROCESSED_DIR = os.path.join(".", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# One output path per day, all inside PROCESSED_DIR.
out_mon, out_tue, out_fri = (
    os.path.join(PROCESSED_DIR, filename)
    for filename in ("monday_clean.csv", "tuesday_clean.csv", "friday_clean.csv")
)

# Write without the index so the files contain only the data columns.
df_mon.to_csv(out_mon, index=False)
df_tue.to_csv(out_tue, index=False)
df_fri.to_csv(out_fri, index=False)

print("Saved:", out_mon, out_tue, out_fri, sep="\n")


Saved:
.\processed\monday_clean.csv
.\processed\tuesday_clean.csv
.\processed\friday_clean.csv


In [12]:
# One row per day: size of the cleaned frame and share of attack rows in percent.
summary = pd.DataFrame([
    {
        "dataset": day,
        "rows": len(frame),
        "cols": frame.shape[1],
        "attack_pct": round(frame["is_attack"].mean() * 100, 3),
    }
    for day, frame in (("monday", df_mon), ("tuesday", df_tue), ("friday", df_fri))
])
summary


Unnamed: 0,dataset,rows,cols,attack_pct
0,monday,529918,70,0.0
1,tuesday,445909,70,3.103
2,friday,225745,70,56.713
