In [None]:
import os
import numpy as np
import pandas as pd
import re
import json


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Input CSVs, looked up relative to the current working directory.
DATA_FILES = [
    '02-14-2018.csv',
    '02-16-2018.csv',
    '02-21-2018.csv',
]
OUTPUT_FILE = 'dataset_unified.csv'
# Seed passed to DataFrame.sample when the classes are down-sampled.
RANDOM_STATE = 42
# Upper bound on the number of rows kept for each class.
SAMPLE_SIZE_PER_CLASS = 150000

# Column order of the exported dataset (the 'Label' column is appended last).
FULL_FEATURE_ORDER = [
    "url_length",
    "num_dots",
    "num_slashes",
    "num_hyphens",
    "num_parameters",
    "suspicious_tokens_count",
    "has_ip_address",
    "entropy",
    "tld_length",
    "domain_length",
    "path_length",
    "query_length",
    "is_https",
    # js_* names
    "js_total_functions",
    "js_eval_count",
    "js_settimeout_count",
    "js_setinterval_count",
    "js_function_length_avg",
    # element counts
    "script_count",
    "script_external_count",
    "img_count",
    "iframe_count",
    "anchor_count",
    "form_count",
    "input_count",
    "button_count",
    "css_count",
    # dom_* names and text length
    "dom_total_nodes",
    "dom_mutation_rate",
    "dom_depth",
    "text_length",
    # has_*_keyword names
    "has_login_keyword",
    "has_verify_keyword",
    "has_bank_keyword",
    "has_pay_keyword",
    "has_wallet_keyword",
    # if_* names
    "if_window_open",
    "if_fetch_intercept",
    "if_cookie_access",
    "if_localstorage_access",
    "if_clipboard_access",
]
# Same list wrapped in the {"feature_order": [...]} layout.
feature_index_content = {"feature_order": FULL_FEATURE_ORDER}

# Columns produced by the synthetic generation step, in array-column order.
SYNTHETIC_WEB_FEATURES = [
    'url_length',
    'query_length',
    'entropy_query',
    'has_single_quote',
    'has_script_tag',
    'content_length',
    'is_post_method',
    'num_dots',
]

print("Starting data processing with corrected script...")

# Read every configured CSV that exists; a missing file is reported and
# skipped so the run can continue with whatever was found.
loaded_frames = []
for csv_name in DATA_FILES:
    if not os.path.exists(csv_name):
        print(f"Error: CICIDS data file {csv_name} not found. Ensure all 3 files are in the working directory.")
        continue
    print(f"Loading {csv_name}...")
    loaded_frames.append(pd.read_csv(csv_name, low_memory=False))

# Nothing at all could be read: stop here.
if len(loaded_frames) == 0:
    raise SystemExit('No CICIDS data files loaded. Please check file paths.')

# Stack the per-file frames into one and release the list of parts.
df = pd.concat(loaded_frames, ignore_index=True)
del loaded_frames
print(f"Total rows before cleaning: {len(df)}")


def clean_column_name(col):
    """Return *col* trimmed, with whitespace runs collapsed to one space, lowercased."""
    return re.sub(r'\s+', ' ', col.strip()).lower()

# Normalise header text so later lookups can rely on lowercase,
# single-spaced column names.
df.columns = [clean_column_name(col) for col in df.columns]

# Everything below needs the class column; stop with a clear message rather
# than a KeyError later on.
if 'label' not in df.columns:
    raise SystemExit("Fatal Error: no 'label' column found after normalising column names.")

df.drop_duplicates(inplace=True)

# Remove rows without a usable class label BEFORE the blanket fillna(0)
# below. Otherwise a missing label becomes 0 and is counted as benign, and a
# row whose label cell merely repeats the header text ("Label") is counted
# as an attack.
label_text = df['label'].astype(str).str.strip().str.lower()
unusable_label = df['label'].isna() | (label_text == 'label')
if unusable_label.any():
    print(f"Dropping {int(unusable_label.sum())} rows with a missing or header-text label.")
    df.drop(index=df.index[unusable_label.values], inplace=True)
del label_text, unusable_label

# Force every remaining text column (except the label) to numeric; values
# that cannot be parsed become NaN.
COLS_TO_CONVERT = [col for col in df.columns if col != 'label' and df[col].dtype == 'object']
for col in COLS_TO_CONVERT:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Treat +/-inf like missing data, then replace all missing values with 0.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

df.fillna(0, inplace=True)

def unify_label(x):
    """Map a raw label to a binary class: 0 for benign, 1 for anything else.

    A value counts as benign when its lowercased text contains 'benign' or,
    after trimming whitespace, equals '0'.
    """
    text = str(x).lower()
    is_benign = text.strip() == '0' or 'benign' in text
    return 0 if is_benign else 1
# Collapse the raw label into a binary 'Label' target and discard the original.
df['Label'] = df['label'].apply(unify_label)
df.drop(columns=['label'], inplace=True, errors='ignore')

print(f"Total rows after cleaning and NaN fill: {len(df)}. Applying aggressive sampling...")

# Split by class, then release the full frame before sampling.
benign_rows = df.loc[df['Label'] == 0].copy()
attack_rows = df.loc[df['Label'] == 1].copy()
del df

# Cap each class at SAMPLE_SIZE_PER_CLASS rows; smaller classes are kept whole.
class_frames = []
for class_rows in (benign_rows, attack_rows):
    if len(class_rows) > SAMPLE_SIZE_PER_CLASS:
        class_rows = class_rows.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=RANDOM_STATE)
    class_frames.append(class_rows)
del benign_rows, attack_rows, class_rows

# Benign rows first, attack rows second, with a fresh 0..n-1 index.
df = pd.concat(class_frames, ignore_index=True)
del class_frames

print(f"Total rows after aggressive sampling: {len(df)}")
if df.empty:
    raise SystemExit("Fatal Error: DataFrame is empty after sampling. Check data quality.")

# Dedicated generator seeded with RANDOM_STATE so the synthetic columns are
# reproducible from run to run (the global np.random state was never seeded).
rng = np.random.default_rng(RANDOM_STATE)

is_attack = df['Label'].values == 1
is_benign = df['Label'].values == 0
N_attack = int(is_attack.sum())
N_benign = len(df) - N_attack

# One array column per entry of SYNTHETIC_WEB_FEATURES; look positions up by
# name instead of hard-coding indices so the list order can change safely.
synthetic_data = np.zeros((len(df), len(SYNTHETIC_WEB_FEATURES)))
synth_col = {name: idx for idx, name in enumerate(SYNTHETIC_WEB_FEATURES)}

print(f"Generating synthetic web features for {N_attack} attacks and {N_benign} benign flows...")

# content_length: mean forward payload per forward packet, clipped to [0, 1000].
content_signal_col = 'totlen fwd pkts'
if content_signal_col in df.columns and 'tot fwd pkts' in df.columns:
    fwd_pkts_safe = df['tot fwd pkts'].values.copy()
    fwd_pkts_safe[fwd_pkts_safe == 0] = 1  # avoid division by zero
    content_signal = df[content_signal_col].values / fwd_pkts_safe
    synthetic_data[:, synth_col['content_length']] = np.clip(content_signal, 0, 1000)
else:
    # Source columns unavailable: fall back to random integers in [50, 500).
    synthetic_data[:, synth_col['content_length']] = rng.integers(50, 500, len(df))

# is_post_method: 1 when forward packets exceed 1.5x backward packets;
# attack rows are then overwritten with a fair coin flip.
post_signal_col_1 = 'tot fwd pkts'
post_signal_col_2 = 'tot bwd pkts'
if post_signal_col_1 in df.columns and post_signal_col_2 in df.columns:
    post_signal = (df[post_signal_col_1].values > df[post_signal_col_2].values * 1.5)
    synthetic_data[:, synth_col['is_post_method']] = np.where(post_signal, 1, 0)
    synthetic_data[is_attack, synth_col['is_post_method']] = rng.choice([1, 0], N_attack, p=[0.5, 0.5])
else:
    synthetic_data[:, synth_col['is_post_method']] = rng.choice([1, 0], len(df), p=[0.2, 0.8])

# NOTE(review): the columns below are drawn directly from the label. The
# query_length and entropy_query ranges do not overlap between classes, so
# either column alone separates the labels perfectly.
synthetic_data[is_benign, synth_col['url_length']] = rng.integers(40, 100, N_benign)
synthetic_data[is_attack, synth_col['url_length']] = rng.integers(90, 250, N_attack)
synthetic_data[is_benign, synth_col['query_length']] = rng.integers(0, 5, N_benign)
synthetic_data[is_attack, synth_col['query_length']] = rng.integers(15, 60, N_attack)
synthetic_data[is_benign, synth_col['entropy_query']] = rng.uniform(1.0, 3.0, N_benign)
synthetic_data[is_attack, synth_col['entropy_query']] = rng.uniform(3.5, 5.5, N_attack)
synthetic_data[is_attack, synth_col['has_single_quote']] = rng.choice([1, 0], N_attack, p=[0.6, 0.4])
synthetic_data[is_benign, synth_col['has_single_quote']] = 0
synthetic_data[is_attack, synth_col['has_script_tag']] = rng.choice([1, 0], N_attack, p=[0.4, 0.6])
synthetic_data[is_benign, synth_col['has_script_tag']] = 0
# num_dots is label-independent: a random integer in [1, 4) for every row.
synthetic_data[:, synth_col['num_dots']] = rng.integers(1, 4, len(df))


# Attach the synthetic columns, replacing any same-named columns already in df.
df_synth_features = pd.DataFrame(synthetic_data, columns=SYNTHETIC_WEB_FEATURES, index=df.index)
df_core = df.drop(columns=SYNTHETIC_WEB_FEATURES, errors='ignore')
df_unified = pd.concat([df_core, df_synth_features], axis=1)

# The export keeps only FULL_FEATURE_ORDER (+ Label). Report what that
# selection silently throws away or pads, so the output is not mistaken for
# a fully populated feature set.
discarded_synthetic = [f for f in SYNTHETIC_WEB_FEATURES if f not in FULL_FEATURE_ORDER]
if discarded_synthetic:
    print(f"Warning: synthetic features not in FULL_FEATURE_ORDER are not exported: {discarded_synthetic}")

missing_features = [f for f in FULL_FEATURE_ORDER if f not in df_unified.columns]
if missing_features:
    print(f"Warning: {len(missing_features)} of {len(FULL_FEATURE_ORDER)} features have no source data and are exported as constant 0.")
    # Add all zero columns in a single concat instead of one insert per column.
    zero_fill = pd.DataFrame(0, index=df_unified.index, columns=missing_features)
    df_unified = pd.concat([df_unified, zero_fill], axis=1)

final_columns_order = FULL_FEATURE_ORDER + ['Label']
df_final = df_unified[final_columns_order].copy()

df_final.to_csv(OUTPUT_FILE, index=False)
print(f"\nSuccessfully created {OUTPUT_FILE} with {len(df_final)} rows.")
# Derive the feature count from the schema rather than hard-coding it.
print(f"Final features exported: {df_final.shape[1]} columns ({len(FULL_FEATURE_ORDER)} features + Label)")

Starting data processing with corrected script...
Loading 02-14-2018.csv...
Loading 02-16-2018.csv...
Loading 02-21-2018.csv...
Total rows before cleaning: 1647602
Total rows after cleaning and NaN fill: 1273816. Applying aggressive sampling...
Total rows after aggressive sampling: 300000
Generating synthetic web features for 150000 attacks and 150000 benign flows...

Successfully created dataset_unified.csv with 300000 rows.
Final features exported: 42 columns (41 features + Label)
