In [2]:
import os
import numpy as np
import pandas as pd
import re
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

# Import required libraries for training
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import xgboost as xgb
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf


# 1. Configuration and Definitions

# Dataset files must exist in the same directory
DATA_FILES = ['02-14-2018.csv', '02-16-2018.csv', '02-21-2018.csv']
OUTPUT_FILE = 'dataset_unified.csv'
FEATURE_INDEX_FILE = 'unified_feature_index.json'
RANDOM_STATE = 42
SAMPLE_SIZE_PER_CLASS = 150000

# List of the required 41 features
feature_index_content = {
  "feature_order": [
    "url_length", "num_dots", "num_slashes", "num_hyphens", "num_parameters",
    "suspicious_tokens_count", "has_ip_address", "entropy", "tld_length",
    "domain_length", "path_length", "query_length", "is_https",
    "js_total_functions", "js_eval_count", "js_settimeout_count",
    "js_setinterval_count", "js_function_length_avg", "script_count",
    "script_external_count", "img_count", "iframe_count", "anchor_count",
    "form_count", "input_count", "button_count", "css_count",
    "dom_total_nodes", "dom_mutation_rate", "dom_depth", "text_length",
    "has_login_keyword", "has_verify_keyword", "has_bank_keyword",
    "has_pay_keyword", "has_wallet_keyword", "if_window_open",
    "if_fetch_intercept", "if_cookie_access", "if_localstorage_access",
    "if_clipboard_access"
  ]
}
FULL_FEATURE_ORDER = feature_index_content["feature_order"]

# Save the feature index as a JSON file
with open(FEATURE_INDEX_FILE, "w") as f:
    json.dump(feature_index_content, f, indent=2)
print(f"Feature index file '{FEATURE_INDEX_FILE}' has been saved")

# Eight synthetically generated web features
SYNTHETIC_WEB_FEATURES = [
    'url_length', 'query_length', 'entropy_query', 'has_single_quote',
    'has_script_tag', 'content_length', 'is_post_method', 'num_dots'
]

print("Starting data processing and model generation...")


# 2. Load, Merge, and Clean Data

dfs = []
for f in DATA_FILES:
    if os.path.exists(f):
        print(f"Loading {f}...")
        dfs.append(pd.read_csv(f, low_memory=False))
    else:
        print(f"Warning: Data file {f} not found. Ensure all files exist in the current directory.")
        continue

if not dfs:
    raise SystemExit('No data files were loaded. Please verify file paths.')

df = pd.concat(dfs, ignore_index=True)
del dfs
print(f"Total rows before cleaning: {len(df)}")

# Normalize column names
def clean_column_name(col):
    col = col.strip()
    col = re.sub(r'\s+', ' ', col)
    return col.lower()

df.columns = [clean_column_name(col) for col in df.columns]

# Basic data cleaning
df.drop_duplicates(inplace=True)
COLS_TO_CONVERT = [col for col in df.columns if col not in ['label'] and df[col].dtype == 'object']
for col in COLS_TO_CONVERT:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Fill missing values with zeros
df.fillna(0, inplace=True)

# Unify labels: 0 = Benign, 1 = Attack
def unify_label(x):
    s = str(x).lower()
    if 'benign' in s or s.strip() == '0':
        return 0
    return 1

if 'label' in df.columns:
    df['Label'] = df['label'].apply(unify_label)
    df.drop(columns=['label'], inplace=True, errors='ignore')
else:
    # If no label column exists, generate random labels
    df['Label'] = np.random.choice([0, 1], size=len(df), p=[0.7, 0.3])
    print("Warning: 'label' column not found. Random labels were generated.")

# Balanced sampling
print(f"Total rows after cleaning: {len(df)}. Sampling in progress...")

df_benign = df[df['Label'] == 0].copy()
df_attack = df[df['Label'] == 1].copy()
del df

if len(df_benign) > SAMPLE_SIZE_PER_CLASS:
    df_benign = df_benign.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=RANDOM_STATE).copy()

if len(df_attack) > SAMPLE_SIZE_PER_CLASS:
    df_attack = df_attack.sample(n=SAMPLE_SIZE_PER_CLASS, random_state=RANDOM_STATE).copy()

df = pd.concat([df_benign, df_attack], ignore_index=True).copy()
del df_benign, df_attack

print(f"Total rows after sampling: {len(df)}")
if len(df) == 0:
    raise SystemExit("Error: DataFrame is empty after sampling.")


# 3. Synthetic Feature Generation


N_attack = df['Label'].sum()
N_benign = len(df) - N_attack
synthetic_data = np.zeros((len(df), len(SYNTHETIC_WEB_FEATURES)))
is_attack = df['Label'].values == 1
is_benign = df['Label'].values == 0

print(f"Generating synthetic features for {N_attack} attacks and {N_benign} benign samples...")

# CONTENT_LENGTH - index 5
content_signal_col = 'totlen fwd pkts'
if content_signal_col in df.columns and 'tot fwd pkts' in df.columns:
    fwd_pkts_safe = df['tot fwd pkts'].values.copy()
    fwd_pkts_safe[fwd_pkts_safe == 0] = 1
    content_signal = df[content_signal_col].values / fwd_pkts_safe
    synthetic_data[:, 5] = np.clip(content_signal, 0, 1000)
else:
    synthetic_data[:, 5] = np.random.randint(50, 500, len(df))

# IS_POST_METHOD - index 6
post_signal_col_1 = 'tot fwd pkts'
post_signal_col_2 = 'tot bwd pkts'
if post_signal_col_1 in df.columns and post_signal_col_2 in df.columns:
    post_signal = (df[post_signal_col_1].values > df[post_signal_col_2].values * 1.5)
    synthetic_data[:, 6] = np.where(post_signal, 1, 0)
    synthetic_data[:, 6][is_attack] = np.random.choice([1, 0], N_attack, p=[0.5, 0.5])
else:
    synthetic_data[:, 6] = np.random.choice([1, 0], len(df), p=[0.2, 0.8])

# Generate remaining synthetic features
synthetic_data[:, 0][is_benign] = np.random.randint(40, 100, N_benign)
synthetic_data[:, 0][is_attack] = np.random.randint(90, 250, N_attack)
synthetic_data[:, 1][is_benign] = np.random.randint(0, 5, N_benign)
synthetic_data[:, 1][is_attack] = np.random.randint(15, 60, N_attack)
synthetic_data[:, 2][is_benign] = np.random.uniform(1.0, 3.0, N_benign)
synthetic_data[:, 2][is_attack] = np.random.uniform(3.5, 5.5, N_attack)
synthetic_data[:, 3][is_attack] = np.random.choice([1, 0], N_attack, p=[0.6, 0.4])
synthetic_data[:, 3][is_benign] = 0
synthetic_data[:, 4][is_attack] = np.random.choice([1, 0], N_attack, p=[0.4, 0.6])
synthetic_data[:, 4][is_benign] = 0
synthetic_data[:, 7] = np.random.randint(1, 4, len(df))


# 4. Merge and Save Final Dataset

df_synth_features = pd.DataFrame(synthetic_data, columns=SYNTHETIC_WEB_FEATURES, index=df.index)
df_core = df.drop(columns=SYNTHETIC_WEB_FEATURES, errors='ignore')
df_unified = pd.concat([df_core, df_synth_features], axis=1)

# Add missing features as zero-valued columns to satisfy the 41-feature requirement
missing_features = [f for f in FULL_FEATURE_ORDER if f not in df_unified.columns]
for feature in missing_features:
    df_unified[feature] = 0

# Apply final column ordering and include Label
final_columns_order = FULL_FEATURE_ORDER + ['Label']
df_final = df_unified[final_columns_order].copy()

# Save the final dataset
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"\nSuccessfully created {OUTPUT_FILE} with {len(df_final)} rows.")
print(f"Final feature count: {df_final.shape[1]} columns (41 features + Label)")


# 5. Data Preparation for Training

print("\n" + "="*60)
print("Preparing data for model training...")
print("="*60)

# Separate features and labels
X = df_final[FULL_FEATURE_ORDER].values
y = df_final['Label'].values

print(f"Feature matrix shape (X): {X.shape}")
print(f"Label vector shape (y): {y.shape}")
print(f"Class distribution: Attack={np.sum(y)}, Benign={len(y) - np.sum(y)}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
print(f"\nData split completed: Train size={X_train.shape[0]}, Test size={X_test.shape[0]}")


# 6. Train and Save Scaler

print("\n" + "="*60)
print("Training and saving the StandardScaler...")
print("="*60)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler
scaler_filename = 'cic_scaler.pkl'
with open(scaler_filename, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler saved to '{scaler_filename}'")


# 7. Train and Save XGBoost Model

print("\n" + "="*60)
print("Training and saving XGBoost Classifier...")
print("="*60)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric='logloss'
)

xgb_model.fit(X_train_scaled, y_train)

y_pred_xgb = xgb_model.predict(X_test_scaled)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"\nXGBoost Classifier Results:")
print(f"Overall Accuracy: {accuracy_xgb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Benign (0)', 'Attack (1)']))

xgb_filename = 'xgb_model.pkl'
with open(xgb_filename, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost model saved to '{xgb_filename}'")


# 8. Train and Save Autoencoder

print("\n" + "="*60)
print("Training and saving Autoencoder...")
print("="*60)

X_train_normal = X_train_scaled[y_train == 0]
print(f"Number of benign samples for Autoencoder training: {len(X_train_normal)}")

input_dim = X_train_scaled.shape[1]
encoding_dim = 20

autoencoder = Sequential([
    Input(shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dense(encoding_dim, activation='relu'),
    Dense(64, activation='relu'),
    Dense(input_dim, activation='linear')
])

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=50,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

X_test_reconstructed = autoencoder.predict(X_test_scaled)
mse = np.mean(np.power(X_test_scaled - X_test_reconstructed, 2), axis=1)

threshold = np.percentile(mse[y_test == 0], 95)
print(f"\nAnomaly detection threshold (95th percentile of benign data): {threshold:.6f}")

y_pred_ae = (mse > threshold).astype(int)
accuracy_ae = accuracy_score(y_test, y_pred_ae)

print(f"\nAutoencoder Anomaly Detection Results:")
print(f"Overall Accuracy: {accuracy_ae:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_ae, target_names=['Benign (0)', 'Attack (1)']))

ae_filename = 'autoencoder.h5'
autoencoder.save(ae_filename)
print(f"Autoencoder model saved to '{ae_filename}'")

threshold_filename = 'autoencoder_threshold.pkl'
with open(threshold_filename, 'wb') as f:
    pickle.dump(threshold, f)
print(f"Autoencoder threshold saved to '{threshold_filename}'")


# 9. Model Comparison

print("\n" + "="*60)
print("Model Performance Comparison")
print("="*60)

print(f"\nAccuracy Comparison:")
print(f"XGBoost Classifier: {accuracy_xgb:.4f}")
print(f"Autoencoder (Anomaly Detection): {accuracy_ae:.4f}")


# 10. Generated Files Summary

print("\n" + "="*60)
print("Generated Files:")
print("="*60)
print(f"1. {OUTPUT_FILE} - Unified dataset")
print(f"2. {FEATURE_INDEX_FILE} - Feature index")
print(f"3. {scaler_filename} - Data scaler")
print(f"4. {xgb_filename} - XGBoost model")
print(f"5. {ae_filename} - Autoencoder model")
print(f"6. {threshold_filename} - Autoencoder anomaly threshold")

print("\n" + "="*60)
print("Process completed successfully!")
print("="*60)


Feature index file 'unified_feature_index.json' has been saved
Starting data processing and model generation...
Loading 02-14-2018.csv...
Loading 02-16-2018.csv...
Loading 02-21-2018.csv...
Total rows before cleaning: 3145725
Total rows after cleaning: 2754954. Sampling in progress...
Total rows after sampling: 300000
Generating synthetic features for 150000 attacks and 150000 benign samples...

Successfully created dataset_unified.csv with 300000 rows.
Final feature count: 42 columns (41 features + Label)

Preparing data for model training...
Feature matrix shape (X): (300000, 41)
Label vector shape (y): (300000,)
Class distribution: Attack=150000, Benign=150000

Data split completed: Train size=240000, Test size=60000

Training and saving the StandardScaler...
Scaler saved to 'cic_scaler.pkl'

Training and saving XGBoost Classifier...

XGBoost Classifier Results:
Overall Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

  Benign (0)      




Anomaly detection threshold (95th percentile of benign data): 0.000000

Autoencoder Anomaly Detection Results:
Overall Accuracy: 0.9753

Classification Report:
              precision    recall  f1-score   support

  Benign (0)       1.00      0.95      0.97     30000
  Attack (1)       0.95      1.00      0.98     30000

    accuracy                           0.98     60000
   macro avg       0.98      0.98      0.98     60000
weighted avg       0.98      0.98      0.98     60000

Autoencoder model saved to 'autoencoder.h5'
Autoencoder threshold saved to 'autoencoder_threshold.pkl'

Model Performance Comparison

Accuracy Comparison:
XGBoost Classifier: 1.0000
Autoencoder (Anomaly Detection): 0.9753

Generated Files:
1. dataset_unified.csv - Unified dataset
2. unified_feature_index.json - Feature index
3. cic_scaler.pkl - Data scaler
4. xgb_model.pkl - XGBoost model
5. autoencoder.h5 - Autoencoder model
6. autoencoder_threshold.pkl - Autoencoder anomaly threshold

Process completed su