In [None]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, accuracy_score,
    recall_score, precision_score, log_loss, roc_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

# --- CONFIGURATION ---
# Seed shared by the per-file row sampling, the train/test split and LightGBM.
RANDOM_STATE = 42
# Directory that is searched (non-recursively) for "*.csv" files in Step 1.
# NOTE(review): machine-specific absolute path; must be edited per environment.
DATASET_PATH = '/Users/bodapati/Downloads/wataiData 3/csv/CICIoT2023'
# Fraction of rows kept from each CSV file when loading.
SAMPLING_FRACTION = 0.1  # Adjust if kernel crashes
# Number of features shown in the feature-importance printout and heatmap.
TOP_N_FEATURES = 20

# --- HELPER FUNCTION FOR PLOTTING ---
def plot_confusion_matrix(cm, classes, title):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: 2-D matrix of integer counts (cells are rendered with the ``'d'``
            integer format).
        classes: Tick labels used for both the x (predicted) and y (true) axes.
        title: Text shown as the figure title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': classes,
        'yticklabels': classes,
    }

    plt.figure(figsize=(12, 10))
    # seaborn draws on the current axes and hands that Axes object back.
    ax = sns.heatmap(cm, **heatmap_options)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel('True Label', fontsize=12)
    ax.set_xlabel('Predicted Label', fontsize=12)

    # Slant the x tick labels so long class names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

# --- STEP 1: LOAD & SAMPLE CSVs ---
# Every CSV in DATASET_PATH is read in full, then only SAMPLING_FRACTION of its
# rows is kept before the per-file samples are concatenated.
print("\nStep 1: Loading CSV files with sampling to avoid memory issues...")
# glob returns paths in arbitrary, filesystem-dependent order. Sorting pins the
# concatenation order so the resulting row order (and therefore the seeded
# train/test split and duplicate removal) is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(DATASET_PATH, "*.csv")))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {DATASET_PATH}")
print(f"Found {len(all_files)} CSV files.")

# The same seed is used for every file, so each file's sample is deterministic.
df_list = [
    pd.read_csv(csv_path).sample(frac=SAMPLING_FRACTION, random_state=RANDOM_STATE)
    for csv_path in all_files
]
main_df = pd.concat(df_list, ignore_index=True)

# Cleaning: drop columns that are entirely NaN, then rows with any NaN,
# then exact duplicate rows.
main_df.dropna(axis=1, how='all', inplace=True)
main_df.dropna(inplace=True)
main_df.drop_duplicates(inplace=True)
print(f"Data loaded. Shape after sampling and cleaning: {main_df.shape}")

# --- STEP 2: LABEL HANDLING (34 CLASSES) ---
# Remove unlabeled rows *before* reporting, so every statistic printed below
# describes exactly the rows that are kept for training.
main_df.dropna(subset=['label'], inplace=True)
label_column = main_df['label']
print(f"Labels retained as original. Unique classes: {label_column.nunique()}")
print(f"Classes: {label_column.unique()}")

# --- STEP 2.1: PRINT CLASS COUNTS ---
print("\n=== Class Distribution ===")
class_counts = label_column.value_counts()
for label, count in class_counts.items():
    print(f"{label}: {count}")

# Derive the benign/attack totals from the per-class counts rather than
# building two boolean-filtered copies of the whole DataFrame.
normal_count = int(class_counts.get('BenignTraffic', 0))
attack_count = int(class_counts.sum()) - normal_count
print(f"\nNormal (BenignTraffic): {normal_count}")
print(f"Total Attacks: {attack_count}")


# --- STEP 4: TRAIN-TEST SPLIT & FEATURE SCALING ---
# Columns excluded from the feature matrix: the target itself plus columns
# that, judging by their names, are identifiers/timestamps. Only the ones
# actually present in the frame are dropped.
cols_to_drop = ['label', 'device', 'device_category', 'ts']
existing_cols_to_drop = [name for name in cols_to_drop if name in main_df.columns]

# Features are whatever remains and is numeric; the target is the raw label.
y = main_df['label']
X = main_df.drop(columns=existing_cols_to_drop).select_dtypes(include=np.number)

# Encode string labels as integer class ids; `le.classes_` maps ids to names.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# 80/20 split, stratified on the encoded label so both parts keep the
# original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RANDOM_STATE,
)

# Scaling statistics come from the training part only and are then applied
# unchanged to the test part.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- STEP 5: TRAIN LIGHTGBM ---
print("Training LightGBM model for 34-class classification...")
lgb_train = lgb.Dataset(X_train_scaled, label=y_train)
# `reference=lgb_train` ties the evaluation Dataset to the training Dataset.
lgb_test = lgb.Dataset(X_test_scaled, label=y_test, reference=lgb_train)

# One output per encoded class, optimised and monitored with multi-class log loss.
params = dict(
    objective='multiclass',
    num_class=len(le.classes_),
    boosting_type='gbdt',
    metric='multi_logloss',
    num_leaves=64,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    max_depth=-1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Stop when the monitored metric has not improved for 20 rounds; log every 50.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=50),
]

# NOTE(review): the test split is also the early-stopping validation set, so
# the iteration count is tuned on the same data the final metrics are reported
# on. Consider carving a separate validation split out of the training data.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=200,
    valid_sets=[lgb_train, lgb_test],
    callbacks=training_callbacks,
)

# --- STEP 6: EVALUATION ---
# With the multiclass objective, predict() returns one probability column per
# class; the predicted class id is the column with the highest probability.
y_pred_proba = lgb_model.predict(X_test_scaled, num_iteration=lgb_model.best_iteration)
y_pred = np.argmax(y_pred_proba, axis=1)

print("\n=== 34-Class LightGBM Evaluation ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred, average='macro', zero_division=0))
print("Precision:", precision_score(y_test, y_pred, average='macro', zero_division=0))
print("F1-score :", f1_score(y_test, y_pred, average='macro', zero_division=0))

# Pin the report to the full encoded label set. Without `labels`, the report
# infers the classes from y_test/y_pred and raises when that count differs
# from len(target_names), e.g. if a rare class is absent from the test split
# after lowering SAMPLING_FRACTION.
report_labels = np.arange(len(le.classes_))
report = classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=le.classes_,
    zero_division=0,
)
print("\nClassification Report:\n", report)

# --- STEP 6.1: TRAIN & TEST ACCURACY/LOSS ---
# Score the training split with the same iteration used for the test
# predictions so the train/test numbers are directly comparable.
train_pred_proba = lgb_model.predict(X_train_scaled, num_iteration=lgb_model.best_iteration)
train_pred = np.argmax(train_pred_proba, axis=1)

train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, y_pred)

# The probability matrices have one column per encoded class. Passing the full
# label set keeps log_loss from inferring labels from y_true alone, which
# raises if a class happens to be missing from one of the splits.
loss_labels = np.arange(len(le.classes_))
train_loss = log_loss(y_train, train_pred_proba, labels=loss_labels)
test_loss = log_loss(y_test, y_pred_proba, labels=loss_labels)

print("\n=== Model Performance ===")
print(f"Train Accuracy : {train_acc:.4f}")
print(f"Test Accuracy  : {test_acc:.4f}")
print(f"Train Loss     : {train_loss:.4f}")
print(f"Test Loss      : {test_loss:.4f}")

# --- STEP 7: CONFUSION MATRIX ---
# Fix the row/column order to the encoder's class ids so the matrix always has
# one row and column per entry of le.classes_. Otherwise a class missing from
# both y_test and y_pred would shrink the matrix and misalign the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))
plot_confusion_matrix(cm, classes=le.classes_, title='Confusion Matrix - 34-Class LightGBM')

# --- STEP 8: FEATURE IMPORTANCE ---
# Gain-based importance, one value per feature column of X.
importance = lgb_model.feature_importance(importance_type='gain')
feature_names = X.columns
# Feature positions ordered from most to least important.
indices = np.argsort(importance)[::-1]

# The header reports the number of features actually listed instead of a
# hard-coded "20", so it stays correct when TOP_N_FEATURES changes or when
# fewer features than that are available.
n_shown = min(TOP_N_FEATURES, len(feature_names))
print(f"\nTop {n_shown} important features:")
for rank, feature_pos in enumerate(indices[:n_shown], start=1):
    print(f"{rank}. {feature_names[feature_pos]}: {importance[feature_pos]:.4f}")

# --- STEP 9: FEATURE IMPORTANCE HEATMAP ---
# One-row frame (a column per feature), transposed so that features become the
# rows, sorted by importance, and cut to the TOP_N_FEATURES highest entries.
importance_df = pd.DataFrame([importance], columns=feature_names)
ranked_features = importance_df.transpose().sort_values(by=0, ascending=False)
top_features = ranked_features.iloc[:TOP_N_FEATURES]

plt.figure(figsize=(12, 8))
sns.heatmap(top_features, fmt=".4f", cmap='viridis', annot=True)
plt.xlabel("Importance", fontsize=12)
plt.ylabel("Feature", fontsize=12)
plt.title("Top Feature Importances - LightGBM", fontsize=16)
plt.tight_layout()
plt.show()

# --- STEP 10: MULTI-CLASS ROC CURVE ---
# One-vs-rest ROC: binarize the encoded test labels into one indicator column
# per class and score each column against that class's predicted probability.
y_test_bin = label_binarize(y_test, classes=np.arange(len(le.classes_)))
n_classes = y_test_bin.shape[1]

# Per-class curve points and area under the curve, keyed by class id.
fpr, tpr, roc_auc = dict(), dict(), dict()
for class_id in range(n_classes):
    false_pos, true_pos, _thresholds = roc_curve(
        y_test_bin[:, class_id], y_pred_proba[:, class_id]
    )
    fpr[class_id] = false_pos
    tpr[class_id] = true_pos
    roc_auc[class_id] = auc(false_pos, true_pos)

plt.figure(figsize=(10, 8))
for class_id, class_name in zip(range(n_classes), le.classes_):
    curve_label = f'ROC curve of {class_name} (AUC = {roc_auc[class_id]:.2f})'
    plt.plot(fpr[class_id], tpr[class_id], lw=2, label=curve_label)

# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.title("Multi-class ROC Curve - 34-Class LightGBM", fontsize=16)
plt.legend(loc="lower right", fontsize=8)
plt.tight_layout()
plt.show()


Step 1: Loading CSV files with sampling to avoid memory issues...
Found 169 CSV files.
Data loaded. Shape after sampling and cleaning: (4668657, 47)
Labels retained as original. Unique classes: 34
Classes: ['DoS-SYN_Flood' 'DDoS-UDP_Flood' 'DDoS-TCP_Flood'
 'DDoS-SynonymousIP_Flood' 'DDoS-ICMP_Flood' 'DDoS-SYN_Flood'
 'DDoS-RSTFINFlood' 'DDoS-PSHACK_Flood' 'Mirai-greip_flood'
 'DDoS-ACK_Fragmentation' 'DNS_Spoofing' 'DoS-TCP_Flood' 'DoS-UDP_Flood'
 'BenignTraffic' 'Mirai-udpplain' 'Mirai-greeth_flood' 'Recon-PortScan'
 'DoS-HTTP_Flood' 'DDoS-UDP_Fragmentation' 'Recon-HostDiscovery'
 'Recon-OSScan' 'VulnerabilityScan' 'MITM-ArpSpoofing' 'DDoS-SlowLoris'
 'DDoS-ICMP_Fragmentation' 'DDoS-HTTP_Flood' 'DictionaryBruteForce'
 'BrowserHijacking' 'Recon-PingSweep' 'SqlInjection' 'XSS'
 'CommandInjection' 'Backdoor_Malware' 'Uploading_Attack']

=== Class Distribution ===
DDoS-ICMP_Flood: 721320
DDoS-UDP_Flood: 540649
DDoS-TCP_Flood: 450251
DDoS-PSHACK_Flood: 409376
DDoS-SYN_Flood: 405994
DDoS-