In [7]:
import pandas as pd
import numpy as np
import os
import glob
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [38]:
def load_and_preprocess_data(file_paths):
    print("Loading data...")
    dfs = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)
    
    labels_to_remove = ['DictionaryBruteForce', 'BrowserHijacking', 'XSS', 
                        'Uploading_Attack', 'SqlInjection', 'CommandInjection', 
                        'Backdoor_Malware']
    df = df[~df['label'].isin(labels_to_remove)]
    
    print("\nOriginal class distribution:")
    print(df['label'].value_counts())
    
    columns_to_drop = ['label', 'flow_id', 'src_ip', 'src_port', 
                       'dst_ip', 'dst_port', 'protocol', 'timestamp']
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    
    X = df.drop(columns_to_drop, axis=1)
    y = df['label']
    
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    return X, y, le

In [39]:
def balance_dataset(X, y, strategy='hybrid', random_state=42):
    print("\nOriginal dataset shape:", Counter(y))
    
    if strategy == 'smote':
        sampler = SMOTE(random_state=random_state)
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    elif strategy == 'adasyn':
        sampler = ADASYN(random_state=random_state)
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    elif strategy == 'hybrid':
        class_counts = Counter(y)
        median_count = np.median(list(class_counts.values()))
        mean_count = np.mean(list(class_counts.values()))
        target_count = int((median_count + mean_count) / 2)
        
        sampling_strategy_over = {k: target_count for k, v in class_counts.items() 
                                  if v < target_count}
        sampling_strategy_under = {k: target_count for k, v in class_counts.items() 
                                   if v > target_count}
        
        pipeline = Pipeline([
            ('smote', SMOTE(sampling_strategy=sampling_strategy_over, 
                            random_state=random_state)),
            ('undersampler', RandomUnderSampler(sampling_strategy=sampling_strategy_under, 
                                                random_state=random_state))
        ])
        
        X_resampled, y_resampled = pipeline.fit_resample(X, y)
    
    print("Balanced dataset shape:", Counter(y_resampled))
    return X_resampled, y_resampled

In [40]:

def prepare_data_for_training(X, y, test_size=0.2, val_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state, stratify=y_train
    )
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    num_classes = len(np.unique(y))
    y_train_cat = to_categorical(y_train, num_classes)
    y_val_cat = to_categorical(y_val, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)
    
    return (X_train_scaled, X_val_scaled, X_test_scaled,
            y_train_cat, y_val_cat, y_test_cat,
            scaler, X_test, y_test)

In [59]:
def create_and_train_model(X_train, y_train, X_val, y_val, num_classes):
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
    
    model = Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Conv1D(128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(256, kernel_size=3, activation='relu'),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')  
    ])
    
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=200,
                        batch_size=32,
                        verbose=1)
    
    return model, history

In [60]:
def evaluate_model(model, X_test, y_test, le):
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    precision = precision_score(y_test_classes, y_pred_classes, average='weighted')
    recall = recall_score(y_test_classes, y_pred_classes, average='weighted')
    f1 = f1_score(y_test_classes, y_pred_classes, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test_classes, y_pred_classes, target_names=le.classes_))

In [61]:
def save_model(model, scaler, le, model_dir='saveded_model'):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    model.save(os.path.join(model_dir, 'ddos_model.h5'))
    joblib.dump(scaler, os.path.join(model_dir, 'scaler.joblib'))
    joblib.dump(le, os.path.join(model_dir, 'label_encoder.joblib'))
    print(f"Model and associated objects saved in {model_dir}")


In [62]:
def load_saved_model(model_dir='saveded_model'):
    model = load_model(os.path.join(model_dir, 'ddos_model.h5'))
    scaler = joblib.load(os.path.join(model_dir, 'scaler.joblib'))
    le = joblib.load(os.path.join(model_dir, 'label_encoder.joblib'))
    print(f"Model and associated objects loaded from {model_dir}")
    return model, scaler, le

In [63]:
def test_loaded_model(model, scaler, le, X_test, y_test):
    X_test_scaled = scaler.transform(X_test)
    X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)
    
    # Ensure y_test is in categorical format
    if len(y_test.shape) == 1 or y_test.shape[1] == 1:
        y_test_cat = to_categorical(y_test, num_classes=len(le.classes_))
    else:
        y_test_cat = y_test
    
    print("Evaluating loaded model:")
    evaluate_model(model, X_test_scaled, y_test_cat, le)

In [None]:
if __name__ == "__main__":
    file_paths = [f"D:\\DDOS\\New\\archive(4)\\wataiData\\csv\\CICIoT2023\\part-{i:05d}-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv" 
                  for i in range(3)]
    
    # Load and preprocess data
    X, y, le = load_and_preprocess_data(file_paths)
    
    # Balance dataset
    X_balanced, y_balanced = balance_dataset(X, y, strategy='hybrid')
    
    # Prepare data for training
    (X_train_scaled, X_val_scaled, X_test_scaled,
     y_train_cat, y_val_cat, y_test_cat,
     scaler, X_test, y_test) = prepare_data_for_training(X_balanced, y_balanced)
    
    # Create and train model
    num_classes = len(np.unique(y_balanced))
    model, history = create_and_train_model(
        X_train_scaled, y_train_cat,
        X_val_scaled, y_val_cat,
        num_classes
    )
    
    # Save model and components
    save_model(model, scaler, le)
    
    # Load and test model
    loaded_model, loaded_scaler, loaded_le = load_saved_model()
    test_loaded_model(loaded_model, loaded_scaler, loaded_le, 
                      X_test, y_test)
    
    # Print final evaluation
    print("\nFinal model evaluation:")
    evaluate_model(model, X_test_scaled, y_test_cat, le)

Loading data...

Original class distribution:
label
DDoS-ICMP_Flood            112423
DDoS-UDP_Flood              84712
DDoS-TCP_Flood              70630
DDoS-PSHACK_Flood           64473
DDoS-SYN_Flood              64137
DDoS-RSTFINFlood            63524
DDoS-SynonymousIP_Flood     56428
DoS-UDP_Flood               52059
DoS-TCP_Flood               41894
DoS-SYN_Flood               31595
BenignTraffic               17187
Mirai-greeth_flood          15447
Mirai-udpplain              14213
Mirai-greip_flood           11873
DDoS-ICMP_Fragmentation      7194
MITM-ArpSpoofing             4881
DDoS-UDP_Fragmentation       4568
DDoS-ACK_Fragmentation       4524
DNS_Spoofing                 2822
Recon-HostDiscovery          2165
Recon-OSScan                 1517
Recon-PortScan               1311
DoS-HTTP_Flood               1215
VulnerabilityScan             560
DDoS-HTTP_Flood               442
DDoS-SlowLoris                337
Recon-PingSweep                26
Name: count, dtype: int64

Ori

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 11ms/step - accuracy: 0.5922 - loss: 1.1209 - val_accuracy: 0.7418 - val_loss: 0.6371
Epoch 2/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 11ms/step - accuracy: 0.7448 - loss: 0.6295 - val_accuracy: 0.7628 - val_loss: 0.5663
Epoch 3/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 11ms/step - accuracy: 0.7596 - loss: 0.5784 - val_accuracy: 0.7622 - val_loss: 0.6313
Epoch 4/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 11ms/step - accuracy: 0.7678 - loss: 0.5534 - val_accuracy: 0.7708 - val_loss: 0.5266
Epoch 5/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 11ms/step - accuracy: 0.7742 - loss: 0.5338 - val_accuracy: 0.7782 - val_loss: 0.5249
Epoch 6/200
[1m10527/10527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 11ms/step - accuracy: 0.7816 - loss: 0.5176 - val_accuracy: 0.7809