In [1]:
import glob
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, MaxPooling1D, Bidirectional, LSTM, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from keras.regularizers import l2

In [2]:
def load_and_preprocess_data(file_paths):
    """Read the given CSV files and split them into features and encoded labels.

    All files are concatenated into one frame, rows whose ``label`` is one of
    the excluded attack types are discarded, and the remaining label
    distribution is printed. Identifier-style columns (and the label itself)
    are removed from the feature matrix when they are present.

    Args:
        file_paths: Iterable of CSV paths; every file must contain a
            ``label`` column.

    Returns:
        Tuple ``(X, y, le)``: the feature DataFrame, the integer-encoded
        labels produced by ``LabelEncoder.fit_transform``, and the fitted
        encoder itself.
    """
    frames = [pd.read_csv(path) for path in file_paths]
    combined = pd.concat(frames, ignore_index=True)

    # Attack classes that are left out of the classification task.
    excluded_labels = ['DictionaryBruteForce', 'BrowserHijacking', 'XSS', 'Uploading_Attack', 'SqlInjection', 'CommandInjection', 'Backdoor_Malware']
    keep_mask = ~combined['label'].isin(excluded_labels)
    combined = combined[keep_mask]

    # Show how many rows remain per class.
    print(combined['label'].value_counts())

    # Only drop the non-feature columns that actually exist in this data.
    non_feature_columns = ['label', 'flow_id', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol', 'timestamp']
    present_columns = [name for name in non_feature_columns if name in combined.columns]

    X = combined.drop(columns=present_columns)

    encoder = LabelEncoder()
    y = encoder.fit_transform(combined['label'])

    return X, y, encoder

In [3]:
def create_and_train_model(X_train, y_train, X_val, y_val, num_classes,
                           epochs=5, batch_size=32, learning_rate=0.001):
    """Build and fit the Conv1D + bidirectional-LSTM classifier.

    The network stacks three ``Conv1D`` layers (64, 128 and 256 filters,
    kernel size 3) with max-pooling after the first two, two bidirectional
    LSTM layers (64 and 32 units), three L2-regularised dense layers
    (256, 128, 64 units) each followed by dropout of 0.2, and a softmax
    output with ``num_classes`` units.

    Args:
        X_train: 2-D array ``(samples, features)`` of training features.
        y_train: One-hot training targets (the loss is
            ``categorical_crossentropy``).
        X_val: 2-D array ``(samples, features)`` of validation features.
        y_val: One-hot validation targets.
        num_classes: Number of units in the softmax output layer.
        epochs: Number of training epochs (default 5).
        batch_size: Mini-batch size passed to ``fit`` (default 32).
        learning_rate: Adam learning rate (default 0.001).

    Returns:
        Tuple ``(model, history)``: the fitted model and the object returned
        by ``model.fit``.
    """
    # Conv1D expects (samples, steps, channels); each feature becomes one step
    # with a single channel.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

    model = Sequential([
        # An explicit Input layer replaces the `input_shape=` argument on the
        # first Conv1D, which Keras warns against in Sequential models.
        Input(shape=(X_train.shape[1], 1)),
        Conv1D(64, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(256, kernel_size=3, activation='relu'),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1)

    return model, history

In [4]:
def evaluate_model(model, X_test, y_test, le):
    """Print and return classification metrics for ``model`` on a test set.

    Args:
        model: Object exposing ``predict``; its output is reduced with
            ``argmax`` over axis 1 to obtain class indices.
        X_test: Feature array; reshaped to ``(samples, features, 1)`` before
            prediction.
        y_test: One-hot encoded targets (reduced with ``argmax`` over axis 1).
        le: Fitted label encoder; ``le.classes_`` supplies the class names
            for the report.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision, recall and F1 are weighted averages).
    """
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # zero_division=0 keeps the default numeric result (0 for classes that are
    # never predicted) but suppresses the UndefinedMetricWarning spam.
    metrics = {
        'accuracy': accuracy_score(y_test_classes, y_pred_classes),
        'precision': precision_score(y_test_classes, y_pred_classes,
                                     average='weighted', zero_division=0),
        'recall': recall_score(y_test_classes, y_pred_classes,
                               average='weighted', zero_division=0),
        'f1': f1_score(y_test_classes, y_pred_classes,
                       average='weighted', zero_division=0),
    }

    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-score: {metrics['f1']:.4f}")

    # Passing every encoded class id explicitly keeps `target_names` aligned
    # even when a class is absent from both the targets and the predictions;
    # without it classification_report raises on the length mismatch.
    class_ids = np.arange(len(le.classes_))
    print("\nClassification Report:")
    print(classification_report(y_test_classes, y_pred_classes,
                                labels=class_ids,
                                target_names=le.classes_,
                                zero_division=0))

    return metrics

In [5]:
def save_model(model, scaler, le, model_dir='saved_model'):
    """Persist the model, scaler and label encoder into ``model_dir``.

    The directory (including missing parents) is created when needed. The
    model is written via ``model.save`` to ``ddos_model.h5``; the scaler and
    label encoder are written with ``joblib.dump`` to ``scaler.joblib`` and
    ``label_encoder.joblib``.

    Args:
        model: Object exposing ``save(path)``.
        scaler: Fitted scaler to serialise.
        le: Fitted label encoder to serialise.
        model_dir: Target directory (default ``'saved_model'``).
    """
    # exist_ok avoids the check-then-create race and is a no-op when the
    # directory is already there.
    os.makedirs(model_dir, exist_ok=True)

    model.save(os.path.join(model_dir, 'ddos_model.h5'))
    joblib.dump(scaler, os.path.join(model_dir, 'scaler.joblib'))
    joblib.dump(le, os.path.join(model_dir, 'label_encoder.joblib'))
    print(f"Model and associated objects saved in {model_dir}")

In [6]:
def load_saved_model(model_dir='saved_model'):
    """Restore the model, scaler and label encoder stored in ``model_dir``.

    Reads ``ddos_model.h5`` with ``load_model`` and then ``scaler.joblib``
    and ``label_encoder.joblib`` with ``joblib.load``.

    Args:
        model_dir: Directory holding the three artifacts
            (default ``'saved_model'``).

    Returns:
        Tuple ``(model, scaler, le)``.
    """
    def artifact_path(file_name):
        return os.path.join(model_dir, file_name)

    model = load_model(artifact_path('ddos_model.h5'))
    # Loaded in the same order as they were written: scaler first, then encoder.
    scaler, le = (
        joblib.load(artifact_path(file_name))
        for file_name in ('scaler.joblib', 'label_encoder.joblib')
    )
    print(f"Model and associated objects loaded from {model_dir}")
    return model, scaler, le

In [7]:
def test_loaded_model(model, scaler, le, X_test, y_test):
    """Evaluate a reloaded model on raw (unscaled) test data.

    The features are scaled with ``scaler``, given a trailing channel axis,
    and the integer targets are one-hot encoded using the number of classes
    known to ``le`` before everything is handed to ``evaluate_model``.

    Args:
        model: Model to evaluate.
        scaler: Fitted scaler exposing ``transform``.
        le: Fitted label encoder exposing ``classes_``.
        X_test: Unscaled test features.
        y_test: Integer-encoded test labels.
    """
    scaled = scaler.transform(X_test)
    n_samples, n_features = scaled.shape[0], scaled.shape[1]
    features_3d = scaled.reshape(n_samples, n_features, 1)

    class_count = len(le.classes_)
    one_hot_targets = to_categorical(y_test, num_classes=class_count)

    print("Evaluating loaded model:")
    evaluate_model(model, features_3d, one_hot_targets, le)

In [8]:
if __name__ == "__main__":
    # Directory holding the CICIoT2023 CSV shards, and the shard naming scheme.
    # Only the first NUM_PARTS shards (part-00000 .. part-00014) are used.
    # To use every shard in the directory instead:
    #   file_paths = sorted(glob.glob(os.path.join(DATA_DIR, "part-*.csv")))
    DATA_DIR = "D:\\DDOS\\New\\archive(4)\\wataiData\\csv\\CICIoT2023\\"
    PART_SUFFIX = "363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv"
    NUM_PARTS = 15

    file_paths = [
        os.path.join(DATA_DIR, f"part-{part:05d}-{PART_SUFFIX}")
        for part in range(NUM_PARTS)
    ]

    # Fail before the (slow) load starts if any shard is missing.
    missing_paths = [path for path in file_paths if not os.path.isfile(path)]
    if missing_paths:
        raise FileNotFoundError(f"Missing input CSV files: {missing_paths}")

    X, y, le = load_and_preprocess_data(file_paths)

    print("Features:", X.columns.tolist())
    print("Number of features:", X.shape[1])
    print("Unique labels:", le.classes_)

    # 80/20 train/test split, then 80/20 train/validation split of the
    # training part; both are stratified on the encoded label.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

    # The scaler is fitted on the training split only and reused for the rest.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    # Not consumed below (test_loaded_model scales X_test itself).
    X_test_scaled = scaler.transform(X_test)

    num_classes = len(np.unique(y))
    y_train_cat = to_categorical(y_train, num_classes)
    y_val_cat = to_categorical(y_val, num_classes)
    # Not consumed below (test_loaded_model one-hot encodes y_test itself).
    y_test_cat = to_categorical(y_test, num_classes)

    model, history = create_and_train_model(X_train_scaled, y_train_cat, X_val_scaled, y_val_cat, num_classes)

    save_model(model, scaler, le)

    # Round-trip through disk so the evaluation exercises the saved artifacts.
    loaded_model, loaded_scaler, loaded_le = load_saved_model()

    test_loaded_model(loaded_model, loaded_scaler, loaded_le, X_test, y_test)


label
DDoS-ICMP_Flood            544992
DDoS-UDP_Flood             409362
DDoS-TCP_Flood             340918
DDoS-PSHACK_Flood          310418
DDoS-SYN_Flood             307639
DDoS-RSTFINFlood           305877
DDoS-SynonymousIP_Flood    271629
DoS-UDP_Flood              251542
DoS-TCP_Flood              202278
DoS-SYN_Flood              152378
BenignTraffic               83268
Mirai-greeth_flood          74557
Mirai-udpplain              67454
Mirai-greip_flood           57035
DDoS-ICMP_Fragmentation     34273
MITM-ArpSpoofing            23399
DDoS-UDP_Fragmentation      21861
DDoS-ACK_Fragmentation      21759
DNS_Spoofing                13586
Recon-HostDiscovery         10096
Recon-OSScan                 7600
Recon-PortScan               6172
DoS-HTTP_Flood               5506
VulnerabilityScan            2848
DDoS-HTTP_Flood              2163
DDoS-SlowLoris               1789
Recon-PingSweep               139
Name: count, dtype: int64
Features: ['flow_duration', 'Header_Length', 'Prot

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m70611/70611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m854s[0m 12ms/step - accuracy: 0.8753 - loss: 0.3537 - val_accuracy: 0.9669 - val_loss: 0.1036
Epoch 2/5
[1m70611/70611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m824s[0m 12ms/step - accuracy: 0.9639 - loss: 0.1154 - val_accuracy: 0.9676 - val_loss: 0.1002
Epoch 3/5
[1m70611/70611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 9ms/step - accuracy: 0.9643 - loss: 0.1115 - val_accuracy: 0.9678 - val_loss: 0.0989
Epoch 4/5
[1m70611/70611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m856s[0m 12ms/step - accuracy: 0.9648 - loss: 0.1094 - val_accuracy: 0.9676 - val_loss: 0.0960
Epoch 5/5
[1m70611/70611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m845s[0m 12ms/step - accuracy: 0.9651 - loss: 0.1071 - val_accuracy: 0.9680 - val_loss: 0.0949




Model and associated objects saved in saved_model




Model and associated objects loaded from saved_model
Evaluating loaded model:
[1m22066/22066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 5ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9678
Precision: 0.9696
Recall: 0.9678
F1-score: 0.9608

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                         precision    recall  f1-score   support

          BenignTraffic       0.74      0.98      0.84     16654
 DDoS-ACK_Fragmentation       1.00      0.97      0.98      4352
        DDoS-HTTP_Flood       0.74      0.52      0.61       432
        DDoS-ICMP_Flood       1.00      1.00      1.00    108998
DDoS-ICMP_Fragmentation       0.99      0.98      0.98      6855
      DDoS-PSHACK_Flood       1.00      1.00      1.00     62084
       DDoS-RSTFINFlood       1.00      1.00      1.00     61175
         DDoS-SYN_Flood       1.00      1.00      1.00     61528
         DDoS-SlowLoris       0.51      0.32      0.39       358
DDoS-SynonymousIP_Flood       1.00      1.00      1.00     54326
         DDoS-TCP_Flood       1.00      1.00      1.00     68184
         DDoS-UDP_Flood       1.00      1.00      1.00     81872
 DDoS-UDP_Fragmentation       0.99      0.98      0.98      4372
           DNS_Spoofing       0.40      0.37      0.38      2717
         DoS-HTTP_Flood 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import joblib

def predict_ddos(input_data, model_dir='saved_model'):
    """Classify a feature vector with the artifacts stored in ``model_dir``.

    The model, scaler and label encoder are loaded from ``model_dir`` on every
    call. When several rows are supplied, the returned label and probability
    both describe the first row.

    Args:
        input_data: One sample as a 1-D list/tuple/array, or a batch as a
            2-D list, array or DataFrame with one row per sample.
        model_dir: Directory containing ``ddos_model.h5``, ``scaler.joblib``
            and ``label_encoder.joblib`` (default ``'saved_model'``).

    Returns:
        Tuple ``(predicted_label, predicted_probability)`` for the first row.

    Raises:
        ValueError: If ``input_data`` is empty or is not 1-D or 2-D.
    """
    model = load_model(f"{model_dir}/ddos_model.h5")
    # NOTE: joblib files are pickle-based; only load artifacts you trust.
    scaler = joblib.load(f"{model_dir}/scaler.joblib")
    le = joblib.load(f"{model_dir}/label_encoder.joblib")

    # Normalise the input to a 2-D array of shape (samples, features).
    if isinstance(input_data, pd.DataFrame):
        input_data = input_data.values
    elif not isinstance(input_data, np.ndarray):
        input_data = np.array(input_data)
    # Only a flat vector is promoted to a single row; nested lists keep their
    # row structure instead of being flattened into one bogus sample.
    if input_data.ndim == 1:
        input_data = input_data.reshape(1, -1)
    if input_data.ndim != 2 or input_data.size == 0:
        raise ValueError(
            "input_data must be a non-empty 1-D feature vector or 2-D batch, "
            f"got shape {input_data.shape}"
        )

    # Scale the input data
    input_data_scaled = scaler.transform(input_data)

    # Reshape for Conv1D layer
    input_data_reshaped = input_data_scaled.reshape(input_data_scaled.shape[0], input_data_scaled.shape[1], 1)

    # Make prediction
    prediction = model.predict(input_data_reshaped)

    # Label and probability are both taken from the first row so they always
    # refer to the same sample.
    first_row = prediction[0]
    predicted_class_index = np.argmax(first_row)
    predicted_probability = np.max(first_row)
    predicted_label = le.inverse_transform([predicted_class_index])[0]

    return predicted_label, predicted_probability

# Example usage
if __name__ == "__main__":
    # Demo: classify one hand-written feature vector (46 values).
    # Substitute real feature values from the dataset to get a meaningful result.
    sample_data = [
        1000, 20, 6, 60, 100, 50, 50, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1000, 10, 100, 55, 30, 1000, 0.1,
        10, 100, 50, 0.5, 900, 1,
    ]

    predicted_label, predicted_probability = predict_ddos(sample_data)

    print(f"Predicted label: {predicted_label}")
    # print(f"Prediction probability: {predicted_probability:.4f}")

    # Optional interactive variant (disabled): read comma-separated feature
    # values from the user and classify them.
    # print("\nEnter custom values for prediction (comma-separated):")
    # user_input = input("Enter values: ")
    # user_values = [float(x.strip()) for x in user_input.split(',')]

    # custom_prediction, custom_probability = predict_ddos(user_values)
    # print(f"Predicted label for custom input: {custom_prediction}")
    # print(f"Prediction probability: {custom_probability:.4f}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 652ms/step
Predicted label: BenignTraffic
