In [None]:
import os
from pprint import pprint

import pandas as pd
from pymongo import MongoClient

# MongoDB connection URI.
# SECURITY: the URI embeds the admin password, so it is read from the MONGODB_URI
# environment variable instead of being stored in the notebook. The password that
# used to be hardcoded on this line should be treated as leaked and rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    # Fail loudly: MongoClient(None) would fall back to its default host instead of
    # the intended cluster.
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )


# Bind the name before the try block so the finally clause can always evaluate it,
# even when MongoClient() itself raises (e.g. on a malformed URI).
client = None
try:
    # Create the client for the MongoDB cluster
    client = MongoClient(uri)

    db = client["ddos_detection"]
    collection = db["traffic_features"]
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the client connection (only if one was actually created).
    # Note that `db` and `collection` stay bound to this closed client afterwards.
    if client is not None:
        client.close()

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Function to preprocess the JSON data
def preprocess_json_data(collection):
    """Build a scaled feature matrix and integer labels from a document collection.

    Every document returned by ``collection.find()`` is turned into five derived
    features (listed in ``feature_names``) and one integer label. Missing values are
    then replaced with 0 and the features are standardized.

    Args:
        collection: Object whose ``find()`` yields dict-like documents (anything
            with ``.get``), e.g. a pymongo collection.

    Returns:
        tuple: ``(imputer, scaler, X_scaled, y, feature_names)`` where ``imputer``
        and ``scaler`` are the fitted ``SimpleImputer`` / ``StandardScaler``,
        ``X_scaled`` is the standardized 2-D feature array, ``y`` is the label
        array (0 = BENIGN, 1 = UDP_FLOOD, 2 = TCP_SYN_FLOOD, -1 = any other label)
        and ``feature_names`` lists the feature columns in order.

    Raises:
        ValueError: If ``collection.find()`` yields no documents.
    """
    # Define feature names for interpretability (same order as the columns of X)
    feature_names = [
        'tcp_syn_flag_ratio',
        'udp_port_entropy',
        'avg_pkt_size',
        'flow_density',
        'ip_entropy'
    ]

    # Integer code for each known label; everything else is encoded as -1.
    label_codes = {'BENIGN': 0, 'UDP_FLOOD': 1, 'TCP_SYN_FLOOD': 2}

    def _value(entry, key, default):
        # A key that is present but null becomes NaN so the imputer can fill it,
        # instead of raising TypeError in the arithmetic below.
        value = entry.get(key, default)
        return np.nan if value is None else value

    def _ratio(numerator, denominator):
        # The .get() defaults only cover *missing* keys; a stored 0 would otherwise
        # raise ZeroDivisionError. An undefined ratio is reported as NaN.
        if denominator == 0:
            return np.nan
        return numerator / denominator

    rows = []
    labels = []

    # Iterate through the MongoDB collection
    for entry in collection.find():
        syn_total = _value(entry, 'tcp_syn_fwd_count', 0) + _value(entry, 'tcp_syn_bwd_count', 0)
        packet_total = _value(entry, 'fwd_packet_count', 1) + _value(entry, 'bwd_packet_count', 1)

        tcp_syn_flag_ratio = _ratio(syn_total, packet_total)
        udp_port_entropy = (_value(entry, 'unique_udp_source_ports', 0)
                            * _value(entry, 'unique_udp_dest_ports', 0))
        avg_pkt_size = (_value(entry, 'avg_fwd_pkt_size', 0)
                        + _value(entry, 'avg_bwd_pkt_size', 0)) / 2
        flow_density = _ratio(_value(entry, 'flow_packets_per_sec', 0),
                              _value(entry, 'flow_bytes_per_sec', 1))
        ip_entropy = (_value(entry, 'source_ip_entropy', 0)
                      + _value(entry, 'dest_port_entropy', 0))

        rows.append([
            tcp_syn_flag_ratio,
            udp_port_entropy,
            avg_pkt_size,
            flow_density,
            ip_entropy
        ])

        # Unknown labels are kept as -1; callers that must not train on them
        # have to filter them out.
        labels.append(label_codes.get(entry.get('label'), -1))

    if not rows:
        raise ValueError("collection.find() returned no documents; nothing to preprocess")

    # Convert lists to numpy arrays (float so that NaN markers are representable)
    X = np.array(rows, dtype=float)
    y = np.array(labels)

    # Handle missing values (replace with 0)
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X = imputer.fit_transform(X)

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Return the fitted transformers, scaled features, labels, and feature names
    return imputer, scaler, X_scaled, y, feature_names


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pymongo import MongoClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd  # Only for feature importance DataFrame, not for `X` processing.
import joblib  # Import joblib to save the model

# Database and collection to train on. These two names are not assigned anywhere
# else in the notebook, so they are defined here; the values match the literals used
# by the connection cell.
db_name = "ddos_detection"
collection_name = "traffic_features"

# Bind the name before the try block so the finally clause can always evaluate it,
# even when MongoClient() itself raises.
client = None
try:
    # Connect to the MongoDB cluster
    client = MongoClient(uri)

    db = client[db_name]
    collection = db[collection_name]

    # Preprocess data and get the fitted transformers and feature names
    imputer, scaler, X_scaled, y, feature_names = preprocess_json_data(collection)

    # Save the scaler to a .pkl file so inference can apply the same scaling
    scaler_filename = 'scaler.pkl'
    joblib.dump(scaler, scaler_filename)
    print(f"Standard Scaler saved to {scaler_filename}")

    # Save the imputer to a .pkl file
    imputer_filename = 'imputer.pkl'
    joblib.dump(imputer, imputer_filename)
    print(f"Imputer saved to {imputer_filename}")

    # Train-test split (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train the decision tree model
    tree_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

    tree_model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = tree_model.predict(X_test)

    # Evaluation: Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Feature importances, paired with their names and sorted most important first
    feature_importances = tree_model.feature_importances_

    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Print the feature importance
    print(feature_importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    # The fitted estimator is a DecisionTreeClassifier, so label the plot accordingly.
    plt.title('Feature Importance in Decision Tree Model')
    plt.show()

    # Save the trained model to a .pkl file
    model_filename = 'decision_tree_model.pkl'
    joblib.dump(tree_model, model_filename)
    print(f"Model saved to {model_filename}")

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the client connection (only if one was actually created)
    if client is not None:
        client.close()

In [None]:
import joblib
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

# Restore the fitted preprocessing objects from their .pkl files.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler_path, imputer_path = 'scaler.pkl', 'imputer.pkl'
scaler = joblib.load(scaler_path)    # scaler applied after imputation
imputer = joblib.load(imputer_path)  # imputer applied first

# Load the trained model
def load_model(model_path='decision_tree_model.pkl'):
    """Deserialize and return the object stored at ``model_path`` using joblib.

    Args:
        model_path: Path of the persisted model file.

    Returns:
        Whatever object ``joblib.load`` reconstructs from that file.

    NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
    point this at artifacts from a trusted source.
    """
    loaded_model = joblib.load(model_path)
    return loaded_model

# Function to predict a new sample
def predict_sample(model, scaler, imputer, sample_data):
    """Run one feature vector through imputer -> scaler -> model and return the result.

    Args:
        model: Object with a ``predict`` method.
        scaler: Object with a ``transform`` method, applied after imputation.
        imputer: Object with a ``transform`` method, applied first.
        sample_data: A single sample's feature values; it is wrapped in a list so
            the transformers receive a one-row batch.

    Returns:
        The value returned by ``model.predict`` for that one-row batch.
    """
    # The transformers receive a batch, so present the sample as a single row.
    single_row = [sample_data]

    imputed = imputer.transform(single_row)
    # Debug output: the sample after imputation
    print("Imputed sample data:", imputed)

    scaled = scaler.transform(imputed)
    # Debug output: the sample after scaling
    print("Scaled sample data:", scaled)

    return model.predict(scaled)

# Example feature vector, in the training column order:
# [tcp_syn_flag_ratio, udp_port_entropy, avg_pkt_size, flow_density, ip_entropy].
# NOTE(review): these values look already standardized (the ratio and the average
# packet size are negative), yet predict_sample() applies the scaler again. Confirm
# whether raw, unscaled feature values were intended here.
new_sample = [-0.78267391, -0.14192909, -0.11761717, -0.41677815,  2.21815411]

# Restore the persisted classifier
model = load_model(model_path='decision_tree_model.pkl')

# Impute, scale and classify the sample
prediction = predict_sample(model, scaler, imputer, new_sample)

# Unwrap the first element when the result is non-empty; otherwise keep it as is
prediction = prediction[0] if len(prediction) > 0 else prediction

print("Prediction for new sample:", prediction)
