In [None]:
!pip install imbalanced-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import joblib
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tqdm import tqdm
import time


In [None]:
# Load all CSV files and combine them into a single DataFrame.
# The paths are sorted because glob returns them in arbitrary order; a stable
# row order is what makes the seeded split/resampling further down reproducible.
files = sorted(glob.glob('/content/drive/MyDrive/actual-project/data/*.csv'))
if not files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No CSV files found in /content/drive/MyDrive/actual-project/data/ "
        "(is Google Drive mounted?)"
    )
list_dfs = [pd.read_csv(file) for file in files]
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(list_dfs, axis=0, ignore_index=True)

In [None]:
# Missing-value handling for numeric columns:
#   1. treat +/-inf as missing,
#   2. fill every missing entry with the median of its column.
numeric_columns = df.select_dtypes(include='number').columns
cleaned_numeric = df[numeric_columns].replace([np.inf, -np.inf], np.nan)
df[numeric_columns] = cleaned_numeric.fillna(cleaned_numeric.median())
del cleaned_numeric  # temporary copy, not needed once written back

In [None]:
# Data cleaning: drop leading/trailing whitespace from every column name.
stripped_columns = df.columns.str.strip()
df.columns = stripped_columns

In [None]:
# Label encoding: assign a unique integer to each distinct label value
# (each attack type and BENIGN).
# The label column is located case-insensitively, ignoring surrounding whitespace.
label_candidates = [col for col in df.columns if col.strip().lower() == 'label']
if not label_candidates:
    # Fail with an explicit message instead of a bare IndexError from `[...][0]`.
    raise KeyError(f"No 'label' column found among columns: {list(df.columns)}")
label_column = label_candidates[0]

# Normalise the label text so values differing only by whitespace share one class.
df[label_column] = df[label_column].str.strip()

# NOTE: this overwrites the text labels with integers in place, so the cell
# cannot be re-run without reloading `df` first.
label_encoder = LabelEncoder()
df[label_column] = label_encoder.fit_transform(df[label_column])

In [None]:
# Show which integer code the encoder assigned to each label.
# A class's code is its position in `label_encoder.classes_`.
label_mapping = {class_name: code for code, class_name in enumerate(label_encoder.classes_)}
print("Label Mapping (Attack Types):", label_mapping)

In [None]:
# Target = the encoded label column; features = every other column.
y = df[label_column]
X = df.drop(label_column, axis=1)

In [None]:
# One-hot encode any remaining object-typed (string) feature columns.
categorical_columns = X.select_dtypes(include=['object']).columns
if not categorical_columns.empty:
    X = pd.get_dummies(X, columns=categorical_columns)

In [None]:
# Feature scaling: learn the per-column mean / standard deviation from X,
# then standardise X with those statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Persist the fitted scaler so the same transformation can be applied later.
joblib.dump(value=scaler, filename='scaler.pkl')

In [None]:
# Split the data into training (70%) and testing (30%) sets, stratified by label.
# (Only two sets are created here; there is no separate validation set.)
#
# The split is done on the UNSCALED features and the scaler is then re-fitted on
# the training rows only. Fitting the scaler on all rows lets the test set's
# mean/variance leak into the training features. The split itself is determined
# by the number of rows, `stratify` and `random_state`, so the same rows land in
# train/test as when splitting `X_scaled`.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Overwrite the saved scaler so the file on disk matches the one used for training.
joblib.dump(scaler, 'scaler.pkl')

# The unscaled splits are no longer needed; free the memory.
del X_train_raw, X_test_raw

In [None]:
# Balance the classes of the training split with SMOTE oversampling.
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_pair

In [None]:
# Stratified 5-fold cross-validation of the RandomForest, with progress tracking.
#
# Folds are drawn from the ORIGINAL training split (before SMOTE) and SMOTE is
# applied to the training part of each fold only. Cross-validating on data that
# was already oversampled places synthetic neighbours of validation rows in the
# training folds, which inflates the reported accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = []

# Plain arrays so that fold indices are always applied positionally
# (indexing a pandas Series with `[...]` is label-based).
X_train_values = np.asarray(X_train)
y_train_values = np.asarray(y_train)

for train_idx, val_idx in tqdm(cv.split(X_train_values, y_train_values), total=cv.get_n_splits(), desc="Cross-Validation Progress"):
    X_train_fold, X_val_fold = X_train_values[train_idx], X_train_values[val_idx]
    y_train_fold, y_val_fold = y_train_values[train_idx], y_train_values[val_idx]

    # SMOTE needs more samples in the smallest class than `k_neighbors`; a fold
    # holds fewer rows of a rare class than the full training split, so cap it
    # (5 is SMOTE's default).
    smallest_class_size = int(np.unique(y_train_fold, return_counts=True)[1].min())
    fold_smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class_size - 1)))
    X_train_fold, y_train_fold = fold_smote.fit_resample(X_train_fold, y_train_fold)

    # Train the model on the (resampled) training part of the fold
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_fold, y_train_fold)

    # Evaluate on the untouched validation part of the fold (real samples only)
    score = rf_model.score(X_val_fold, y_val_fold)
    cross_val_scores.append(score)

print(f"RandomForest Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores):.2f}")

In [None]:
# Train the final RandomForestClassifier on the entire (SMOTE-resampled) training set.
# A fresh estimator is created here so this cell does not depend on the
# `rf_model` variable left behind by the cross-validation loop.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_res, y_train_res)

In [None]:
# Persist the fitted RandomForest model to disk.
joblib.dump(value=rf_model, filename='random_forest_model.pkl')

In [None]:
# Make predictions on the held-out test set and evaluate the RandomForest model
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"RandomForest Model Accuracy: {accuracy_rf:.2f}")

# Pass the full list of encoded labels so the matrix always has one row/column
# per known class, even if a class is absent from both y_test and the
# predictions. Otherwise the matrix shrinks and no longer lines up with
# `label_encoder.classes_` when it is plotted.
all_class_codes = np.arange(len(label_encoder.classes_))
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf, labels=all_class_codes)
print("Confusion Matrix:")
print(conf_matrix_rf)

In [None]:
# Heatmap of the RandomForest confusion matrix (rows = actual, columns = predicted)
class_names = label_encoder.classes_
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='g',  # general number format for the cell annotations
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - RandomForest')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Classification report (per-class precision / recall / F1)
# `labels` is given explicitly so that `target_names` always matches the number
# of reported classes; without it the call raises when a class is absent from
# both y_test and the predictions.
class_report_rf = classification_report(
    y_test,
    y_pred_rf,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)
print("Classification Report:")
print(class_report_rf)

In [None]:
# Per-class predicted probabilities for every test sample
# (one row per sample, one column per class known to the model).
y_proba_rf = rf_model.predict_proba(X=X_test)

In [None]:
# Find the class with the highest probability for each prediction.
# Done in one vectorised pass: calling `inverse_transform` once per row is very
# slow on a large test set.
# The winning column index is mapped through `rf_model.classes_` because
# predict_proba columns are ordered by the classes the model was fitted on.
predicted_codes = rf_model.classes_[np.argmax(y_proba_rf, axis=1)]
y_pred_labels = list(label_encoder.inverse_transform(predicted_codes))

In [None]:
# Show the first few test predictions together with their per-class probabilities
n_preview = 5
for sample_idx in range(n_preview):
    print(f"Sample {sample_idx + 1}:")
    print(f"  Predicted Label: {y_pred_labels[sample_idx]}")
    class_probabilities = dict(zip(label_encoder.classes_, y_proba_rf[sample_idx]))
    print(f"  Probabilities for each class: {class_probabilities}")
    print()

In [None]:
# One-vs-rest ROC curve per class: the class's probability column is scored
# against "is this test sample of that class?".
for class_code, class_name in enumerate(label_encoder.classes_):
    is_this_class = (y_test == class_code)
    class_scores = y_proba_rf[:, class_code]
    fpr, tpr, _ = roc_curve(is_this_class, class_scores)
    class_auc = roc_auc_score(is_this_class, class_scores)
    plt.plot(fpr, tpr, label=f'{class_name} (AUC = {class_auc:.2f})')

# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('ROC Curve for Each Class')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Deep Learning Model (Multi-class Classification)
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Declare the input shape with an explicit Input layer; passing `input_dim`
# to the first Dense layer is deprecated in current Keras.
model.add(tf.keras.Input(shape=(X_train_res.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# One output unit per class; softmax turns them into class probabilities.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Sparse categorical cross-entropy takes integer class labels directly
# (no one-hot encoding of the target needed).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the network on the SMOTE-resampled training data.
# NOTE: the test split is passed as `validation_data`, so the "validation"
# curves recorded in `history` are computed on the same data evaluated below.
history = model.fit(
    X_train_res,
    y_train_res,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

# Final loss / accuracy on the test split
eval_results = model.evaluate(X_test, y_test)
test_loss, test_accuracy = eval_results[0], eval_results[1]
print(f"Deep Learning Model Test Loss: {test_loss:.2f}")
print(f"Deep Learning Model Test Accuracy: {test_accuracy:.2f}")

In [None]:
# Accuracy per epoch: training curve first, then the validation curve
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[accuracy_key])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, then the validation curve
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()



In [None]:
# Summary of findings (printed one line at a time)
summary_lines = (
    "The updated RandomForestClassifier and Deep Learning models have been trained and evaluated.",
    "The models now predict specific attack types rather than just a binary classification.",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Colab-only helper, imported under an alias so it does not overwrite the
# `files` list of CSV paths created in the data-loading cell.
from google.colab import files as colab_files
import joblib

# Save and download the scaler
joblib.dump(scaler, 'scaler.pkl')
colab_files.download('scaler.pkl')

# Save the Deep Learning model in the native Keras format (.keras)
model.save('deep_learning_model.keras')
# Download the Deep Learning model to the local machine
colab_files.download('deep_learning_model.keras')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>