## Set up environment

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import os

import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, cross_validate
from sklearn.utils import resample
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE





from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier




pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

: 

## Loading the datasets into dataframes

In [2]:
df_fridayAF_DDos = pd.read_csv("../datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
df_fridayAF_PortScan = pd.read_csv("../datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
df_fridayMO = pd.read_csv("../datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv")
df_monday = pd.read_csv("../datasets/Monday-WorkingHours.pcap_ISCX.csv")
df_thursdayAF_Infilteration = pd.read_csv("../datasets/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")
df_thursdayMO_WebAttacks = pd.read_csv("../datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
df_tuesday = pd.read_csv("../datasets/Tuesday-WorkingHours.pcap_ISCX.csv")
df_wednesday = pd.read_csv("../datasets/Wednesday-workingHours.pcap_ISCX.csv")

## Concatenating the dataframes to single dataframe

In [3]:
df_data = pd.concat([df_fridayAF_DDos,df_fridayAF_PortScan,df_fridayMO,df_monday,df_thursdayAF_Infilteration,df_thursdayMO_WebAttacks,df_tuesday,df_wednesday], axis=0,ignore_index=True)

In [None]:
print(df_data.columns)

In [None]:
df_data.head(5)

In [None]:
# Displaying unique values from the 'Label' column
unique_attacks = df_data[' Label'].unique()
print(unique_attacks)

### Before applying labellization

In [None]:
df_data[' Label'].value_counts()

In [8]:
attack_map = {
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DDoS': 'DoS',
    'PortScan': 'PortScan',
    'Bot': 'Bot',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed',
}


In [None]:
df_data[' Label'] = df_data[' Label'].replace(attack_map)

# Now you can print the DataFrame to see the changes
print(df_data[' Label'].unique())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming df_data is your DataFrame and it's already been modified with the attack_map

# Create a count plot for the 'Label' column
plt.figure(figsize=(6, 8))  # Adjust the size of the plot as needed
sns.countplot(y=' Label', data=df_data, order=df_data[' Label'].value_counts().index)
plt.title('Distribution of Attack Types')
plt.xlabel('Count')
plt.ylabel('Attack Type')
# plt.grid(True)  # Optionally add a grid for better readability
plt.show()


### After applying labellization

In [None]:
label_counts = df_data[' Label'].value_counts()

# Print the counts
print(label_counts)

# Data Preprocessing

#### Find columns with zero variance; columns where all values are the same

In [None]:
zero_variance_cols = [col for col in df_data.columns if df_data[col].nunique() == 1]

# Display the columns with zero variance
if zero_variance_cols:
    print(f"Columns with zero variance: {zero_variance_cols}")
else:
    print("No columns with zero variance found.")

##### Shape before removing zero variance columns

In [None]:
print('Shape before removing zero variance columns:', df_data.shape)

#### Handle columns with zero variance

In [None]:
if zero_variance_cols:
        df_data.drop(zero_variance_cols, axis=1, inplace=True)
        print(f'Dropped zero variance columns: {zero_variance_cols}')

##### Shape after removing zero variance columns

In [None]:
print('Shape after removing zero variance columns:', df_data.shape)

#### Find spaces in column names

In [None]:
df_data.keys()

##### Handle spaces from column names

In [17]:
df_data.columns = df_data.columns.str.strip()

In [None]:
df_data.keys()

### Identify Rows with NaN, inf, or -inf Values
##### The rows listed here contain NaN (Not a Number) values across the columns displayed.

In [None]:
# Select only numeric columns
numeric_cols = df_data.select_dtypes(include=[np.number])

# Check for inf and -inf values in the numeric columns
inf_values = np.isinf(numeric_cols).sum()

# Display the count of inf and -inf values in each numeric column
print("Count of inf and -inf values in each numeric column:")
print(inf_values)

### Check how many NaN values exist

In [None]:
df_data.isna().sum()

### Replace inf and -inf with NaN

In [21]:
df_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
df_data.isna().sum()

### Check for empty strings or negative values in Flow Bytes/s or Flow Packets/s, since they both contain the same number of NaN values

In [None]:
sub_df = df_data[df_data['Flow Bytes/s'].isna() | (df_data['Flow Bytes/s'] == '') | (df_data['Flow Bytes/s'] < 0)]
sub_df['Label'].value_counts()

In [None]:
print("Check for NaN values in Label column:", df_data['Label'].isna().sum())

In [None]:
df_data['Label'].value_counts()

In [None]:
print(df_data.isna().sum())

#### This function will filter the dataframe based on NaN values in the specified columns and then calculate the value counts of the 'Label' column for these filtered rows.

In [None]:
import pandas as pd

def nan_label_counts(df_data, column_nan_list):
    """Count 'Label' values among the rows that have a NaN in the given columns.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame that contains a 'Label' column plus every column named in
        ``column_nan_list``.
    column_nan_list : list
        Column names to inspect; a row is selected as soon as at least one of
        these columns is NaN in that row.

    Returns
    -------
    pandas.Series
        ``value_counts()`` of 'Label' restricted to the selected rows.
    """
    # Row-wise flag: True when any inspected column is missing in that row.
    has_missing = df_data[column_nan_list].isna().any(axis=1)

    # Select the labels of the flagged rows and tally them in one step.
    return df_data.loc[has_missing, 'Label'].value_counts()

# Example usage:
column_nan_list = ['Flow Bytes/s']
print(nan_label_counts(df_data, column_nan_list))


In [None]:
import pandas as pd

def nan_label_counts(df_data, column_nan_list):
    """Tally the 'Label' column over rows with missing data in selected columns.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Source frame; it is indexed with ``column_nan_list`` and with 'Label'.
    column_nan_list : list
        Names of the columns whose NaN entries select a row (any one NaN is
        enough).

    Returns
    -------
    pandas.Series
        Occurrences of each 'Label' value among the selected rows.
    """
    inspected = df_data[column_nan_list]

    # One boolean per row: does any inspected column hold a NaN there?
    row_has_nan = inspected.isna().any(axis=1)

    labels_of_nan_rows = df_data[row_has_nan]['Label']
    return labels_of_nan_rows.value_counts()

# Example usage:
column_nan_list = ['Flow Packets/s']
print(nan_label_counts(df_data, column_nan_list))


#### Since the occurrences of NaN are very low, dropping the rows that contain NaN values will not have any adverse effect

### Ways to handle missing values

##### Drop rows with missing values

In [29]:
df_data.dropna(inplace=True)

In [None]:
df_data.isna().sum()

##### Identify duplicate rows

In [None]:
duplicate_rows = df_data[df_data.duplicated()]

if not duplicate_rows.empty:
    print("Duplicate rows:")
    print(duplicate_rows.shape)
else:
    print("No duplicate rows found.")

##### Identify columns with identical values

In [32]:
column_pairs = []
num_columns = len(df_data.columns)

for i in range(num_columns):
    for j in range(i + 1, num_columns):
        if df_data.iloc[:, i].equals(df_data.iloc[:, j]): 
            column_pairs.append((df_data.columns[i], df_data.columns[j]))

In [None]:
print("Shape before removing identical columns:", df_data.shape)

##### Print the column pairs with identical values

In [None]:
if column_pairs:
    print("Columns with identical values:")
    for pair in column_pairs:
        print(f"{pair[0]} and {pair[1]} have identical values.")
    
    # Step 3: Drop one column from each pair
    columns_to_drop = [pair[1] for pair in column_pairs]
    df_data.drop(columns=columns_to_drop, axis=1, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
else:
    print("No columns with identical values found.")

print("Shape after removing identical columns:", df_data.shape)

#### Check for non-numeric columns

In [None]:
non_numeric_columns = df_data.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

#### Displaying the unique categories of cyberattacks from a dataset, represented by the 'Label' column in a DataFrame

In [None]:
df_data['Label'].unique()

In [None]:
print(df_data['Label'].unique())

### Distribution of Network Traffic Types in the Dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'df_data' is your DataFrame and it has a column 'Label' with all attack types
sns.countplot(y='Label', data=df_data, order=df_data['Label'].value_counts().index)
plt.title('Distribution of Attack Types')
plt.xlabel('Count')
plt.ylabel('Attack Type')
plt.show()
print(df_data['Label'].value_counts())

### Create a heatmap to visualize missing values

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(df_data.isnull(), cbar=False, cmap='viridis')
plt.show()

### Color: The uniform color (purple in this case) across the entire heatmap indicates consistency in the data regarding the presence or absence of missing values. Since the heatmap shows no variation in color, it tells us that there are no missing (NaN) values in any part of your DataFrame.
### Vertical Axis: Since our DataFrame likely contains a large number of rows, only some index labels are shown
### Horizontal Axis (X-axis): Displays the feature or column names of your DataFrame. All columns are represented, and the uniformity of color across all columns confirms that no columns contain missing data.

### Quantitative check to confirm that there are indeed no missing values in the dataset

In [None]:
total_missing = df_data.isnull().sum().sum()
print("Total missing values in the dataset:", total_missing)

### Identifying Categorical Features Based on Unique Value Counts

In [None]:
categorical_columns = []

for c in df_data.columns:
    # Check if the column is categorical (dtype of object or category) or numerical with less than 10 unique values
    if df_data[c].nunique() < 10:
        # Print unique values of columns with < 10 unique values
        print(f"Column '{c}' has unique values: {df_data[c].unique()}")
        categorical_columns.append(c)

# If you want to display the list of categorical columns with unique values < 10
print("Columns with unique values < 10:")
print(categorical_columns)

### Create numerical columns list and remove the target from categorical columns

#### categorical columns before removing Label

In [None]:
categorical_columns

#### categorical columns after removing Label

In [43]:
numerical_columns = [col for col in df_data.columns if col not in categorical_columns]
categorical_columns.remove('Label')

In [None]:
categorical_columns

In [None]:
df_data[categorical_columns].dtypes

## Exploratory Data Analysis (EDA) 

### Class Distribution : Check the distribution of the target variable (Label) to identify whether the data is balanced or imbalanced.

### Before balancing

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'df_data' is your DataFrame and it has a column 'Label' with all attack types
sns.countplot(y='Label', data=df_data, order=df_data['Label'].value_counts().index)
plt.title('Distribution of Attack Types')
plt.xlabel('Count')
plt.ylabel('Attack Type')
plt.show()
print(df_data['Label'].value_counts())

### In our exploratory data analysis, we identified that our dataset is significantly imbalanced, with a larger number of 'Benign' instances compared to all other attacks. This imbalance can lead to biased predictive models that perform well on the majority class but poorly on the minority class.

### Correlation Matrix : Blue represents negative correlations, red represents positive correlations, and white or pale colors represent no correlation.

In [None]:
# Calculate the correlation matrix
corr_matrix = df_data[numerical_columns].corr()

# Plotting the heatmap with adjustments
plt.figure(figsize=(12, 10)) 
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5)

plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)

plt.title('Feature Correlation Heatmap', fontsize=16)
plt.tight_layout()
plt.show()

### Since there are too many features, I focused on the most correlated features by selecting a subset

In [None]:
# Select top correlated features (optional)
corr_threshold = 0.5  # Example: Use a threshold to filter features
high_corr_features = corr_matrix.columns[(corr_matrix.abs() > corr_threshold).any()].tolist()

# Plot only the high-correlation features
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix.loc[high_corr_features, high_corr_features], annot=False, cmap='coolwarm', linewidths=0.5)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.title('High-Correlation Features Heatmap', fontsize=16)
plt.tight_layout()
plt.show()

#### Since there are too many features, I focused on the most correlated features by selecting a subset and use a threshold to filter features. This particular heatmap visualizes the relationships among various network traffic features, which are crucial for identifying patterns that might suggest normal or malicious behaviors in network traffic data.

### Tabular summary of the most and least correlated features from a correlation matrix

In [None]:
def summarize_correlations(corr_matrix, n=5):
    """Return the ``n`` highest and ``n`` lowest pairwise correlations.

    Only the strict lower triangle of ``corr_matrix`` is considered, so each
    feature pair appears once and the diagonal (self-correlation) is ignored.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.
    n : int, default 5
        Number of pairs to return in each summary.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(top_correlations, least_correlations)``, each with the columns
        'Feature1', 'Feature2' and 'Correlation'. "Least" means the lowest
        signed values (``nsmallest``), i.e. the most negative correlations,
        not the ones closest to zero.
    """
    # Boolean mask of the strict lower triangle. The builtin ``bool`` replaces
    # the ``np.bool`` alias, which no longer exists in current NumPy releases.
    lower_triangle = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
    corr_df = corr_matrix.where(lower_triangle)

    # Long format: one row per (row feature, column feature) pair. The explicit
    # dropna() removes the masked-out cells regardless of whether stack()
    # itself discards NaN entries.
    long_corr = corr_df.stack().dropna().reset_index()
    long_corr.columns = ['Feature1', 'Feature2', 'Correlation']

    top_correlations = long_corr.nlargest(n, 'Correlation')
    least_correlations = long_corr.nsmallest(n, 'Correlation')

    return top_correlations, least_correlations

# Get the summaries
top_correlations, least_correlations = summarize_correlations(corr_matrix, n=5)

# Print or return these summaries as needed
print("Top Correlations:")
print(top_correlations)
print("\nLeast Correlations:")
print(least_correlations)

### Box Plot to Check Outliers: Box plots help identify outliers within the dataset for each feature. Box plots for each numerical column


In [None]:
plt.figure(figsize=(15, 10))
df_data[numerical_columns].boxplot(rot=90)
plt.title("Box Plot for Outlier Detection")
plt.ylim(0, max(df_data[numerical_columns].max())*1.1)
plt.show()

### Focus on the "Flow Bytes/s" column only

#### The shape before capping the outliers

In [51]:
df_data.shape

In [None]:
column_of_interest = 'Flow Bytes/s'
# Plot box plot for the selected column
plt.figure(figsize=(8, 6))
df_data[[column_of_interest]].boxplot()
plt.title(f"Box Plot for Outlier Detection: {column_of_interest}")
plt.ylim(0, df_data[column_of_interest].max() * 1.1) 

### Handle Outliers : Capping the outliers instead of removing them ensures that extreme values are still present but limited to a reasonable range
* The first quartile Q1 is the 25th percentile
* The third quartile Q3 is the 75th percentile
* Any data point below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR is an outlier, where IQR = Q3 − Q1 is the interquartile range.
### The cap_outliers function doesn't remove the outliers but caps them, meaning outliers are adjusted or limited to fall within a certain range.
### Any value that exceeds a predefined upper or lower bound is set to that boundary value. It modifies values that fall outside these bounds by capping them at the nearest bound.

In [53]:
def cap_outliers(df, col):
    """Clip one column of ``df`` to the 1.5 * IQR fences and return ``df``.

    The column is overwritten on the frame that was passed in; no rows are
    removed, values beyond a fence are replaced by that fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to cap (modified in place).
    col : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` capped.
    """
    # 25th and 75th percentiles of the column.
    first_quartile, third_quartile = (df[col].quantile(q) for q in (0.25, 0.75))

    # Interquartile range: spread of the middle 50% of the data.
    spread = third_quartile - first_quartile

    df[col] = df[col].clip(
        lower=first_quartile - 1.5 * spread,
        upper=third_quartile + 1.5 * spread,
    )
    return df
df_data = cap_outliers(df_data, 'Flow Bytes/s')

In [None]:
plt.figure(figsize=(8, 6))
df_data[[column_of_interest]].boxplot()
plt.title(f"Box Plot after Handling Outliers: {column_of_interest}")
plt.ylim(0, df_data[column_of_interest].max() * 1.1)
plt.show()

#### The shape after capping the outliers

In [None]:
df_data.shape

In [None]:
df_data['Label'].value_counts()

## label encoder
## print the target


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names in 'Label' as integer codes.
# The column is addressed by name and replaced as a whole instead of being
# written through `iloc[:, -1]`:
#   * it no longer relies on 'Label' happening to be the last column, and
#   * replacing the column lets it take the integer dtype of the encoded
#     values; an in-place positional write into the existing string column can
#     leave it with dtype `object` on recent pandas versions, which
#     scikit-learn estimators may reject as an unknown target type.
# `le` is kept so the codes can be mapped back to names via `le.classes_`.
le = LabelEncoder()
df_data['Label'] = le.fit_transform(df_data['Label'])

In [None]:
df_data['Label'].value_counts()

## Feature Engineering

#### Splitting of the data : HoldOut Validation Approach Train and test split.

In [57]:
# Drop rows with missing values
X = df_data.drop('Label',axis=1)

# Make sure to drop corresponding rows in y
y = df_data['Label']

# Split the dataset into train and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
df_data['Label'].value_counts()

* Perform undersampling on BENIGN and DoS using K-means clustering (cluster-based instance selection, CBIS), i.e. removing samples from the majority classes of the dataset.
* Perform oversampling on Infiltration and Heartbleed using the SMOTE algorithm, which is a popular over-sampling technique.
* The majority classes are BENIGN and DoS; the minority classes are PortScan, Brute Force, Web Attack, Bot, Infiltration and Heartbleed.

==========================

### Cluster centroids

### If we don’t provide a specific target value, the resampling algorithm might either undersample too aggressively (leading to potential loss of important information) or not enough. 
### Specifying target sizes allows us to control the resampling process and ensure that each class is appropriately represented after resampling. 
* https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.ClusterCentroids.html

In [None]:
from imblearn.under_sampling import ClusterCentroids
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from collections import Counter
import numpy as np
import pandas as pd

# Assume X and y are predefined
# Example:
# X = pd.DataFrame(...)  # Feature matrix
# y = pd.Series(...)     # Target vector



# Step 1: Analyze original training class distribution
original_counts_train = Counter(y_train)
print("Original training class distribution:", original_counts_train)

# Step 2: Define sampling strategy
# The classes to undersample are named explicitly and translated to their
# integer codes through the fitted LabelEncoder `le`. The previous hard-coded
# list [0, 3, 5] included code 5, which (LabelEncoder sorts class names
# alphabetically) is 'Infiltration' — one of the rare classes this notebook
# intends to oversample, not shrink.
majority_class_names = ['BENIGN', 'DoS']
majority_classes = le.transform(majority_class_names).tolist()

# Each majority class is reduced to 10% of its training count (at least 1).
sampling_strategy = {}
for cls in majority_classes:
    if cls in original_counts_train:
        desired_count = max(1, int(0.1 * original_counts_train[cls]))
        sampling_strategy[cls] = desired_count
    else:
        print(f"Class {cls} not found in the training target vector.")

print("Sampling strategy:", sampling_strategy)

# Step 3: Initialize ClusterCentroids
# MiniBatchKMeans is the clustering estimator; `sampling_strategy` maps each
# class to undersample to the number of samples it should keep.
cc = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1, random_state=0),
    sampling_strategy=sampling_strategy,
    random_state=42
)

# Step 4: Apply undersampling to training data
# Only the training partition is resampled; X_test / y_test are not touched.
X_train_resampled, y_train_resampled = cc.fit_resample(X_train, y_train)

# Step 5: Verify resampled training class distribution
new_counts_train = Counter(y_train_resampled)
print("Resampled training class distribution:", new_counts_train)

# Step 6: Train the model
# NOTE(review): this classifier is fitted on the resampled data, but the
# resampled arrays are stored under new names, so any later code that reads
# X_train / y_train does not see the undersampling.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Step 7: Evaluate the model on the untouched test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

## i want to perform undersampling 

In [None]:
# Now apply oversampling to Infiltration and Heartbleed.
# y_train holds the integer codes produced by the LabelEncoder `le`, not the
# original class names, so the SMOTE targets are declared by name and then
# translated to those codes. Passing the names directly makes SMOTE fail,
# because no class called 'Infiltration' / 'Heartbleed' exists in y_train.
minority_targets = {'Infiltration': 1000, 'Heartbleed': 1000}
minority_codes = le.transform(list(minority_targets)).tolist()
smote_strategy = dict(zip(minority_codes, minority_targets.values()))

smote = SMOTE(sampling_strategy=smote_strategy, random_state=0)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Check the new class distribution
print('Resampled dataset shape:', Counter(y_train))

### Feature Importance 

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Train the RandomForestClassifier
#rf.fit(X_train_downsampling, y_train)

rf.fit(X_train, y_train)

# Calculate feature importances
feature_importances = rf.feature_importances_

# Convert to a pandas DataFrame for easier visualization
importance_series = pd.Series(feature_importances, index=X_train.columns)

# Sort the features by importance
importance_series = importance_series.sort_values(ascending=False)

# Plot the feature importances
plt.figure(figsize=(10, 6))
importance_series.plot(kind='bar', color='skyblue')
plt.title('Feature Importance Using RandomForest')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.show()

### SMOTE should be applied after splitting the data into training and testing sets. This is to ensure that the synthetic data points created do not leak from the test set into the training set, which could lead to overfitting:
How SMOTE works:

1. Identifying Neighbors: For a data point in the minority class, SMOTE finds its nearest neighbors in the feature space.
2. Synthesizing New Data: SMOTE selects one of these nearest neighbors and computes a line segment connecting the minority class data point and its selected neighbor. It then creates new, synthetic data points along this line.
3. Repeating: This process is repeated until the minority class is adequately represented and balances the dataset.


### Handle class imbalance : SMOTE (Synthetic Minority Over-sampling Technique)

In [54]:
# from imblearn.over_sampling import SMOTE


# # Drop rows with missing values
# X = df_data.drop('Label', axis=1)
# y = df_data['Label']

# # Split the dataset into train and test sets (70% train, 30% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Applying SMOTE to the training data
# smote = SMOTE(random_state=42,k_neighbors=2)
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# X_train, y_train = X_train_smote, y_train_smote

In [57]:
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import train_test_split

# # Assume X and y are your features and labels
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Applying SMOTE to the training data
# smote = SMOTE(random_state=42, k_neighbors=2)
# X_train, y_train = smote.fit_resample(X_train, y_train)

# # Now, X_train and y_train are the resampled data


## To see how the SMOTE impacted each feature in my dataset

In [None]:
df_data['Label'].value_counts()

### Calculate the importance for every feature

In [None]:
# # Calculate mutual information between numerical features and the target
# importances = mutual_info_classif(X_train, y_train)

# # Convert to a pandas DataFrame for easier visualization
# mi_series = pd.Series(importances, index=X.columns)

# # Sort the features by importance
# mi_series = mi_series.sort_values(ascending=False)

# # Plot the mutual information values
# plt.figure(figsize=(10, 6))
# mi_series.plot(kind='bar', color='skyblue')
# plt.title('Mutual Information Between Features and Label')
# plt.xlabel('Features')
# plt.ylabel('Mutual Information')
# plt.show()

### Calculate the sum of importance scores

In [None]:
# # Sort the feature importances along with the feature names
# f_list = sorted(zip(map(lambda x: round(x, 4), importances), X_train.columns), reverse=True) ## 

# # Initialize variables
# Sum = 0
# fs = []

# # Calculate the sum of feature importances and store features
# for i in range(len(f_list)):
#     Sum += f_list[i][0]  # Summing up the importance scores
#     fs.append(f_list[i][1])  # Append the feature name to the list

# # Optional: Print or return the feature importances and the sum
# print("Sum of feature importances:", Sum)

### We select the important feature until the accumulated importance reaches 90%

In [70]:
# f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), X_train.columns), reverse=True)
# Sum2 = 0
# fs = []

# for i in range(0, len(f_list2)):
#     Sum2 = Sum2 + f_list2[i][0]
#     fs.append(f_list2[i][1])
    
#     if Sum2 >= 0.9:
#         break

In [None]:
# print(len(fs))
# print(fs)
feature_selected = ['Average Packet Size', 'Packet Length Mean', 'Fwd Header Length', 'Init_Win_bytes_forward', 'Packet Length Variance', 'Packet Length Std', 'Subflow Fwd Bytes', 'Total Length of Fwd Packets', 'Bwd Header Length', 'Flow IAT Max', 'Max Packet Length', 'Fwd IAT Max', 'Destination Port', 'Init_Win_bytes_backward', 'Flow Bytes/s', 'Fwd Packet Length Max', 'Flow Duration', 'Fwd IAT Total', 'Total Length of Bwd Packets', 'Subflow Bwd Bytes', 'Fwd Packets/s', 'Fwd Packet Length Mean', 'Bwd Packets/s', 'Avg Fwd Segment Size', 'Flow Packets/s', 'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Bwd Packet Length Max', 'Flow IAT Std', 'Fwd IAT Std', 'Total Backward Packets', 'Total Fwd Packets', 'Fwd Packet Length Std', 'Bwd IAT Total', 'Bwd IAT Max', 'act_data_pkt_fwd', 'Bwd IAT Mean', 'Bwd Packet Length Std', 'Bwd IAT Std', 'Fwd IAT Min', 'Flow IAT Min', 'Bwd IAT Min']


### Apply Z-Score Normalization : In our preprocessing steps, we applied Z-score normalization to our numerical features using StandardScaler. This method standardizes the features so that each has a mean of zero and a standard deviation of one, aligning with the Z-score formula.

In [65]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler only on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test[numerical_columns])

### This code handles the integration of scaled numerical data with unscaled categorical data

In [None]:
# Convert the scaled data back into DataFrames for ease of use
# columns=numerical_columns: This specifies the column names for the new DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numerical_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numerical_columns)

# axis=1 means along "columns". It's a column-wise operation.
X_train_final = pd.concat([X_train_scaled_df, X_train[categorical_columns].reset_index(drop=True)], axis=1)
X_test_final = pd.concat([X_test_scaled_df, X_test[categorical_columns].reset_index(drop=True)], axis=1)

# Print the shape of the final datasets to verify the concatenation
print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape:", X_test_final.shape)

### This cell shows the first rows of the final training set (scaled numerical features plus the unscaled categorical features)

In [None]:
X_train_final.head(5)

### This cell shows the first rows of the final test set (scaled with the scaler fitted on the training data)

In [None]:
X_test_final.head(5)

## Training Models on all the features
#### We perform k-fold cross validation on the entire dataset

In [71]:
score = []  # For normal training results
score_hyper = []  # For hyperparameter tuning results

In [72]:
def save_best_params_to_json(model_name, best_params):
    """Store ``best_params`` under ``model_name`` in the shared JSON file.

    The file "best_params_multiclass_multiple_models.json" in the current
    working directory holds one entry per model name. An existing entry for
    ``model_name`` is replaced; entries of other models are kept.

    Parameters
    ----------
    model_name : str
        Key under which the parameters are stored.
    best_params : dict
        JSON-serialisable mapping of parameter names to values.

    Raises
    ------
    TypeError
        If ``best_params`` contains a value ``json`` cannot serialise. The
        file on disk is left untouched in that case.
    """
    filename= "best_params_multiclass_multiple_models.json"

    # Load what is already stored; a missing file simply means "nothing yet".
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = {}

    # Update the dictionary with the new model's best parameters
    data[model_name] = best_params

    # Serialise BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a serialisation error raised afterwards would wipe every
    # previously saved entry.
    serialized = json.dumps(data, indent=4)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(serialized)

In [73]:
def save_best_fs_params_to_json(model_name, best_params):
    """Store ``best_params`` under ``model_name`` in the feature-selection JSON file.

    The file "best_params_fs_multiclass_multiple_models.json" in the current
    working directory holds one entry per model name. An existing entry for
    ``model_name`` is replaced; entries of other models are kept.

    Parameters
    ----------
    model_name : str
        Key under which the parameters are stored.
    best_params : dict
        JSON-serialisable mapping of parameter names to values.

    Raises
    ------
    TypeError
        If ``best_params`` contains a value ``json`` cannot serialise. The
        file on disk is left untouched in that case.
    """
    filename= "best_params_fs_multiclass_multiple_models.json"

    # Load what is already stored; a missing file simply means "nothing yet".
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = {}

    # Update the dictionary with the new model's best parameters
    data[model_name] = best_params

    # Serialise BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a serialisation error raised afterwards would wipe every
    # previously saved entry.
    serialized = json.dumps(data, indent=4)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(serialized)

## 1. Random Forest Classifier

### Training 

In [None]:
rfc = RandomForestClassifier(random_state=42)
mlflow.set_experiment("training_by_default_multiclass")
mlflow.start_run(run_name="rfc_multi_model",nested=True)
# Fit the RandomForest model on the training data
rfc.fit(X_train_final, y_train)

## K-Fold  Cross Validation

In [None]:
from sklearn.model_selection import  cross_validate

# Define K-Fold cross-validator
kfold_validation = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the classifier
rf_cross_scores = RandomForestClassifier(random_state=42)

# Perform k-fold cross-validation with multiple scoring metrics
rf_cross_scores = cross_validate(rf_cross_scores, X, y, cv=kfold_validation, scoring='accuracy', return_train_score=True)

# Print the scores for each fold and metric
print("Average Accuracy Score:", rf_cross_scores['test_score'].mean())
print(rf_cross_scores)

### Testing

In [None]:
# Make predictions on the test data
y_pred = rfc.predict(X_test_final)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred) # function calculates MCC by comparing the true labels (y_test) with the predicted labels (y_pred).

score.append(['Random Forest', accuracy, recall, precision, f1, mcc])

metrics = {"accuracy" : accuracy , "recall" :recall, "precision":precision,"f1":f1,"mcc":mcc}
mlflow.log_metrics(metrics)
mlflow.end_run()
# Print the results
print("Accuracy:", accuracy * 100)
print('Recall:', recall * 100)
print('Precision:', precision * 100)
print('F1 Score:', f1 * 100)
print('MCC:', mcc * 100)

### Confusion matrix

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the baseline Random Forest, set by the testing cell).
cm_rfc = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_rfc,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()

## Let's proceed with RandomizedSearchCV
### Hyperparameter Tuning

In [None]:
mlflow.set_tracking_uri("http://localhost:5001")  # replace with your tracking URI

# Search space: 2 candidate values for each of 4 hyperparameters (16 combinations),
# so n_iter=16 below covers the whole grid.
param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Randomized search with 3-fold CV, optimizing the support-weighted F1 score.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=16,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
mlflow.set_experiment("hyperparameter_tuning_multiclass")

# Close any run left active by an earlier cell (a no-op when none is active),
# as the other tuning cells do; start_run() raises if a run is already active.
mlflow.end_run()

# The run is started without a context manager, so it stays active until the
# evaluation cell logs the test metrics and calls mlflow.end_run().
mlflow.start_run(run_name="random_forest_tuned_multiclass")
# Fit the search on the training data
random_search.fit(X_train_final, y_train)

# Log the best parameters to the active run
model_name = "Random Forest"
best_params = random_search.best_params_
mlflow.log_params(best_params)

# Log the refitted best estimator as a model artifact
mlflow.sklearn.log_model(random_search.best_estimator_, "random_forest_model_mlflow")

# Persist the best parameters under this model's name in the JSON file
save_best_params_to_json(model_name, best_params)

### Evaluation Steps

In [None]:
# Score the best estimator found by the Random Forest search on the test split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the tuned row, then log to and close the run opened by the tuning cell.
score_hyper.append(['Random Forest (Tuned)', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage.
for _name, _value in zip(("Accuracy:", 'Recall:', 'Precision:', 'F1 Score:', 'MCC:'), metrics.values()):
    print(_name, _value * 100)

### The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) are important tools for evaluating the performance of a binary classifier. The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, providing insight into the trade-off between sensitivity (true positive rate) and specificity (false positive rate). The AUC provides a single scalar value which measures the overall ability of the model to discriminate between the positive and negative classes across all possible thresholds. Essentially, a higher AUC value indicates better model performance.

In [None]:
from sklearn.metrics import auc, roc_curve

# Micro-averaged one-vs-rest ROC for the tuned Random Forest.
# roc_curve() only accepts binary targets, so slicing column 1 of predict_proba
# and passing the multiclass y_test raises. Instead, y_test is one-hot encoded
# against the estimator's class order (the column order of predict_proba) and
# every (sample, class) pair is treated as one binary decision.
y_pred_prob_tuned = best_rf.predict_proba(X_test_final)  # one column per class
y_test_onehot = (np.asarray(y_test).reshape(-1, 1) == best_rf.classes_).astype(int)

# Compute the micro-averaged ROC curve
fpr_tuned, tpr_tuned, thresholds_tuned = roc_curve(y_test_onehot.ravel(), y_pred_prob_tuned.ravel())

# Area under that same curve
auc_tuned = auc(fpr_tuned, tpr_tuned)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_tuned, tpr_tuned, color='blue', label='Micro-average OvR ROC (Tuned) (AUC = {:.2f})'.format(auc_tuned))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal line for random guessing
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Tuned RandomForest')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Print the AUC
print("AUC Score (Tuned):", auc_tuned)

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the tuned Random Forest, set by the evaluation cell).
cm_rfc_tuned = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_rfc_tuned,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()


## 2. XGB

### Training

In [None]:
# Open the MLflow run for the baseline XGBoost model; it stays active so the
# testing cell can log metrics to it.
mlflow.set_experiment("training_by_default_multiclass")
mlflow.start_run(run_name="xgb_multi_model", nested=True)

# Baseline XGBoost classifier with default hyperparameters (seeded), fitted
# on the training data.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_final, y_train)

## K-fold Cross validation

In [None]:
# 5-fold cross-validation of a default XGBoost classifier, scored on accuracy.
# NOTE(review): runs on the full X / y rather than the training split -- confirm.
kfold_validation = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_cross = XGBClassifier(random_state=42)

# The result dict holds per-fold timings plus train and test accuracy.
xgb_cross_scores = cross_validate(
    estimator=xgb_cross,
    X=X,
    y=y,
    cv=kfold_validation,
    scoring='accuracy',
    return_train_score=True,
)

# Mean test accuracy across folds, followed by the raw per-fold results.
print("Average Accuracy Score:", xgb_cross_scores['test_score'].mean())
print(xgb_cross_scores)

### Testing

In [None]:
# Score the baseline XGBoost model on the held-out test split.
y_pred = xgb.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)

# Record the row for the summary table.
score.append(['XGB', accuracy, recall, precision, f1, mcc])

metrics = {"accuracy" : accuracy , "recall" :recall, "precision":precision,"f1":f1,"mcc":mcc}
mlflow.log_metrics(metrics)
# Close the run opened by the training cell, as every other testing cell does;
# it was previously left active after this cell.
mlflow.end_run()
# Print the results
print("Accuracy:", accuracy * 100)
print('Recall:', recall * 100)
print('Precision:', precision * 100)
print('F1 Score:', f1 * 100)
print('MCC:', mcc * 100)

### Confusion matrix

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the baseline XGBoost model, set by the testing cell).
cm_xgb = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_xgb,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()

### Hyperparameter tuning

In [None]:
# Point MLflow at the tracking server and the tuning experiment.
mlflow.set_tracking_uri("http://localhost:5001")  # Ensure this URI is correct
mlflow.set_experiment("hyperparameter_tuning_multiclass")

# Search space: two candidate values for each of six XGBoost hyperparameters.
param_dist_xgb = dict(
    n_estimators=[50, 100],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
)

# Randomized search with 3-fold CV, optimizing the support-weighted F1 score.
random_search_xgb = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_dist_xgb,
    n_iter=50,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Close any run still active, then open the run for this search. It is not a
# context manager: the run stays active until the evaluation cell ends it.
mlflow.end_run()
mlflow.start_run(run_name="xgb_tuned_multiclass")

random_search_xgb.fit(X_train_final, y_train)

# Report and log the winning parameters and the refitted best estimator.
best_params = random_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_params)
mlflow.log_params(best_params)
mlflow.sklearn.log_model(random_search_xgb.best_estimator_, "xgb_model_mlflow")

# Persist the best parameters under this model's name in the JSON file.
model_name = "XGB"
save_best_params_to_json(model_name, best_params)

### Evaluation Steps

In [None]:
# Score the best estimator found by the XGBoost search on the test split.
best_xgb = random_search_xgb.best_estimator_
y_pred = best_xgb.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the tuned row, then log to and close the run opened by the tuning cell.
score_hyper.append(['XGB (Tuned)', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage.
for _name, _value in zip(("Accuracy:", 'Recall:', 'Precision:', 'F1 Score:', 'MCC:'), metrics.values()):
    print(_name, _value * 100)

### The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) are important tools for evaluating the performance of a binary classifier. The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, providing insight into the trade-off between sensitivity (true positive rate) and specificity (false positive rate). The AUC provides a single scalar value which measures the overall ability of the model to discriminate between the positive and negative classes across all possible thresholds. Essentially, a higher AUC value indicates better model performance.

In [None]:
from sklearn.metrics import auc, roc_curve

# Micro-averaged one-vs-rest ROC for the tuned XGBoost model.
# roc_curve() only accepts binary targets, so slicing column 1 of predict_proba
# and passing the multiclass y_test raises. Instead, y_test is one-hot encoded
# against the estimator's class order (the column order of predict_proba) and
# every (sample, class) pair is treated as one binary decision.
y_pred_prob_xgb = best_xgb.predict_proba(X_test_final)  # one column per class
y_test_onehot = (np.asarray(y_test).reshape(-1, 1) == best_xgb.classes_).astype(int)

fpr_xgb, tpr_xgb, _ = roc_curve(y_test_onehot.ravel(), y_pred_prob_xgb.ravel())
auc_xgb = auc(fpr_xgb, tpr_xgb)  # area under the plotted micro-average curve

plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb, tpr_xgb, color='green', label='Micro-average OvR ROC (XGBoost) (AUC = {:.2f})'.format(auc_xgb))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # random-guess diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Tuned XGBoost')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the tuned XGBoost model, set by the evaluation cell).
cm_xgb_tuned = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_xgb_tuned,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()

## 3. Decision Tree

### Training

In [None]:
# Open the MLflow run for the baseline Decision Tree; it stays active so the
# testing cell can log metrics to it and close it.
mlflow.set_experiment("training_by_default_multiclass")
mlflow.start_run(run_name="decision_tree", nested=True)

# Baseline Decision Tree with default hyperparameters (seeded), fitted on the
# training data.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_final, y_train)

## K-Fold Cross Validation

In [None]:
# 5-fold cross-validation of a default Decision Tree, scored on accuracy.
# NOTE(review): runs on the full X / y rather than the training split -- confirm.
kfold_validation = KFold(n_splits=5, shuffle=True, random_state=42)
DT_cross = DecisionTreeClassifier(random_state=42)

# The result dict holds per-fold timings plus train and test accuracy.
DT_cross_scores = cross_validate(
    estimator=DT_cross,
    X=X,
    y=y,
    cv=kfold_validation,
    scoring='accuracy',
    return_train_score=True,
)

# Mean test accuracy across folds, followed by the raw per-fold results.
print("Average Accuracy Score:", DT_cross_scores['test_score'].mean())
print(DT_cross_scores)

### Testing

In [None]:
# Score the baseline Decision Tree on the held-out test split.
y_pred_dt = dt.predict(X_test_final)
# Keep the shared `y_pred` name in sync. The metrics below are computed from
# y_pred, which would otherwise still hold predictions from an earlier model
# section, so the Decision Tree row would report another model's scores.
y_pred = y_pred_dt

# Per-class metrics use support-weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)

# Record the row for the summary table.
score.append(['Decision Tree ', accuracy, recall, precision, f1, mcc])

# Log to and close the run opened by the training cell.
metrics = {"accuracy" : accuracy , "recall" :recall, "precision":precision,"f1":f1,"mcc":mcc}
mlflow.log_metrics(metrics)
mlflow.end_run()
# Print the results
print("Accuracy:", accuracy * 100)
print('Recall:', recall * 100)
print('Precision:', precision * 100)
print('F1 Score:', f1 * 100)
print('MCC:', mcc * 100)

### Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline Decision Tree.
# Use the tree's own predictions (y_pred_dt); the shared `y_pred` name may
# still hold the output of a different model.
cm_dt = confusion_matrix(y_test, y_pred_dt)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(cm_dt, annot=True, cmap='Blues', fmt='g',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()


### Hyperparameter Tuning

In [None]:
# Search space for the Decision Tree hyperparameters.
param_dist_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
    splitter=['best', 'random'],
)

# Randomized search (50 sampled candidates) with 3-fold CV, optimizing the
# support-weighted F1 score.
random_search_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_dist_dt,
    n_iter=50,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Point MLflow at the tracking server and the tuning experiment.
mlflow.set_tracking_uri("http://localhost:5001")  # Ensure this URI is correct
mlflow.set_experiment("hyperparameter_tuning_multiclass")

# Close any run still active, then open the run for this search. It is not a
# context manager: the run stays active until the evaluation cell ends it.
mlflow.end_run()
mlflow.start_run(run_name="decision_tree_tuned_multiclass")

random_search_dt.fit(X_train_final, y_train)

# Report and log the winning parameters and the refitted best estimator.
best_params = random_search_dt.best_params_
print("Best Hyperparameters for Decision Tree:", best_params)
mlflow.log_params(best_params)
mlflow.sklearn.log_model(random_search_dt.best_estimator_, "Decision_Tree_model_mlflow")

# Persist the best parameters under this model's name in the JSON file.
model_name = "Decision Tree"
save_best_params_to_json(model_name, best_params)


### Evaluation Steps

In [None]:
# Score the best estimator found by the Decision Tree search on the test split.
best_dt = random_search_dt.best_estimator_
y_pred = best_dt.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the tuned row, then log to and close the run opened by the tuning cell.
score_hyper.append(['Decision Tree (Tuned)', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage.
for _name, _value in zip(("Accuracy:", 'Recall:', 'Precision:', 'F1 Score:', 'MCC:'), metrics.values()):
    print(_name, _value * 100)

### The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) are important tools for evaluating the performance of a binary classifier. The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, providing insight into the trade-off between sensitivity (true positive rate) and specificity (false positive rate). The AUC provides a single scalar value which measures the overall ability of the model to discriminate between the positive and negative classes across all possible thresholds. Essentially, a higher AUC value indicates better model performance.

In [None]:
from sklearn.metrics import auc, roc_curve

# Micro-averaged one-vs-rest ROC for the tuned Decision Tree.
# roc_curve() only accepts binary targets, so slicing column 1 of predict_proba
# and passing the multiclass y_test raises. Instead, y_test is one-hot encoded
# against the estimator's class order (the column order of predict_proba) and
# every (sample, class) pair is treated as one binary decision.
y_pred_prob_dt = best_dt.predict_proba(X_test_final)  # one column per class
y_test_onehot = (np.asarray(y_test).reshape(-1, 1) == best_dt.classes_).astype(int)

fpr_dt, tpr_dt, _ = roc_curve(y_test_onehot.ravel(), y_pred_prob_dt.ravel())
auc_dt = auc(fpr_dt, tpr_dt)  # area under the plotted micro-average curve

plt.figure(figsize=(8, 6))
plt.plot(fpr_dt, tpr_dt, color='purple', label='Micro-average OvR ROC (Decision Tree) (AUC = {:.2f})'.format(auc_dt))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # random-guess diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Tuned Decision Tree')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the tuned Decision Tree, set by the evaluation cell).
cm_dt_tuned = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_dt_tuned,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()


### 4. Logistic Regression: While simpler, logistic regression can still provide solid results, especially when regularized using techniques like L2 (Ridge) or L1 (Lasso).

### Training

In [None]:
# Open the MLflow run for the baseline Logistic Regression; it stays active so
# the testing cell can log metrics to it and close it.
mlflow.set_experiment("training_by_default_multiclass")
mlflow.start_run(run_name="logistic_regression", nested=True)

# Baseline Logistic Regression with default hyperparameters (seeded), fitted
# on the training data.
lr = LogisticRegression(random_state=42)
lr.fit(X_train_final, y_train)

## k-fold cross validation

In [None]:
# 5-fold cross-validation of a default Logistic Regression, scored on accuracy.
# NOTE(review): runs on the full X / y rather than the training split -- confirm.
kfold_validation = KFold(n_splits=5, shuffle=True, random_state=42)
lr_cross = LogisticRegression(random_state=42)

# The result dict holds per-fold timings plus train and test accuracy.
lr_cross_scores = cross_validate(
    estimator=lr_cross,
    X=X,
    y=y,
    cv=kfold_validation,
    scoring='accuracy',
    return_train_score=True,
)

# Mean test accuracy across folds, followed by the raw per-fold results.
print("Average Accuracy Score:", lr_cross_scores['test_score'].mean())
print(lr_cross_scores)

### Testing

In [None]:
# Score the baseline Logistic Regression on the held-out test split.
y_pred = lr.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the row for the summary table, then log to and close the MLflow run.
score.append(['Logistic Regression', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage.
for _name, _value in zip(("Accuracy:", 'Recall:', 'Precision:', 'F1 Score:', 'MCC:'), metrics.values()):
    print(_name, _value * 100)

### Confusion matrix

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the baseline Logistic Regression, set by the testing cell).
cm_lr = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_lr,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()


### Hyperparameter Tuning

In [None]:
# Deliberately small grid for Logistic Regression: only C has more than one
# candidate value, so the search compares two models.
param_grid = dict(
    penalty=['l2'],          # single penalty to keep the grid small
    C=[0.1, 1.0],            # the only hyperparameter that varies
    solver=['liblinear'],
    max_iter=[1000],
    tol=[1e-4],
)

# Exhaustive grid search with 3-fold CV, optimizing the support-weighted F1 score.
log_model = LogisticRegression(random_state=42)
clf = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    verbose=True,
)

# Point MLflow at the tracking server and the tuning experiment.
mlflow.set_tracking_uri("http://localhost:5001")  # Ensure this URI is correct
mlflow.set_experiment("hyperparameter_tuning_multiclass")

# Close any run still active, then open the run for this search. It is not a
# context manager: the run stays active until the evaluation cell ends it.
mlflow.end_run()
mlflow.start_run(run_name="logistic_regression_tuned_multiclass")

clf.fit(X_train_final, y_train)

# Report and log the winning parameters and the refitted best estimator.
best_params = clf.best_params_
print("Best parameters found: ", best_params)
print("Best cross-validated score: ", clf.best_score_)
mlflow.log_params(best_params)
mlflow.sklearn.log_model(clf.best_estimator_, "logistic_regression_model_mlflow")

# Persist the best parameters under this model's name in the JSON file.
model_name = "Logistic Regression"
save_best_params_to_json(model_name, best_params)


### Evaluation Steps

In [None]:
# Best Logistic Regression found by the grid search.
best_lr = clf.best_estimator_

# Training-set accuracy, printed for comparison with the test metrics below.
train_score = best_lr.score(X_train_final, y_train)
print(f'Accuracy on training set: {train_score:.3f}')

# Score the tuned model on the held-out test split.
y_pred = best_lr.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the tuned row, then log to and close the run opened by the tuning cell.
score_hyper.append(['Logistic Regression (Tuned)', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage with two decimals.
for _name, _value in zip(("Accuracy", "Recall", "Precision", "F1 Score", "MCC"), metrics.values()):
    print(f"{_name}: {_value * 100:.2f}%")

### The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) are important tools for evaluating the performance of a binary classifier. The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, providing insight into the trade-off between sensitivity (true positive rate) and specificity (false positive rate). The AUC provides a single scalar value which measures the overall ability of the model to discriminate between the positive and negative classes across all possible thresholds. Essentially, a higher AUC value indicates better model performance.

In [None]:
from sklearn.metrics import auc, roc_curve

# Micro-averaged one-vs-rest ROC for the tuned Logistic Regression.
# roc_curve() only accepts binary targets, so slicing column 1 of predict_proba
# and passing the multiclass y_test raises. Instead, y_test is one-hot encoded
# against the estimator's class order (the column order of predict_proba) and
# every (sample, class) pair is treated as one binary decision.
y_pred_prob_lr = best_lr.predict_proba(X_test_final)  # one column per class
y_test_onehot = (np.asarray(y_test).reshape(-1, 1) == best_lr.classes_).astype(int)

fpr_lr, tpr_lr, _ = roc_curve(y_test_onehot.ravel(), y_pred_prob_lr.ravel())
auc_lr = auc(fpr_lr, tpr_lr)  # area under the plotted micro-average curve

plt.figure(figsize=(8, 6))
plt.plot(fpr_lr, tpr_lr, color='orange', label='Micro-average OvR ROC (Logistic Regression) (AUC = {:.2f})'.format(auc_lr))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # random-guess diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Tuned Logistic Regression')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the tuned Logistic Regression, set by the evaluation cell).
cm_lr_tuned = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_lr_tuned,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()


## 5. Naive Bayes: This model is quick and easy to implement but may be limited by its assumptions of feature independence.
### Training

In [None]:
# Open the MLflow run for the baseline Naive Bayes model; it stays active so
# the testing cell can log metrics to it and close it.
mlflow.set_experiment("training_by_default_multiclass")
mlflow.start_run(run_name="naive_multi_bayes", nested=True)

# Baseline Gaussian Naive Bayes with default settings, fitted on the training data.
nb = GaussianNB()
nb.fit(X_train_final, y_train)

### k-fold cross validation

In [None]:
# 5-fold cross-validation of a default Gaussian Naive Bayes, scored on accuracy.
# NOTE(review): runs on the full X / y rather than the training split -- confirm.
kfold_validation = KFold(n_splits=5, shuffle=True, random_state=42)
nb_cross = GaussianNB()

# The result dict holds per-fold timings plus train and test accuracy.
nb_cross_scores = cross_validate(
    estimator=nb_cross,
    X=X,
    y=y,
    cv=kfold_validation,
    scoring='accuracy',
    return_train_score=True,
)

# Mean test accuracy across folds, followed by the raw per-fold results.
print("Average Accuracy Score:", nb_cross_scores['test_score'].mean())
print(nb_cross_scores)

### Testing

In [None]:
# Score the baseline Naive Bayes model on the held-out test split.
y_pred = nb.predict(X_test_final)

# Per-class metrics use support-weighted averaging.
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred, average='weighted'),
    "precision": precision_score(y_test, y_pred, average='weighted'),
    "f1": f1_score(y_test, y_pred, average='weighted'),
    "mcc": matthews_corrcoef(y_test, y_pred),
}
accuracy, recall, precision, f1, mcc = metrics.values()

# Record the row for the summary table, then log to and close the MLflow run.
score.append(['Naive Bayes', *metrics.values()])
mlflow.log_metrics(metrics)
mlflow.end_run()

# Report every metric as a percentage.
for _name, _value in zip(("Accuracy:", 'Recall:', 'Precision:', 'F1 Score:', 'MCC:'), metrics.values()):
    print(_name, _value * 100)

### Confusion Matrix

In [None]:
# Confusion matrix for the predictions currently held in y_pred
# (the baseline Naive Bayes model, set by the testing cell).
cm_naive_bayes = confusion_matrix(y_test, y_pred)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=cm_naive_bayes,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
plt.gca().set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
plt.show()

### Hyperparameter Tuning

In [None]:
import mlflow
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV

# Search space for Gaussian Naive Bayes: var_smoothing is the only tuned
# hyperparameter, with seven candidate values.
param_dist_nb = {
    'var_smoothing': [1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06]

}

# Randomized search with 3-fold CV, optimizing the support-weighted F1 score.
# n_iter=7 equals the number of candidates, so every value is evaluated.
random_search_nb = RandomizedSearchCV(
    estimator=GaussianNB(),
    param_distributions=param_dist_nb,
    n_iter=7,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Set the MLflow tracking URI and experiment.
# The experiment and run names now carry the "_multiclass" suffix used by every
# other tuning cell in this notebook, so all tuned runs land in one experiment.
mlflow.set_tracking_uri("http://localhost:5001")  # Ensure this URI is correct
mlflow.set_experiment("hyperparameter_tuning_multiclass")

# Ensure any existing runs are closed
mlflow.end_run()

# Start the run for this search. It is not a context manager: the run stays
# active until the evaluation cell logs its metrics and calls mlflow.end_run().
mlflow.start_run(run_name="naive_bayes_tuned_multiclass")
# Fit RandomizedSearchCV to the data
random_search_nb.fit(X_train_final, y_train)

# Retrieve and log the best parameters
best_params = random_search_nb.best_params_
print("Best Hyperparameters for Gaussian Naive Bayes:", best_params)
mlflow.log_params(best_params)

# Log the refitted best estimator as a model artifact
mlflow.sklearn.log_model(random_search_nb.best_estimator_, "Naive_Bayes_model_mlflow")

# Persist the best parameters under this model's name in the JSON file
model_name = "Naive Bayes"
save_best_params_to_json(model_name, best_params)

## Evaluation steps

In [None]:
# Evaluate the best Naive Bayes model on the test set
best_nb = random_search_nb.best_estimator_
y_pred_nb = best_nb.predict(X_test_final)
# Keep the shared `y_pred` name in sync. The metrics below are computed from
# y_pred, which would otherwise still hold the untuned model's predictions, so
# the "tuned" row would repeat the baseline scores.
y_pred = y_pred_nb

# Per-class metrics use support-weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)

# Append to the score_hyper list
score_hyper.append(['Naive bayes (Tuned)', accuracy, recall, precision, f1, mcc])
# Log to and close the run opened by the tuning cell.
metrics = {"accuracy" : accuracy , "recall" :recall, "precision":precision,"f1":f1,"mcc":mcc}
mlflow.log_metrics(metrics)
mlflow.end_run()
# Print the results
print("Accuracy:", accuracy * 100)
print('Recall:', recall * 100)
print('Precision:', precision * 100)
print('F1 Score:', f1 * 100)
print('MCC:', mcc * 100)

### The Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) are important tools for evaluating the performance of a binary classifier. The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, providing insight into the trade-off between sensitivity (true positive rate) and specificity (false positive rate). The AUC provides a single scalar value which measures the overall ability of the model to discriminate between the positive and negative classes across all possible thresholds. Essentially, a higher AUC value indicates better model performance.

In [None]:
from sklearn.metrics import auc, roc_curve

# Micro-averaged one-vs-rest ROC for the tuned Naive Bayes model.
# roc_curve() only accepts binary targets, so slicing column 1 of predict_proba
# and passing the multiclass y_test raises. Instead, y_test is one-hot encoded
# against the estimator's class order (the column order of predict_proba) and
# every (sample, class) pair is treated as one binary decision.
y_pred_prob_nb = best_nb.predict_proba(X_test_final)  # one column per class
y_test_onehot = (np.asarray(y_test).reshape(-1, 1) == best_nb.classes_).astype(int)

fpr_nb, tpr_nb, _ = roc_curve(y_test_onehot.ravel(), y_pred_prob_nb.ravel())
auc_nb = auc(fpr_nb, tpr_nb)  # area under the plotted micro-average curve

plt.figure(figsize=(8, 6))
plt.plot(fpr_nb, tpr_nb, color='cyan', label='Micro-average OvR ROC (Naive Bayes) (AUC = {:.2f})'.format(auc_nb))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # random-guess diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Tuned Naive Bayes')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tuned Naive Bayes model.
# Use the tuned model's own predictions (y_pred_nb); the shared `y_pred` name
# may still hold the output of a different model.
# The variable keeps its original spelling because a later cell refers to it.
cm_nb_tunned = confusion_matrix(y_test, y_pred_nb)

# Tick labels for the nine traffic classes.
# NOTE(review): assumes this order matches the sorted encoded class ids -- confirm.
labels = ['BENIGN', 'DoS', 'PortScan', 'DDoS', 'Brute Force', 'Web Attack', 'Bot', 'Infiltration', 'Heartbleed']

# Annotated heatmap: rows are true classes, columns are predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(cm_nb_tunned, annot=True, cmap='Blues', fmt='g',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

## DataFrames for Model Metrics:

### After collecting the results in `score` and `score_hyper`, I created two new DataFrames to display the metrics (accuracy, recall, etc.), as shown below.

In [None]:
# Column headers shared by both result tables; each row is
# [model name, accuracy, recall, precision, F1, MCC].
columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'F1 Score', 'MCC']

# df_normal <- rows collected in `score` (normal training),
# df_hyper  <- rows collected in `score_hyper` (hyperparameter tuning).
df_normal, df_hyper = (
    pd.DataFrame(rows, columns=columns) for rows in (score, score_hyper)
)

In [None]:
# Show the normal-training results table (built from `score`); as the last
# expression in the cell, the notebook renders the DataFrame.
print("Normal Training Results:")
df_normal

In [None]:
# Show the hyperparameter-tuning results table (built from `score_hyper`).
# This cell previously repeated the normal-training output, so df_hyper was
# created but never displayed.
print("Hyperparameter Tuning Results:")
df_hyper

## Plot and Save Multiple Confusion Matrices

In [None]:
# Each entry pairs a tuned model's confusion matrix with the title of its panel.
confusion_matrices = [
    (cm_rfc_tuned, "Random Forest"),
    (cm_xgb_tuned, "XGBoost"),
    (cm_lr_tuned, "Logistic Regression"),
    (cm_dt_tuned, "Decision Tree"),
    (cm_nb_tunned, "Naive Bayes")
]
num_matrices = len(confusion_matrices)

# One row with a 5x5-inch panel per matrix. squeeze=False always returns a 2-D
# array of axes, so flattening it gives an iterable even for a single panel.
fig, axes = plt.subplots(nrows=1, ncols=num_matrices, figsize=(5 * num_matrices, 5), squeeze=False)
axes = axes.ravel()

# Draw every matrix as an annotated heatmap with integer cell counts.
for ax, (cm, title) in zip(axes, confusion_matrices):
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt="d",
        cmap='Blues',
        xticklabels=['BENIGN', 'Attacks'],
        yticklabels=['BENIGN', 'Attacks'],
    )
    ax.set(title=title, xlabel='Predicted Labels', ylabel='True Labels')

# Keep the panels from overlapping, then write the figure to disk and show it.
fig.tight_layout()
fig.savefig('all_confusion_matrices_multiclass.png')
plt.show()

## K-fold cross-validation

In [None]:
# Build a table of per-fold cross-validation test scores for every model,
# add the mean and standard deviation across folds, and render it as a
# matplotlib table saved to 'model_performance_table_multiclass.png'.
models = ['Naive Bayes', 'Logistic Regression', 'Decision Tree', 'XGBoost', 'Random Forest']
# cross_validate results, listed in the same order as `models`.
cv_scores = [nb_cross_scores, lr_cross_scores, DT_cross_scores, xgb_cross_scores, rf_cross_scores]

# Derive the fold count from the results instead of hard-coding five folds;
# min() keeps the table rectangular should the runs differ in fold count.
n_folds = min(len(scores['test_score']) for scores in cv_scores)
fold_columns = [f'Fold-{fold + 1}' for fold in range(n_folds)]

results = {'Model': models}
for fold, column in enumerate(fold_columns):
    results[column] = [scores['test_score'][fold] for scores in cv_scores]

df_results = pd.DataFrame(results)

# Row-wise statistics over the fold columns only (numeric, not strings).
df_results['Mean Accuracy'] = df_results[fold_columns].mean(axis=1)
df_results['Standard Deviation'] = df_results[fold_columns].std(axis=1)

# Formatted copy for display only; df_results keeps its numeric values.
# Each column is replaced as a whole via Series.map: writing strings into
# float columns through `.iloc[...] =` is deprecated in recent pandas, and
# DataFrame.map is not available in older pandas releases.
formatted_df = df_results.copy()
for column in formatted_df.columns[1:]:
    formatted_df[column] = formatted_df[column].map('{:.4f}'.format)

# Plotting: an empty, axis-less canvas that only hosts the table
fig, ax = plt.subplots(figsize=(12, 6))
ax.axis('tight')
ax.axis('off')

# Column widths: model name, one per fold plus the mean, then the wider
# standard-deviation column (identical to the original widths for five folds).
col_widths = [0.15] + [0.1] * (n_folds + 1) + [0.13]

# Creating the table
table = ax.table(cellText=formatted_df.values, colLabels=formatted_df.columns, cellLoc='center', loc='center', colWidths=col_widths)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

# Add a title
plt.title('Cross-Validation Results for Various Models')

# Save the figure
plt.savefig('model_performance_table_multiclass.png')

# Display the plot
plt.show()

## Save All ROC Curves

In [None]:
plt.figure(figsize=(12, 8))

# One entry per tuned model: (FPR values, TPR values, AUC, line colour, legend template).
roc_series = [
    (fpr_tuned, tpr_tuned, auc_tuned, 'blue', 'Random Forest (AUC = {:.2f})'),
    (fpr_xgb, tpr_xgb, auc_xgb, 'green', 'XGBoost (AUC = {:.2f})'),
    (fpr_dt, tpr_dt, auc_dt, 'purple', 'Decision Tree (AUC = {:.2f})'),
    (fpr_lr, tpr_lr, auc_lr, 'orange', 'Logistic Regression (AUC = {:.2f})'),
    (fpr_nb, tpr_nb, auc_nb, 'cyan', 'Naive Bayes (AUC = {:.2f})'),
]
for roc_fpr, roc_tpr, roc_auc_value, roc_colour, roc_label in roc_series:
    plt.plot(roc_fpr, roc_tpr, color=roc_colour, label=roc_label.format(roc_auc_value))

# Dashed black diagonal: the random-chance reference line.
plt.plot([0, 1], [0, 1], 'k--')

roc_axes = plt.gca()
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('Comparison of ROC Curves for All Tuned Models')
roc_axes.legend(loc='lower right')
roc_axes.grid(True)

# Write the combined figure to disk before showing it.
plt.savefig('all_roc_curves.png')
plt.show()