# Jupyter Notebook 6: AWS Notebook for ML Baseline Models - This has moved to AWS. 

---

# Introduction

This Notebook is used to get a baseline model for use in the real time section of the project.

4 models will be run, first with a baseline model and then with a Search for the best tuning parameters.
   - 1) Random Forest Model
   - 2) K-Means Model
   - 3) Isolate Forest Model
   - 4) One Class SVM Model
   
Previous research has shown very high accuracy results from models so this will be expected from these models. The previous studies did not discuss the Data Preprocessing steps in enough detail to explore why this is the case.

The data is highly imbalanced which is accepted for this study as this is a real world representation of network traffic in the real world. 

The Results will be stored to be the local SQL Database for comparison later on the Dashboard.

---

# Load data and pretraining steps

In [1]:
# General
import os
import time

# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualisation
import matplotlib.pyplot as plt

# Data Types
import json
import mysql.connector
from joblib import dump, load

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Getting metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, average_precision_score,  roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of preprocessed file
folder_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\Notebooks"
file_path = os.path.join(folder_path, "ids_2018_baseline_data.parquet")

# load data
data = pd.read_parquet(file_path, engine="pyarrow")

---

# Train and Test Split 80:20 Split

Create a Train and Test split with 80:20 to create baseline models.

As the minority class has a large imbalanced, its important to keep the percentage right in each train and val split.

In [4]:
# Seperate the x and y features
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [5]:
# Create the splits

# Train and val_test split
train_x, val_test_x, train_y, val_test_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Validation and Test split
val_x, test_x, val_y, test_y = train_test_split(val_test_x, val_test_y, test_size=0.5, random_state=42, stratify=val_test_y)

In [None]:
# # Get a sample an save to file.
# sample_data_10_perc = train_x.sample(frac=0.01, random_state=42)
# sample_data_10_perc.shape
# file_path = r'C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\sample_file.pkl'
# sample_data_10_perc.to_pickle(file_path)

---

# Open connection to Database

In [6]:
config = {
    "user":"root",
    "password":"root",
    "host": "localhost",
    "database":"mtu_capstone_db",
    "raise_on_warnings":True
}

In [7]:
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()

In [8]:
# Test the connection

try:
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    
    # Test Query - Different types
    # cursor.execute("SELECT VERSION();")
    # cursor.execute("SELECT * FROM baseline_model_results;") # This query exceeds the limit.
    # cursor.execute("SELECT * FROM baseline_model_results LIMIT 1;")  # Limit to 10 rows
    cursor.execute("SELECT COUNT(*) FROM baseline_model_results;")
    
    #Fetch the results - Differnt for above.
    # version = cursor.fetchone()
    # select_all = cursor.fetchone()
    # select_all = cursor.fetchall()  # Use fetchall() or fetchmany(size)
    number_of_rows = cursor.fetchone()[0]

    # print("My SQL Server version:", version)
    # print("Select all:", select_all)
    print("Number of rows in 'baseline_model_results':", number_of_rows)
    
except mysql.connector.Error as err:
    print("Error:", err)
    
else:
    cursor.close()
    cnx.close()

Number of rows in 'baseline_model_results': 17


## Insert data to database with a function

Create a function to send the data to the database. This will be used in this notebook for the baseline models and then later modified to be used with the live streaming models.

In [9]:
# create model results funciton.
def insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time,  
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot):
    
    try: 
        # Connect to the database
        cnx = mysql.connector.connect(**config)
        cursor = cnx.cursor()
    
        # SQL query for iserting data to baseline_model_results
        add_results=("""
            INSERT INTO baseline_model_results
            (model_name, model_parameters, confusion_matrix, 
            accuracy_value, precision_value, recall_value, f1_value, 
            ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
            silhouette_score, cluster_centers, elbow_method_results, 
            cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
            decision_function_values, support_vectors, one_class_svm_plot) 
            VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """)
    
        # Data tuple
        data= (model_name, model_parameters, confusion_matrix,
            accuracy_value, precision_value, recall_value, f1_value, 
            ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
            silhouette_score, cluster_centers, elbow_method_results, 
            cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
            decision_function_values, support_vectors, one_class_svm_plot)
    
        # Execute the query
        cursor.execute(add_results, data)
    
        # Commit the transaction
        cnx.commit()
        print("The model results have been inserted successfully")
    
    #except Error as err:
    #    print(f"Error: {err}")
    except mysql.connector.Error as err:
        print(f"Error: {err}")
    
    finally: 
        if cnx.is_connected():
            cursor.close()
            cnx.close()
            print("The Conneciton to MySQL is closed.")

# **Random Forest Basic Model**

## Basic Model

For this model we as it is a basic model only uses a train and test dataset. 

In [None]:
# Initialise the random forest model
rf_model_basic = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# # Fit the model on the training data
# rf_model_basic.fit(train_x, train_y)

In [None]:
# Test the model against the test 
test_predications = rf_model_basic.predict(val_test_x)

## Explore the results

### Name of the model

In [None]:
model_name = "rf_model_baseline_basic"

In [None]:
model_name

### Model Parameters

In [None]:
model_parameters = "n_estimators=100"

### Confusion Matrix

In [None]:
confusion_matrix = confusion_matrix(val_test_y, test_predications)

In [None]:
confusion_matrix = confusion_matrix.tolist()

In [None]:
confusion_matrix = confusion_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

### Accuracy, Precision, Recall, F1 Score

In [None]:
accuracy_value = float(accuracy_score(val_test_y, test_predications))
precision_value = float(precision_score(val_test_y, test_predications))
recall_value = float(recall_score(val_test_y, test_predications))
f1_value = float(f1_score(val_test_y, test_predications))

In [None]:
accuracy_value, precision_value, recall_value, f1_value

## AUC Score and Roc Curve

In [None]:
fpr, tpr, thresholds = roc_curve(val_test_y, rf_model_basic.predict_proba(val_test_x)[:,1])
auc_score = float(auc(fpr, tpr))

In [None]:
# Area under the curve score.
auc_score

In [None]:
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": auc_score
}


ruc_curve = json.dumps(roc_data)

### Precision-Recall Curve

In [None]:
precision, recall, _ = precision_recall_curve(val_test_y, rf_model_basic.predict_proba(val_test_x)[:,1])

In [None]:
# Plot the Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="upper right")
plt.show()

In [None]:
# Send the Precision Recall Curve to the DB as a JSON object.

pr_data = {
    "precision": precision.tolist(), 
    "recall": recall.tolist()
}


precision_recall_curve = json.dumps(pr_data)

### Feature Importance

In [None]:
feature_importances = rf_model_basic.feature_importances_

In [None]:
feature_importances

In [None]:
type(feature_importances)

In [None]:
# Send the feature_importances to the DB as a JSON object.


In [None]:
print(feature_importance)

### Model Training Time

In [None]:
# get the start time
start_time = time.time()

# run the model
# Fit the model on the training data
rf_model_basic.fit(train_x, train_y)

# record the end time.
end_time = time.time()

In [None]:
training_time = end_time - start_time

In [None]:
training_time

In [None]:
model_training_time = training_time
model_training_time

In [None]:
print("Training time: ", model_training_time, "seconds")

In [None]:
print("Training time: ", model_training_time/60, "minutes")

In [None]:
print("Training time: ", model_training_time/60, "hours")

### Sent Results to DB

In [None]:
# Kemans
silhouette_score = None
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

# One Class SVM
decision_function_values = None
support_vectors = None
one_class_svm_plot = None

In [None]:
# Need to check the object types so that the DB can handle them

#DONE. Objects ok now.

# print(type(model_name))
# print(type(model_parameters))
# print(type(confusion_matrix))
# print(type(accuracy_value))
# print(type(precision_value))
# print(type(recall_value))
# print(type(f1_value))
# print(type(ruc_curve))
# print(type(auc_score))
# print(type(precision_recall_curve))
# print(type(feature_importance))
# print(type(model_training_time))
# print(type(silhouette_score)) 
# print(type(cluster_centers))
# print(type(elbow_method_results))
# print(type(cluster_visualisation_plot)) 
# print(type(anomaly_score))
# print(type(anomaly_detection_plot))
# print(type(decision_function_values)) 
# print(type(support_vectors))
# print(type(one_class_svm_plot))

In [None]:
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\rf_model_basic.joblib"

# save model
dump(rf_model_basic, model_path)


In [None]:
# load this model
folder_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models"
model_name = "rf_model_basic.joblib"
model = os.path.join(folder_path, model_name)

In [None]:
loaded_model = load(model)

In [None]:

# Check the number of features
num_features = model.n_features_

---

#  **Random Forest Best Model Search**

The Baseline Random Forest Model has done very well with an accuracy of 99% so for now I will not look at improving this model. Time being short. 

## GridSearch of RF Models

In [None]:
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

---

# **K-Means Basic Model**

## Basic Model with 2 clusters

In [None]:
# Intialise Kmeans model
kmeans_model_baseline_basic = KMeans(n_clusters=2, random_state=42)

In [None]:
# This runs the model and gets the training time as well.

# start timer
start_time = time.time()

# train the model
kmeans_model_baseline_basic.fit(train_x)

# end timer
end_time = time.time()

In [None]:
# Predictions on the test data
predictions = kmeans_model_baseline_basic.predict(test_x)

In [None]:
predictions

## Explore the Results

In [None]:
# Evaluate the model using Silhouette Score
silhouette_avg = silhouette_score(test_x, predictions)

In [None]:
silhouette_avg

The silhouette score is a metric to calculate the goodness of a clustering technique. Its range is fom -1 to 1. A value close to 1  indicates that the data points are very far away from  neighbouring clusters. This means the clusters are distinct and well seperated, which usually represents a good clustering.

0.42 indicates that there is a good struture of clusters seperated from each other.

In [None]:
# Visualizations specific to K-Means
plt.scatter(test_x.iloc[:, 0], test_x.iloc[:, 1], c=predictions, s=50, cmap='viridis')
centers = kmeans_model_baseline_basic.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.title('K-Means Clustering')
plt.show()

In [None]:
# convert to a json object for the database

# Extracting data points
data_points_x = test_x.iloc[:, 0].tolist()  # First feature values
data_points_y = test_x.iloc[:, 1].tolist()  # Second feature values

# Extracting cluster centers
centers_x = centers[:, 0].tolist()  # X-coordinates of cluster centers
centers_y = centers[:, 1].tolist()  # Y-coordinates of cluster centers

# Organize data into a structured dictionary
visualization_data = {
    "data_points": {
        "x": data_points_x,
        "y": data_points_y,
        "cluster": predictions.tolist()  # Cluster assignments
    },
    "cluster_centers": {
        "x": centers_x,
        "y": centers_y
    }
}

# Convert to JSON
cluster_visualisation = json.dumps(visualization_data)

In [None]:
# model_training_time
model_training_time = end_time - start_time

In [None]:
model_training_time 

In [None]:
# Silhouette time
silhouette_score_db = float(silhouette_avg) 
silhouette_score_db

In [None]:
# cluster centers
cluster_centers = kmeans_model_baseline_basic .cluster_centers_.tolist() 
cluster_centers = json.dumps(cluster_centers) 
cluster_centers

In [None]:
# elbow method resutls

wcss = []
for i in range(1, 11):  # Example: Test for 1 to 10 clusters
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(train_x)
    wcss.append(km.inertia_)
elbow_method_results = json.dumps(wcss) 

In [None]:
elbow_method_results

## Sent Results to DB

In [None]:
# Need to check the object types so that the DB can handle them

#DONE. Objects ok now.

# print(type(model_name))
# print(type(model_parameters))
# print(type(confusion_matrix))
# print(type(accuracy_value))
# print(type(precision_value))
# print(type(recall_value))
# print(type(f1_value))
# print(type(ruc_curve))
# print(type(auc_score))
# print(type(precision_recall_curve))
# print(type(feature_importance))
# print(type(model_training_time))
# print(type(silhouette_score)) 
# print(type(cluster_centers))
# print(type(elbow_method_results))
# print(type(cluster_visualisation_plot)) 
# print(type(anomaly_score))
# print(type(anomaly_detection_plot))
# print(type(decision_function_values)) 
# print(type(support_vectors))
# print(type(one_class_svm_plot))

In [None]:
model_parameters_update = json.dumps(kmeans_model_baseline_basic.get_params())

In [None]:
model_name = "kmeans_model_baseline_basic"
model_parameters = model_parameters_update
confusion_matrix = None  # Not applicable for K-Means, same below.
accuracy_value = None 
precision_value = None
recall_value = None
f1_value = None
ruc_curve = None
auc_score = None
precision_recall_curve = None
feature_importance = None
model_training_time = model_training_time

silhouette_score = silhouette_score_db
cluster_centers = cluster_centers
elbow_method_results = elbow_method_results
cluster_visualisation_plot = cluster_visualisation 

anomaly_score = None
anomaly_detection_plot = None

decision_function_values = None
support_vectors = None
one_class_svm_plot = None

In [None]:
# # test the conneciton
# try:
#     cnx = mysql.connector.connect(**config)
#     cursor = cnx.cursor()
#     cursor.execute("SELECT 1;")
#     result = cursor.fetchone()
#     print(result)
# except mysql.connector.Error as err:
#     print("Error:", err)
# finally:
#     cursor.close()
#     cnx.close()

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\kmeans_model_baseline_basic.joblib"

# save model
dump(kmeans_model_baseline_basic, model_path)

---

# **K-Means Best Model Search**

Look at improving the kmeans model by looking at the opitmal number of clusters and other hyper-parameters. Some parameters to explore:
1) Determine the Range for "n_clusters": No need to change this, there are attack and benign groups so 2 clusters. However, its still worth exploring other numbers of clusters.

2) Iternate over other values for "n_cluster".

3) Other Hyperparameters. There are other clusters that are less important but worth trying. "init" (method for initialization), "n_init" (number of time the k-means algorithm will be run with different centroid seeds) and "max_iter". 

4) Choose best kmeans model.

## GridSearch for Best Model

In [None]:
# range of clusters to try
cluster_range = range(2,6)

# Store the average silhouetter scores
silhouette_avg_scores = []

# store the inertias (sum of the distances)
inertias = []

for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(train_x)
    cluster_labels = kmeans.predict(test_x)
    
    silhouette_avg = silhouette_score(test_x, cluster_labels)
    silhouette_avg_scores.append(silhouette_avg)
    
    inertias.append(kmeans.inertia_)  # Corrected this line
    
# Plotting the results
# Silhouette Scores
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, silhouette_avg_scores, marker='o')
plt.title('Silhouette Scores for Different Numbers of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Inertia
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, inertias, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette Scores
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, silhouette_avg_scores, marker='o')
plt.title('Silhouette Scores for Different Numbers of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')

plt.savefig('silhouette_scores.png')
plt.show()

In [None]:
# Inertia
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, inertias, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.savefig('inertia.png')
plt.show()

##### Get the best model

Reading the plots We will choose the 5 clusters as our best model. While this seems large for the domain it might give more interesting results as the silhouette score is higher.

## Rerun the best model

In [None]:
# Intialise Kmeans model
kmeans_model_baseline_best_serach = KMeans(n_clusters=5, random_state=42)

In [None]:
# This runs the model and gets the training time as well.

# start timer
start_time = time.time()

# train the model
kmeans_model_baseline_best_serach.fit(train_x)

# end timer
end_time = time.time()

In [None]:
# Predictions on the test data
predictions = kmeans_model_baseline_best_serach.predict(test_x)

In [None]:
predictions

In [None]:
# Evaluate the model using Silhouette Score
silhouette_avg = silhouette_score(test_x, predictions)

In [None]:
silhouette_avg

improvement on the 42 in 2 clusters

## Explore the Results

In [None]:
# Visualizations specific to K-Means
plt.scatter(test_x.iloc[:, 0], test_x.iloc[:, 1], c=predictions, s=50, cmap='viridis')
centers = kmeans_model_baseline_best_serach.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.title('K-Means Clustering')
plt.show()

In [None]:
# convert to a json object for the database

# Extracting data points
data_points_x = test_x.iloc[:, 0].tolist()  # First feature values
data_points_y = test_x.iloc[:, 1].tolist()  # Second feature values

# Extracting cluster centers
centers_x = centers[:, 0].tolist()  # X-coordinates of cluster centers
centers_y = centers[:, 1].tolist()  # Y-coordinates of cluster centers

# Organize data into a structured dictionary
visualization_data = {
    "data_points": {
        "x": data_points_x,
        "y": data_points_y,
        "cluster": predictions.tolist()  # Cluster assignments
    },
    "cluster_centers": {
        "x": centers_x,
        "y": centers_y
    }
}

# Convert to JSON
cluster_visualisation = json.dumps(visualization_data)

In [None]:
# model_training_time
model_training_time = end_time - start_time
model_training_time

In [None]:
# Silhouette time
silhouette_score_db = float(silhouette_avg) 
silhouette_score_db

In [None]:
# cluster centers
cluster_centers = kmeans_model_baseline_best_serach.cluster_centers_.tolist() 

In [None]:
cluster_centers = json.dumps(cluster_centers) 
cluster_centers

In [None]:
type(cluster_centers)

In [None]:
# elbow method resutls

wcss = []
for i in range(1, 11):  # Example: Test for 1 to 10 clusters
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(train_x)
    wcss.append(km.inertia_)
elbow_method_results = json.dumps(wcss)  #

In [None]:
elbow_method_results

In [None]:
model_parameters_update = json.dumps(kmeans_model_baseline_best_serach.get_params())

In [None]:
model_name = "kmeans_model_baseline_best_serach"
model_parameters = model_parameters_update
confusion_matrix = None  # Not applicable for K-Means, same below.
accuracy_value = None 
precision_value = None
recall_value = None
f1_value = None
ruc_curve = None
auc_score = None
precision_recall_curve = None
feature_importance = None
model_training_time = model_training_time

silhouette_score = silhouette_score_db
cluster_centers = cluster_centers
elbow_method_results = elbow_method_results
cluster_visualisation_plot = cluster_visualisation 

anomaly_score = None
anomaly_detection_plot = None

decision_function_values = None
support_vectors = None
one_class_svm_plot = None

In [None]:
# Need to check the object types so that the DB can handle them

#DONE. Objects ok now.

print(type(model_name))
print(type(model_parameters))
print(type(confusion_matrix))
print(type(accuracy_value))
print(type(precision_value))
print(type(recall_value))
print(type(f1_value))
print(type(ruc_curve))
print(type(auc_score))
print(type(precision_recall_curve))
print(type(feature_importance))
print(type(model_training_time))
print(type(silhouette_score)) 
print(type(cluster_centers))
print(type(elbow_method_results))
print(type(cluster_visualisation_plot)) 
print(type(anomaly_score))
print(type(anomaly_detection_plot))
print(type(decision_function_values)) 
print(type(support_vectors))
print(type(one_class_svm_plot))

##### Connect to DB and send results

In [None]:
config = {
    "user":"root",
    "password":"root",
    "host": "localhost",
    "database":"mtu_capstone_db",
    "raise_on_warnings":True
}

In [None]:
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\kmeans_model_baseline_best_serach.joblib"

# save model
dump(kmeans_model_baseline_best_serach, model_path)

---

# **Isolate Forest  Basic Model: Attempt 3**

## Basic Model

In [None]:
# Initialise the IF model
iso_model_baseline_basic = IsolationForest(n_estimators=100, random_state=42)

In [None]:
# Fit the model and get the training time.

# start timer
start_time = time.time()

# train the model
iso_model_baseline_basic.fit(train_x)

# end timer
end_time = time.time()

In [None]:
# Prediations and Evualations
predictions = iso_model_baseline_basic.predict(test_x)

In [None]:
# Convert prredictions to make the labels
predictions = [1 if p==-1 else 0 for p in predictions]

In [None]:
# If you have true labels (test_y), you can evaluate the model
print(classification_report(test_y, predictions))

## Explore Results

In [None]:
model_name = "iso_model_baseline_basic"

In [None]:
model_parameters = json.dumps(iso_model_baseline_basic.get_params())
model_parameters

In [None]:
confusion_matrix_results = cm_function(test_y, predictions)
confusion_matrix_results

In [None]:
report = classification_report(test_y, predictions, output_dict=True)

In [None]:
confusion_matrix = confusion_matrix_results.tolist()
confusion_matrix

In [None]:
confusion_matrix = json.dumps(confusion_matrix)

In [None]:
report = classification_report(test_y, predictions, output_dict=True)

In [None]:
accuracy_value = report["accuracy"]
accuracy_value 

In [None]:
precision_value = report["macro avg"]["precision"]
precision_value

In [None]:
recall_value = report["macro avg"]["recall"]
recall_value

In [None]:
f1_value = report["macro avg"]["f1-score"]
f1_value 

In [None]:
model_training_time = end_time - start_time
model_training_time

In [None]:
anomaly_scores = iso_model_baseline_basic.decision_function(test_x)
anomaly_scores

In [None]:
anomaly_scores_json = json.dumps(anomaly_scores.tolist())

In [None]:
anomaly_score = anomaly_scores_json

In [None]:
x_axis = test_x.iloc[:, 0] 
y_axis = test_x.iloc[:, 1]  
predictions_array = np.array(predictions)

# Create boolean masks
normal_points = predictions_array == 0
anomalies = predictions_array == 1

plt.figure(figsize=(10, 6))
plt.scatter(x_axis[normal_points], y_axis[normal_points], color='green', label='Normal')
plt.scatter(x_axis[anomalies], y_axis[anomalies], color='red', label='Anomaly')
plt.title('Anomaly Detection')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
save_path = r'C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\Save Plots\iso_basic_model_anomaly_detection.png'
plt.savefig(save_path)
plt.show()

In [None]:
plot_data = {
    "x_axis": x_axis.tolist(),  
    "y_axis": y_axis.tolist(),  
    "normal_points": normal_points.tolist(),  
    "anomalies": anomalies.tolist()  
}

anomaly_detection_plot = json.dumps(plot_data)

In [None]:
silhouette_score_for_db = silhouette_score(test_x, predictions)

In [None]:
silhouette_score_for_db

In [None]:
silhouette_score = float(silhouette_score_for_db)

## Sent Results to DB

In [None]:
# Not needed to send for ISO model
ruc_curve = None
auc_score = None
precision_recall_curve = None
feature_importance = None

cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None

decision_function_values = None
support_vectors = None
one_class_svm_plot = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters))
print(type(confusion_matrix))
print(type(accuracy_value))
print(type(precision_value))
print(type(recall_value))
print(type(f1_value))
print(type(ruc_curve))
print(type(auc_score))
print(type(precision_recall_curve))
print(type(feature_importance))
print(type(model_training_time))
print(type(silhouette_score)) 
print(type(cluster_centers))
print(type(elbow_method_results))
print(type(cluster_visualisation_plot)) 
print(type(anomaly_score))
print(type(anomaly_detection_plot))
print(type(decision_function_values)) 
print(type(support_vectors))
print(type(one_class_svm_plot))

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\iso_model_baseline_basic_8.joblib"

# save model
dump(iso_model_baseline_basic, model_path)

---

# **Isolate Forest Model Search: Attempt 3**

Next, we look at grid search to find the best model for Isolate forest.

Most influential hyperparameters:
 - n_estimators: Number of base estimators or trees in the ensemble.
 - max_samples: Number of samples to draw from the total dataset to train the estimator.
 - contamination: Represents the porportion of outliers in the dataset. Auto can can be used if unsure
     - We know that 13% are attack instances so this value will be used. contamination = 0.13
 - max-features: The Number of features to draw from the total features to train each base estimator.
 - bootstramp:Determines to draw samples with replacement or not.


## GridSearch for Isolate Forest

In [None]:
# Grid Search options for Isolate Forest

n_estimators_options = [100, 200, 300, 400, 500]
max_samples_options = [0.25, 0.5, 0.75, "auto"]
contamination_options = [0.05, 0.1, 0.15, 0.2]

best_score = -1
best_params = {}

for n_estimators in n_estimators_options:
    for max_samples in max_samples_options:
        for contamination in contamination_options:
            clf = IsolationForest(n_estimators=n_estimators, 
                                  max_samples=max_samples, 
                                  contamination=contamination, 
                                  random_state=42)
            clf.fit(train_x)
            predictions = clf.predict(test_x)
            score = silhouette_score(test_x, predictions)

            if score > best_score:
                best_score = score
                best_params = {
                    "n_estimators": n_estimators, 
                    "max_samples": max_samples, 
                    "contamination": contamination
                }

print("Best Score:", best_score)
print("Best Parameters:", best_params)

**Results from Best Search attempt 1: bad results. Run greater range of values**
define paramters to use in the grid search
- n_estimators_options = [200,300,400]
- max_samples_options = [0.5, "auto"]
- contamination = 0.13

Best Result is less than the Basic Model - Could be good for testing so will keep for now.

Best Score: 0.26775658
Best Parameters: {'n_estimators': 300, 'max_samples': 0.5, 'contamination': 0.13}

**Results from Best Search attempt 1: ??**

- n_estimators_options = [100, 200, 300, 400, 500]
- max_samples_options = [0.25, 0.5, 0.75, "auto"]
- contamination_options = [0.05, 0.1, 0.15, 0.2]

In [None]:
# Set paramaters that the best model found
best_n_estimators = ?
best_max_samples = ?
best_contamination= ?

iso_model_baseline_best = IsolationForest(n_estimators=best_n_estimators, 
                                          max_samples=best_max_samples, 
                                          contamination=best_contamination, 
                                          random_state=42)

In [None]:
# Fit the model and get the training time.

# start timer
start_time = time.time()

# train the model
iso_model_baseline_best.fit(train_x)

# end timer
end_time = time.time()

## Explore the Results

In [None]:
# Prediations and Evualations
predictions = iso_model_baseline_best.predict(test_x)

In [None]:
# If you have true labels (test_y), you can evaluate the model
print(classification_report(test_y, predictions))

In [None]:
model_name = "iso_model_baseline_best"

In [None]:
model_parameters = json.dumps(iso_model_baseline_best.get_params())
model_parameters

In [None]:
confusion_matrix_results = cm_function(test_y, predictions)
confusion_matrix_results

In [None]:
confusion_matrix = confusion_matrix_results.tolist()
confusion_matrix

In [None]:
confusion_matrix = json.dumps(confusion_matrix)

In [None]:
report = classification_report(test_y, predictions, output_dict=True)

In [None]:
accuracy_value = report["accuracy"]
accuracy_value 

In [None]:
precision_value = report["macro avg"]["precision"]
precision_value

In [None]:
recall_value = report["macro avg"]["recall"]
recall_value

In [None]:
f1_value = report["macro avg"]["f1-score"]
f1_value 

In [None]:
model_training_time = end_time - start_time
model_training_time

In [None]:
anomaly_scores = iso_model_baseline_best.decision_function(test_x)
anomaly_scores

In [None]:
anomaly_scores_json = json.dumps(anomaly_scores.tolist())

In [None]:
anomaly_score = anomaly_scores_json

In [None]:
x_axis = test_x.iloc[:, 0] 
y_axis = test_x.iloc[:, 1]  
predictions_array = np.array(predictions)

# Create boolean masks
normal_points = predictions_array == 0
anomalies = predictions_array == 1

plt.figure(figsize=(10, 6))
plt.scatter(x_axis[normal_points], y_axis[normal_points], color='green', label='Normal')
plt.scatter(x_axis[anomalies], y_axis[anomalies], color='red', label='Anomaly')
plt.title('Anomaly Detection')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

In [None]:
plot_data = {
    "x_axis": x_axis.tolist(),  
    "y_axis": y_axis.tolist(),  
    "normal_points": normal_points.tolist(),  
    "anomalies": anomalies.tolist()  
}

anomaly_detection_plot = json.dumps(plot_data)

In [None]:
silhouette_score = silhouette_score(test_x, predictions)

In [None]:
silhouette_score = float(silhouette_score)

## Sent Results to DB

In [None]:
# Not needed to send for ISO model
ruc_curve = None
auc_score = None
precision_recall_curve = None
feature_importance = None

cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None

decision_function_values = None
support_vectors = None
one_class_svm_plot = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters))
print(type(confusion_matrix))
print(type(accuracy_value))
print(type(precision_value))
print(type(recall_value))
print(type(f1_value))
print(type(model_training_time))
print(type(silhouette_score)) 
print(type(anomaly_score))
print(type(anomaly_detection_plot))

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save the best Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\iso_model_baseline_best.joblib"

# save model
dump(iso_model_baseline_best, model_path)

---

---

# **One Class SVM model Exploration**

### Create Normal Data

In [None]:
train_x_normal = train_x[train_y==0]

In [None]:
train_x_normal.shape

The training is taking a huge amount of time. Either running for days or crashing my machine regularlly

#  **One Class SVM Model 1 : 1%**

In [None]:
# Initialize One-Class SVM # default settings
oc_svm = OneClassSVM(kernel="linear", nu=0.1)

In [None]:
# Take 1% sample
sample_data_1_perc = train_x_normal.sample(frac=0.01, random_state=42)
sample_data_1_perc.shape

In [None]:
# start timer
start_time = time.time()

# Train the model only on normal data
oc_svm.fit(sample_data_1_perc)

end_time = time.time()

In [None]:
training_time =  end_time - start_time
training_time 

## Explore Results

In [None]:
model_name = "oc_svm_model_baseline_basic_1_perc"

In [None]:
model_parameters = oc_svm.get_params()

In [None]:
model_parameters = json.dumps(model_parameters)
model_parameters

In [None]:
anomaly_scores = -oc_svm.decision_function(sample_data_1_perc)

In [None]:
anomaly_scores

In [None]:
anomaly_scores_test = -oc_svm.decision_function(val_test_x)

# get the roc curve and auc
fpr, tpr, thresholds = roc_curve(val_test_y, anomaly_scores_test, pos_label=1)
roc_auc = auc(fpr, tpr)

# print the auc score
print("AUC: {:.2f}".format(roc_auc))

# plot the roc curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": roc_auc
}


ruc_curve = json.dumps(roc_data)

In [None]:
auc_score = float(roc_auc)

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

In [None]:
# change scores to binary labels using the optial 
predicted_labels = (anomaly_scores_test > optimal_threshold).astype(int)

# get values and confusion matrix
conf_matrix = confusion_matrix(val_test_y, predicted_labels)
accuracy = accuracy_score(val_test_y, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(val_test_y, predicted_labels, average='binary')

# print results
print("Confusion Matrix:")
print(conf_matrix)
confusion_matrix = conf_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

print("\nAccuracy: {:.2f}".format(accuracy))
accuracy_value = float(accuracy)
print("Precision: {:.2f}".format(precision))
precision_value = float(precision)
print("Recall: {:.2f}".format(recall))
recall_value = float(recall)
print("F1 Score: {:.2f}".format(fscore))
f1_value = float(fscore)

In [None]:
# # Plot Decision Function Values
# plt.scatter(val_test_x.iloc[:, 0], val_test_x.iloc[:, 1], c=anomaly_scores_test, cmap='coolwarm')
# plt.colorbar(label='Decision Function Value')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('Decision Function Values ')
# plt.show()

In [None]:
# # Plot support vector Values
# plt.scatter(train_x.iloc[:, 0], train_x.iloc[:, 1], alpha=0.5)
# plt.scatter(oc_svm.support_vectors_[:, 0], oc_svm.support_vectors_[:, 1], edgecolor='r', facecolor='none')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('Support Vectors ')
# plt.show()

In [None]:
# # plot One-Class SVM results
# plt.scatter(val_test_x.iloc[:, 0], val_test_x.iloc[:, 1], c=predicted_labels, cmap='coolwarm')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('One-Class SVM Results ')
# plt.show()

In [None]:
# values that are not needed.
# Random forest
precision_recall_curve = None
feature_importance = None

# Kemans
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None
silhouette_score =None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

decision_function_values =None
one_class_svm_plot =None
support_vectors = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters)) 
print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 
print(type(ruc_curve)) 
print(type(auc_score))
print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used
print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

## Sent Results to DB

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\svm_baseline_model_2.1.joblib"

# save model
dump(oc_svm, model_path)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function

---

# **One Class SVM Model 2: 10%**

In [None]:
# Initialize One-Class SVM # default settings
oc_svm = OneClassSVM(kernel="linear", nu=0.1)

In [None]:
# Take 1% sample
sample_data_1_perc = train_x_normal.sample(frac=0.1, random_state=42)
sample_data_1_perc.shape

In [None]:
# start timer
start_time = time.time()

# Train the model only on normal data
oc_svm.fit(sample_data_1_perc)

end_time = time.time()

In [None]:
training_time =  end_time - start_time
training_time 

## Explore Results

In [None]:
model_name = "oc_svm_model_baseline_basic_10_perc"

In [None]:
model_parameters = oc_svm.get_params()

In [None]:
model_parameters = json.dumps(model_parameters)
model_parameters

In [None]:
anomaly_scores = -oc_svm.decision_function(sample_data_1_perc)

In [None]:
anomaly_scores

In [None]:
anomaly_scores_test = -oc_svm.decision_function(val_test_x)

# get the roc curve and auc
fpr, tpr, thresholds = roc_curve(val_test_y, anomaly_scores_test, pos_label=1)
roc_auc = auc(fpr, tpr)

# print the auc score
print("AUC: {:.2f}".format(roc_auc))

# plot the roc curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": roc_auc
}


ruc_curve = json.dumps(roc_data)

In [None]:
auc_score = float(roc_auc)

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

In [None]:
# change scores to binary labels using the optial 
predicted_labels = (anomaly_scores_test > optimal_threshold).astype(int)

# get values and confusion matrix
conf_matrix = confusion_matrix(val_test_y, predicted_labels)
accuracy = accuracy_score(val_test_y, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(val_test_y, predicted_labels, average='binary')

# print results
print("Confusion Matrix:")
print(conf_matrix)
confusion_matrix = conf_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

print("\nAccuracy: {:.2f}".format(accuracy))
accuracy_value = float(accuracy)
print("Precision: {:.2f}".format(precision))
precision_value = float(precision)
print("Recall: {:.2f}".format(recall))
recall_value = float(recall)
print("F1 Score: {:.2f}".format(fscore))
f1_value = float(fscore)

In [None]:
# values that are not needed.
# Random forest
precision_recall_curve = None
feature_importance = None

# Kemans
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None
silhouette_score =None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

decision_function_values =None
one_class_svm_plot =None
support_vectors = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters)) 
print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 
print(type(ruc_curve)) 
print(type(auc_score))
print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used
print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

## Sent Results to DB

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\svm_baseline_model_2.2.joblib"

# save model
dump(oc_svm, model_path)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function

---

# **One Class SVM Model 3: 1% nu=0.17**

In [None]:
# Initialize One-Class SVM # default settings
oc_svm = OneClassSVM(kernel="linear", nu=0.17)

In [None]:
# Take 1% sample
sample_data_1_perc = train_x_normal.sample(frac=0.01, random_state=42)
sample_data_1_perc.shape

In [None]:
# start timer
start_time = time.time()

# Train the model only on normal data
oc_svm.fit(sample_data_1_perc)

end_time = time.time()

In [None]:
training_time =  end_time - start_time
training_time 

## Explore Results

In [None]:
model_name = "oc_svm_model_baseline_basic_1_perc_nu_017"

In [None]:
model_parameters = oc_svm.get_params()

In [None]:
model_parameters = json.dumps(model_parameters)
model_parameters

In [None]:
anomaly_scores = -oc_svm.decision_function(sample_data_1_perc)

In [None]:
anomaly_scores

In [None]:
anomaly_scores_test = -oc_svm.decision_function(val_test_x)

# get the roc curve and auc
fpr, tpr, thresholds = roc_curve(val_test_y, anomaly_scores_test, pos_label=1)
roc_auc = auc(fpr, tpr)

# print the auc score
print("AUC: {:.2f}".format(roc_auc))

# plot the roc curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": roc_auc
}


ruc_curve = json.dumps(roc_data)

In [None]:
auc_score = float(roc_auc)

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

In [None]:
# change scores to binary labels using the optial 
predicted_labels = (anomaly_scores_test > optimal_threshold).astype(int)

# get values and confusion matrix
conf_matrix = confusion_matrix(val_test_y, predicted_labels)
accuracy = accuracy_score(val_test_y, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(val_test_y, predicted_labels, average='binary')

# print results
print("Confusion Matrix:")
print(conf_matrix)
confusion_matrix = conf_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

print("\nAccuracy: {:.2f}".format(accuracy))
accuracy_value = float(accuracy)
print("Precision: {:.2f}".format(precision))
precision_value = float(precision)
print("Recall: {:.2f}".format(recall))
recall_value = float(recall)
print("F1 Score: {:.2f}".format(fscore))
f1_value = float(fscore)

In [None]:
# values that are not needed.
# Random forest
precision_recall_curve = None
feature_importance = None

# Kemans
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None
silhouette_score =None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

decision_function_values =None
one_class_svm_plot =None
support_vectors = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters)) 

print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 

print(type(ruc_curve)) 
print(type(auc_score))

print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used

print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

## Sent Results to DB

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\svm_baseline_model_2.3.joblib"

# save model
dump(oc_svm, model_path)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function

---

# **One Class SVM Model 4: 10% nu=0.17**

In [None]:
# Initialize One-Class SVM # default settings
oc_svm = OneClassSVM(kernel="linear", nu=0.17)

In [None]:
# Take 1% sample
sample_data_1_perc = train_x_normal.sample(frac=0.1, random_state=42)
sample_data_1_perc.shape

In [None]:
# start timer
start_time = time.time()

# Train the model only on normal data
oc_svm.fit(sample_data_1_perc)

end_time = time.time()

In [None]:
training_time =  end_time - start_time
training_time 

## Explore Results

In [None]:
model_name = "oc_svm_model_baseline_basic_10_perc_nu_017"

In [None]:
model_parameters = oc_svm.get_params()

In [None]:
model_parameters = json.dumps(model_parameters)
model_parameters

In [None]:
anomaly_scores = -oc_svm.decision_function(sample_data_1_perc)

In [None]:
anomaly_scores

In [None]:
anomaly_scores_test = -oc_svm.decision_function(val_test_x)

# get the roc curve and auc
fpr, tpr, thresholds = roc_curve(val_test_y, anomaly_scores_test, pos_label=1)
roc_auc = auc(fpr, tpr)

# print the auc score
print("AUC: {:.2f}".format(roc_auc))

# plot the roc curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": roc_auc
}


ruc_curve = json.dumps(roc_data)

In [None]:
auc_score = float(roc_auc)

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

In [None]:
# change scores to binary labels using the optial 
predicted_labels = (anomaly_scores_test > optimal_threshold).astype(int)

# get values and confusion matrix
conf_matrix = confusion_matrix(val_test_y, predicted_labels)
accuracy = accuracy_score(val_test_y, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(val_test_y, predicted_labels, average='binary')

# print results
print("Confusion Matrix:")
print(conf_matrix)
confusion_matrix = conf_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

print("\nAccuracy: {:.2f}".format(accuracy))
accuracy_value = float(accuracy)
print("Precision: {:.2f}".format(precision))
precision_value = float(precision)
print("Recall: {:.2f}".format(recall))
recall_value = float(recall)
print("F1 Score: {:.2f}".format(fscore))
f1_value = float(fscore)

In [None]:
# values that are not needed.
# Random forest
precision_recall_curve = None
feature_importance = None

# Kemans
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None
silhouette_score =None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

decision_function_values =None
one_class_svm_plot =None
support_vectors = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters)) 
print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 
print(type(ruc_curve)) 
print(type(auc_score))
print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used
print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

## Sent Results to DB

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\svm_baseline_model_2.3.joblib"

# save model
dump(oc_svm, model_path)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function

---

# **One Class SVM Model 5: 20% nu=0.17**

In [None]:
# Initialize One-Class SVM # default settings
oc_svm = OneClassSVM(kernel="linear", nu=0.17)

In [None]:
# Take 1% sample
sample_data_1_perc = train_x.sample(frac=0.2, random_state=42)
sample_data_1_perc.shape

In [None]:
# start timer
start_time = time.time()

# Train the model only on normal data
oc_svm.fit(sample_data_1_perc)

end_time = time.time()

In [None]:
training_time =  end_time - start_time
training_time 

## Explore Results

In [None]:
model_name = "oc_svm_model_baseline_basic_20_perc_nu_017"

In [None]:
model_parameters = oc_svm.get_params()

In [None]:
model_parameters = json.dumps(model_parameters)
model_parameters

In [None]:
anomaly_scores = -oc_svm.decision_function(sample_data_1_perc)

In [None]:
anomaly_scores

In [None]:
anomaly_scores_test = -oc_svm.decision_function(val_test_x)

# get the roc curve and auc
fpr, tpr, thresholds = roc_curve(val_test_y, anomaly_scores_test, pos_label=1)
roc_auc = auc(fpr, tpr)

# print the auc score
print("AUC: {:.2f}".format(roc_auc))

# plot the roc curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Send the ROC Curve to the DB as a JSON object.

# convert the ROC Curve to JSON
roc_data = {
    "fpr": fpr.tolist(),
    "tpr": tpr.tolist(),
    "roc_auc": roc_auc
}


ruc_curve = json.dumps(roc_data)

In [None]:
auc_score = float(roc_auc)

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

In [None]:
# change scores to binary labels using the optial 
predicted_labels = (anomaly_scores_test > optimal_threshold).astype(int)

# get values and confusion matrix
conf_matrix = confusion_matrix(val_test_y, predicted_labels)
accuracy = accuracy_score(val_test_y, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(val_test_y, predicted_labels, average='binary')

# print results
print("Confusion Matrix:")
print(conf_matrix)
confusion_matrix = conf_matrix.tolist()
confusion_matrix = json.dumps(confusion_matrix)

print("\nAccuracy: {:.2f}".format(accuracy))
accuracy_value = float(accuracy)
print("Precision: {:.2f}".format(precision))
precision_value = float(precision)
print("Recall: {:.2f}".format(recall))
recall_value = float(recall)
print("F1 Score: {:.2f}".format(fscore))
f1_value = float(fscore)

In [None]:
# values that are not needed.
# Random forest
precision_recall_curve = None
feature_importance = None

# Kemans
cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None
silhouette_score =None

# Isolate Forest
anomaly_score = None
anomaly_detection_plot = None

decision_function_values =None
one_class_svm_plot =None
support_vectors = None

In [None]:
# test object types

print(type(model_name))
print(type(model_parameters)) 
print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 
print(type(ruc_curve)) 
print(type(auc_score))
print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used
print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

## Sent Results to DB

In [None]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

## Save Model

In [None]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\svm_baseline_model_2.4.joblib"

# save model
dump(oc_svm, model_path)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function

---

In [None]:
print("Full notebook ran and finished with no errors. Yipee!")

In [22]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support

# **Incremental SGD Classifer Model**

In [10]:
svm_baseline = SGDClassifier(loss='hinge')

In [24]:
# get the start time
start_time = time.time()

svm_baseline.fit(train_x, train_y)

# record the end time.
end_time = time.time()

In [25]:
test_predictions = svm_baseline.predict(val_test_x.to_numpy())

In [26]:
test_accuracy = accuracy_score(val_test_y, test_predictions)
test_accuracy

0.933117847655013

In [27]:
training_time =  end_time - start_time
training_time 

16.658982276916504

## Explore Results

In [28]:
model_name = "sgd_svm_model_baseline_basic"

In [29]:
model_parameters = svm_baseline.get_params()

In [30]:
model_parameters = json.dumps(model_parameters)
model_parameters

'{"alpha": 0.0001, "average": false, "class_weight": null, "early_stopping": false, "epsilon": 0.1, "eta0": 0.0, "fit_intercept": true, "l1_ratio": 0.15, "learning_rate": "optimal", "loss": "hinge", "max_iter": 1000, "n_iter_no_change": 5, "n_jobs": null, "penalty": "l2", "power_t": 0.5, "random_state": null, "shuffle": true, "tol": 0.001, "validation_fraction": 0.1, "verbose": 0, "warm_start": false}'

In [31]:
confusion_matrix_results = cm_function(val_test_y, test_predictions)
confusion_matrix_results

array([[1292588,   46437],
       [  61492,  213202]], dtype=int64)

In [33]:
report = classification_report(val_test_y,  test_predictions, output_dict=True)

In [34]:
confusion_matrix = confusion_matrix_results.tolist()
confusion_matrix

[[1292588, 46437], [61492, 213202]]

In [35]:
confusion_matrix = json.dumps(confusion_matrix)

In [38]:
report = classification_report(val_test_y,  test_predictions, output_dict=True)

In [39]:
accuracy_value = report["accuracy"]
accuracy_value 

0.933117847655013

In [40]:
precision_value = report["macro avg"]["precision"]
precision_value

0.8878677205818821

In [41]:
recall_value = report["macro avg"]["recall"]
recall_value

0.870731962603903

In [42]:
f1_value = report["macro avg"]["f1-score"]
f1_value 

0.8789678403593513

In [43]:
model_training_time = end_time - start_time
model_training_time

16.658982276916504

## Sent Results to DB

In [52]:
# Not needed to send for ISO model
ruc_curve = None
auc_score = None
precision_recall_curve = None
feature_importance = None

cluster_centers = None
elbow_method_results = None
cluster_visualisation_plot = None

decision_function_values = None
support_vectors = None
one_class_svm_plot = None
anomaly_score = None
anomaly_detection_plot = None
silhouette_score = None

In [53]:
# test object types

print(type(model_name))
print(type(model_parameters)) 

print(type(confusion_matrix)) 
print(type(accuracy_value)) 
print(type(precision_value)) 
print(type(recall_value)) 
print(type(f1_value)) 

print(type(ruc_curve)) 
print(type(auc_score))

print(type(precision_recall_curve)) # Not used
print(type(feature_importance)) # Not used

print(type(model_training_time))
print(type(silhouette_score)) # Not used
print(type(cluster_centers)) # Not used
print(type(elbow_method_results)) # Not used
print(type(cluster_visualisation_plot)) # Not used
print(type(anomaly_score)) # Not used
print(type(anomaly_detection_plot))# Not used
print(type(decision_function_values)) # Not used
print(type(support_vectors)) # Not used
print(type(one_class_svm_plot) )# Not used

<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'float'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>


## Sent Results to DB

In [54]:
# call and save to DB
insert_model_results_to_db(config, model_name, model_parameters, confusion_matrix, # note the config is the conneciton details, username, pw, host, etc
                                accuracy_value, precision_value, recall_value, f1_value, 
                                ruc_curve, auc_score, precision_recall_curve, feature_importance, model_training_time, 
                                silhouette_score, cluster_centers, elbow_method_results, 
                                cluster_visualisation_plot, anomaly_score, anomaly_detection_plot, 
                                decision_function_values, support_vectors, one_class_svm_plot)

The model results have been inserted successfully
The Conneciton to MySQL is closed.


## Save Model

In [55]:
# save model to file.

#save folder location
model_path = r"C:\Users\Kolobane\OneDrive\CIT MSc Data Science Modules\_Semester Three - Final Project\Project Two - Network Project\ML Models\sgb_svm_baseline_model.joblib"

# save model
dump(svm_baseline, model_path)

['C:\\Users\\Kolobane\\OneDrive\\CIT MSc Data Science Modules\\_Semester Three - Final Project\\Project Two - Network Project\\ML Models\\sgb_svm_baseline_model.joblib']

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix as cm_function