In [None]:
import os
import joblib
import xgboost
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc

<h4><strong>Preprocess</strong></h4>

Train and test split

In [None]:
# Load the saved feature matrix. Every column except the last is used as a
# feature; the last column is used as the target label.
feature_matrix = np.load("feature_matrix.npy")
X, y = feature_matrix[:, :-1], feature_matrix[:, -1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Scale training and test features

In [None]:
# Standardize the features with a scaler fitted on the TRAINING split only.
# The test split is transformed with the training statistics: re-fitting on the
# test split would scale the two splits inconsistently, leak test information
# into preprocessing, and leave `ss` (which is written to disk at the end of
# this notebook) holding test-set statistics instead of training ones.
ss = StandardScaler()

X_train_ss = ss.fit_transform(X_train)  # learn mean/variance from training data and apply them
X_test_ss = ss.transform(X_test)        # reuse the training mean/variance; do NOT re-fit

Perform PCA

In [None]:
# Reduce dimensionality with PCA fitted on the scaled TRAINING split only.
# The test split must be projected onto the same components the models are
# trained on; re-fitting on the test split would produce a different basis,
# make the test features incomparable with the training features, and leave
# `pca` (which is written to disk at the end of this notebook) fitted on test data.
# NOTE(review): n_components=345 is hard-coded — presumably chosen for this
# particular feature matrix; confirm before reusing with other data.
pca = PCA(n_components=345)

X_train_pca = pca.fit_transform(X_train_ss)  # learn the components from training data and project
X_test_pca = pca.transform(X_test_ss)        # project test data onto the training components

---

<h4><strong>Train and Test</strong></h4>

Apply KNN

In [None]:
# Build a 3-nearest-neighbours classifier and fit it on the PCA-transformed
# training data (fit() returns the estimator, so the two steps are chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_knn = knn.predict(X_test_pca)

In [None]:
# Share of test samples for which the KNN prediction matches the true label.
acc_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy: {acc_knn}")

Apply Naive Bayes Classifier

In [None]:
# Build a Gaussian Naive Bayes classifier and fit it on the PCA-transformed
# training data (fit() returns the estimator, so the two steps are chained).
nb = GaussianNB().fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_nb = nb.predict(X_test_pca)

In [None]:
# Share of test samples for which the Naive Bayes prediction matches the true label.
acc_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy: {acc_nb}")

Apply Logistic Regression

In [None]:
# Build a logistic-regression classifier (liblinear solver, C=0.01, up to 2000
# iterations, fixed seed) and fit it on the PCA-transformed training data.
lr = LogisticRegression(
    solver='liblinear',
    max_iter=2000,
    C=0.01,
    random_state=42,
).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_lr = lr.predict(X_test_pca)

In [None]:
# Share of test samples for which the Logistic Regression prediction matches the true label.
acc_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {acc_lr}")

Apply Support Vector Machine

In [None]:
# Build an RBF-kernel support vector classifier and fit it on the
# PCA-transformed training data. probability=True is set because
# svm.predict_proba is called later in this notebook for the ROC curve.
svm = SVC(
    kernel="rbf",
    probability=True,
    random_state=42,
).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_svm = svm.predict(X_test_pca)

In [None]:
# Share of test samples for which the SVM prediction matches the true label.
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {acc_svm}")

Apply Decision Tree Classifier

In [None]:
# Build a decision-tree classifier (fixed seed) and fit it on the
# PCA-transformed training data (fit() returns the estimator).
dt = DecisionTreeClassifier(random_state=42).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_dt = dt.predict(X_test_pca)

In [None]:
# Share of test samples for which the Decision Tree prediction matches the true label.
acc_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy: {acc_dt}")

Apply Random Forest Classifier

In [None]:
# Build a 200-tree random forest (fixed seed) and fit it on the
# PCA-transformed training data (fit() returns the estimator).
rfc = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_rfc = rfc.predict(X_test_pca)

In [None]:
# Share of test samples for which the Random Forest prediction matches the true label.
acc_rfc = accuracy_score(y_test, y_pred_rfc)
print(f"Accuracy: {acc_rfc}")

Apply XGBoost

In [None]:
# Build an XGBoost classifier (1000 estimators, learning rate 0.001, depth 6,
# fixed seed, log-loss evaluation metric) and fit it on the PCA-transformed
# training data (fit() returns the estimator).
xgb = xgboost.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.001,
    max_depth=6,
    random_state=42,
    eval_metric="logloss",
).fit(X_train_pca, y_train)

# Predict labels for the held-out test samples.
y_pred_xgb = xgb.predict(X_test_pca)

In [None]:
# Share of test samples for which the XGBoost prediction matches the true label.
acc_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {acc_xgb}")

---

<h4><strong>Evaluation</strong></h4>

ROC and AUC

In [None]:
# ROC curve and AUC for KNN, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes knn.classes_[0] == 0 — TODO confirm).
proba_knn = knn.predict_proba(X_test_pca)
apis_prob_knn = proba_knn[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_knn, tpr_knn, _ = roc_curve(y_test, apis_prob_knn, pos_label=0)
roc_auc_knn = auc(fpr_knn, tpr_knn)

In [None]:
# ROC curve and AUC for Naive Bayes, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes nb.classes_[0] == 0 — TODO confirm).
proba_nb = nb.predict_proba(X_test_pca)
apis_prob_nb = proba_nb[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_nb, tpr_nb, _ = roc_curve(y_test, apis_prob_nb, pos_label=0)
roc_auc_nb = auc(fpr_nb, tpr_nb)

In [None]:
# ROC curve and AUC for Logistic Regression, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes lr.classes_[0] == 0 — TODO confirm).
proba_lr = lr.predict_proba(X_test_pca)
apis_prob_lr = proba_lr[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_lr, tpr_lr, _ = roc_curve(y_test, apis_prob_lr, pos_label=0)
roc_auc_lr = auc(fpr_lr, tpr_lr)

In [None]:
# ROC curve and AUC for the SVM, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes svm.classes_[0] == 0 — TODO confirm).
proba_svm = svm.predict_proba(X_test_pca)
apis_prob_svm = proba_svm[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_svm, tpr_svm, _ = roc_curve(y_test, apis_prob_svm, pos_label=0)
roc_auc_svm = auc(fpr_svm, tpr_svm)

In [None]:
# ROC curve and AUC for the Decision Tree, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes dt.classes_[0] == 0 — TODO confirm).
proba_dt = dt.predict_proba(X_test_pca)
apis_prob_dt = proba_dt[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_dt, tpr_dt, _ = roc_curve(y_test, apis_prob_dt, pos_label=0)
roc_auc_dt = auc(fpr_dt, tpr_dt)

In [None]:
# ROC curve and AUC for the Random Forest, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes rfc.classes_[0] == 0 — TODO confirm).
proba_rfc = rfc.predict_proba(X_test_pca)
apis_prob_rfc = proba_rfc[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_rfc, tpr_rfc, _ = roc_curve(y_test, apis_prob_rfc, pos_label=0)
roc_auc_rfc = auc(fpr_rfc, tpr_rfc)

In [None]:
# ROC curve and AUC for XGBoost, with label 0 treated as the positive class.
# Column 0 of predict_proba is used as the score for label 0
# (assumes xgb.classes_[0] == 0 — TODO confirm).
proba_xgb = xgb.predict_proba(X_test_pca)
apis_prob_xgb = proba_xgb[:, 0]

# The thresholds returned by roc_curve are not needed and are discarded.
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, apis_prob_xgb, pos_label=0)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

In [None]:
# Draw the ROC curve of every trained model on one shared figure.
plt.figure()

# One entry per model, in drawing order:
# (false-positive rates, true-positive rates, line colour, legend label).
roc_curves = [
    (fpr_knn, tpr_knn, "red", f"KNN: AUC = {roc_auc_knn:0.2f}"),
    (fpr_nb, tpr_nb, "purple", f"NB: AUC = {roc_auc_nb:0.2f}"),
    (fpr_lr, tpr_lr, "cyan", f"LR: AUC = {roc_auc_lr:0.2f}"),
    (fpr_svm, tpr_svm, "orange", f"SVM: AUC = {roc_auc_svm:0.2f}"),
    (fpr_dt, tpr_dt, "brown", f"DT: AUC = {roc_auc_dt:0.2f}"),
    (fpr_rfc, tpr_rfc, "green", f"RFC: AUC = {roc_auc_rfc:0.2f}"),
    (fpr_xgb, tpr_xgb, "blue", f"XGB: AUC = {roc_auc_xgb:0.2f}"),
]
for fpr, tpr, line_color, legend_label in roc_curves:
    plt.plot(fpr, tpr, color=line_color, label=legend_label)

# Dashed diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="black", ls='--')

# Legend, axis labels and title.
plt.legend(loc=0)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic Curve")

plt.show()

---

<h4>Save models for future use</h4>

In [None]:
# Write the StandardScaler to Models/standard_scaler.pkl,
# creating the Models directory first if it does not exist yet.
os.makedirs("Models", exist_ok=True)
scaler_path = os.path.join("Models", "standard_scaler.pkl")
joblib.dump(ss, scaler_path)

In [None]:
# Write the PCA transformer to Models/pca.pkl.
pca_path = os.path.join("Models", "pca.pkl")
joblib.dump(pca, pca_path)

In [None]:
# Write the KNN classifier to Models/knn.pkl.
knn_path = os.path.join("Models", "knn.pkl")
joblib.dump(knn, knn_path)

In [None]:
# Write the Naive Bayes classifier to Models/naive_bayes.pkl.
nb_path = os.path.join("Models", "naive_bayes.pkl")
joblib.dump(nb, nb_path)

In [None]:
# Write the Logistic Regression classifier to Models/logistic_regression.pkl.
lr_path = os.path.join("Models", "logistic_regression.pkl")
joblib.dump(lr, lr_path)

In [None]:
# Write the SVM classifier to Models/svm.pkl.
svm_path = os.path.join("Models", "svm.pkl")
joblib.dump(svm, svm_path)

In [None]:
# Write the Decision Tree classifier to Models/decision_tree.pkl.
dt_path = os.path.join("Models", "decision_tree.pkl")
joblib.dump(dt, dt_path)

In [None]:
# Write the Random Forest classifier to Models/random_forest.pkl.
rfc_path = os.path.join("Models", "random_forest.pkl")
joblib.dump(rfc, rfc_path)

In [None]:
# Write the XGBoost classifier to Models/xgboost.pkl.
xgb_path = os.path.join("Models", "xgboost.pkl")
joblib.dump(xgb, xgb_path)