### K-Nearest Neighbors

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Open the .csv files
X_train = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/train_word2vec.csv')
X_test = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/test_word2vec.csv')

X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)

# Check the shape of the data
print(X_train.shape)
print(X_test.shape)

(22500, 512)
(7500, 512)


In [3]:
# Extract the features into a numpy array
X_train = X_train.values
X_test = X_test.values

In [4]:
# Open the labels
y_train_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/train_labels.csv')
y_test_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/test_labels.csv')

# Check the shape of the labels
print(y_train_df.shape)
print(y_test_df.shape)

(22500, 1)
(7500, 1)


In [5]:
# Convert class to non_suicide = 0 and suicide = 1
y_train_temp = y_train_df['class'].values
y_test_temp = y_test_df['class'].values

# Check the shape of the labels
print(y_train_temp.shape)
print(y_test_temp.shape)

(22500,)
(7500,)


In [6]:
# Making the Labels Numeric
y_train = np.array([0 if label == "non-suicide" else 1 for label in y_train_temp])
y_test = np.array([0 if label == "non-suicide" else 1 for label in y_test_temp])

# Check the shape of the labels
print(y_train.shape)
print(y_test.shape)

(22500,)
(7500,)


### Without Hyperparameter Tuning

In [7]:
# KNN model
knn_base = KNeighborsClassifier()

# Fit
knn_base.fit(X_train, y_train)

# Predict
y_pred = knn_base.predict(X_test)

In [8]:
# Print the accuracy
print("Base KNN: Test Data")
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Base KNN: Test Data
Accuracy: 0.8626666666666667
Confusion Matrix:
[[2825  883]
 [ 147 3645]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.85      3708
           1       0.80      0.96      0.88      3792

    accuracy                           0.86      7500
   macro avg       0.88      0.86      0.86      7500
weighted avg       0.88      0.86      0.86      7500

AUC Score: 0.861550206191198
Weighted F1-score: 0.8611753569783509


In [9]:
# Predict on training data
y_pred_train = knn_base.predict(X_train)

# Print the accuracy
print("Base KNN: Training Data")
print("Accuracy:", accuracy_score(y_train, y_pred_train))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train))

# Print the classification report
print("Classification Report:")
print(classification_report(y_train, y_pred_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train, average='weighted'))

Base KNN: Training Data
Accuracy: 0.8914666666666666
Confusion Matrix:
[[ 9244  2048]
 [  394 10814]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.88     11292
           1       0.84      0.96      0.90     11208

    accuracy                           0.89     22500
   macro avg       0.90      0.89      0.89     22500
weighted avg       0.90      0.89      0.89     22500

AUC Score: 0.8917395992387402
Weighted F1-score: 0.8909072273554737


### Hyperparameter Tuning

In [11]:
# Perform Grid Search to find the best parameters for K-Nearest Neighbors

# Define a grid of hyperparameters to search
param_grid = {
    'n_neighbors': [3, 5, 7],  # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting strategy
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# Define the KNN classifier
knn = KNeighborsClassifier()

# Create a grid search object
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   7.8s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.2s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   8.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   8.2s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.3s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.6s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.9s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   9.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   8.9s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   8.8s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   8.9s
[CV] END ...metric=euclidean, n_neighbors=5, wei

In [12]:
# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:")
print(grid_search.best_params_)

Best Hyperparameters:
{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}


In [10]:
# Best Model
knn_best = KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='uniform')

# Fit the best model    
knn_best.fit(X_train, y_train)

# Predict the labels of the test set
y_pred_test_best = knn_best.predict(X_test)

In [11]:
# Print the accuracy
print("Best Logistic Regression: Test Data")
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred, average='weighted'))

Best Logistic Regression: Test Data
Accuracy: 0.8626666666666667
Confusion Matrix:
[[2825  883]
 [ 147 3645]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.85      3708
           1       0.80      0.96      0.88      3792

    accuracy                           0.86      7500
   macro avg       0.88      0.86      0.86      7500
weighted avg       0.88      0.86      0.86      7500

AUC Score: 0.861550206191198
Weighted F1-score: 0.8611753569783509


In [12]:
# Training Data
# Predict the labels of the training set
y_pred_train_best = knn_best.predict(X_train)

# Print the accuracy
print("Best Logistic Regression: Training Data")
print("Accuracy:", accuracy_score(y_train, y_pred_train_best))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train_best))

# Print the classification report
print("Classification Report:")
print(classification_report(y_train, y_pred_train_best))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train_best))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train_best, average='weighted'))

Best Logistic Regression: Training Data
Accuracy: 0.9085777777777778
Confusion Matrix:
[[ 9584  1708]
 [  349 10859]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.85      0.90     11292
           1       0.86      0.97      0.91     11208

    accuracy                           0.91     22500
   macro avg       0.91      0.91      0.91     22500
weighted avg       0.91      0.91      0.91     22500

AUC Score: 0.9088020000136535
Weighted F1-score: 0.9082638619666361


### Principal Component Analysis

In [13]:
# Perform PCA on the data and extract the features that explain 85% of the variance
pca = PCA(n_components=0.75)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)

(22500, 13)


In [15]:
# Train best model on training data
knn_best.fit(X_train_pca, y_train)

In [16]:
# Predict the labels of the test set
y_pred_test_best_pca = knn_best.predict(X_test_pca)

# Print the accuracy
print("Best KNN: PCA Test Data")
print("Accuracy:", accuracy_score(y_test, y_pred_test_best_pca))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test_best_pca))

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_test_best_pca))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_test_best_pca))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_test_best_pca, average='weighted'))

Best KNN: PCA Test Data
Accuracy: 0.8718666666666667
Confusion Matrix:
[[3025  683]
 [ 278 3514]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.82      0.86      3708
           1       0.84      0.93      0.88      3792

    accuracy                           0.87      7500
   macro avg       0.88      0.87      0.87      7500
weighted avg       0.88      0.87      0.87      7500

AUC Score: 0.8712457157292478
Weighted F1-score: 0.8714136088598076


In [17]:
# Predict the labels of the training set
y_pred_train_best_pca = knn_best.predict(X_train_pca)

# Print the accuracy
print("Best KNN: PCA Training Data")
print("Accuracy:", accuracy_score(y_train, y_pred_train_best_pca))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train_best_pca))

# Print the classification report
print("Classification Report:")
print(classification_report(y_train, y_pred_train_best_pca))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_train_best_pca))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_train_best_pca, average='weighted'))

Best KNN: PCA Training Data
Accuracy: 0.9215555555555556
Confusion Matrix:
[[10064  1228]
 [  537 10671]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.89      0.92     11292
           1       0.90      0.95      0.92     11208

    accuracy                           0.92     22500
   macro avg       0.92      0.92      0.92     22500
weighted avg       0.92      0.92      0.92     22500

AUC Score: 0.9216691186119524
Weighted F1-score: 0.9214905156220387
