# Model: K-Nearest Neighbors (KNN)

### Import packages and modules

In [1]:
import csv
import numpy as np
import pandas as pd
import math

import seaborn as sns
from matplotlib import pyplot as plt

from imblearn.under_sampling import RandomUnderSampler 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *

### Import data

In [3]:
%run /Users/apassan/Documents/03_Professional/07_GitHubRepo/APassan_Portfolio/Classification_CreditCardFraud/scripts/model_prep.py

### Confirm X and Y Shapes

In [4]:
# Print the shapes to ensure the matrix dimensions line up
print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print('X_test shape:', X_test.shape)
print('Y_test shape:', Y_test.shape)


X_train shape: (139844, 7)
Y_train shape: (139844,)
X_test shape: (34962, 7)
Y_test shape: (34962,)


### low Model

 In general, practice, choosing the value of *k is k = sqrt(N)* where *N* stands for the number of samples in your training dataset ([source](https://towardsdatascience.com/a-simple-introduction-to-k-nearest-neighbors-algorithm-b3519ed98e#:~:text='k'%20in%20KNN%20is%20a,majority%20of%20the%20voting%20process.)
)

In [17]:
# Calculate k - accoriding to reccomendation 
N = 139844
k = math.floor(math.sqrt(N))
print('k:', k)


# Create KNN model 
knn_baseline = KNeighborsClassifier(n_neighbors=k, algorithm = 'auto')

#train model with cv of 5 
knn_baseline_acc = cross_val_score(knn_baseline, X_us, Y_us, cv=5, scoring='accuracy')
knn_baseline_prec = cross_val_score(knn_baseline, X_us, Y_us, cv=5, scoring='precision')
knn_baseline_rec = cross_val_score(knn_baseline, X_us, Y_us, cv=5, scoring='recall')
knn_baseline_f1 = cross_val_score(knn_baseline, X_us, Y_us, cv=5, scoring='f1')

# Get results
print('knn_baseline accuracy:', '{0:.2%}'.format(np.mean(knn_baseline_acc)))   
print('knn_baseline precision', '{0:.2%}'.format(np.mean(knn_baseline_prec)))
print('knn_baseline recall', '{0:.2%}'.format(np.mean(knn_baseline_rec)))
print('knn_baseline f1', '{0:.2%}'.format(np.mean(knn_baseline_f1)))

k: 373
knn_baseline accuracy: 95.92%
knn_baseline precision 92.47%
knn_baseline recall 100.00%
knn_baseline f1 96.08%


Try a few other values just to experiment - both much higher and much lower.

In [18]:
# Lower k
klow = 50

# Create KNN model 
knn_low = KNeighborsClassifier(n_neighbors=klow, algorithm = 'auto')

# Create KNN model 
knn_low = KNeighborsClassifier(n_neighbors=k, algorithm = 'auto')

#train model with cv of 5 
knn_low_acc = cross_val_score(knn_low, X_us, Y_us, cv=5, scoring='accuracy')
knn_low_prec = cross_val_score(knn_low, X_us, Y_us, cv=5, scoring='precision')
knn_low_rec = cross_val_score(knn_low, X_us, Y_us, cv=5, scoring='recall')
knn_low_f1 = cross_val_score(knn_low, X_us, Y_us, cv=5, scoring='f1')

# Get results
print('knn_low accuracy:', '{0:.2%}'.format(np.mean(knn_low_acc)))   
print('knn_low precision', '{0:.2%}'.format(np.mean(knn_low_prec)))
print('knn_low recall', '{0:.2%}'.format(np.mean(knn_low_rec)))
print('knn_low f1', '{0:.2%}'.format(np.mean(knn_low_f1)))

knn_low accuracy: 95.92%
knn_low precision 92.47%
knn_low recall 100.00%
knn_low f1 96.08%


In [19]:
# Higher k
khigh = 500

# Create KNN model 
knn_high = KNeighborsClassifier(n_neighbors=khigh, algorithm = 'auto')

#train model with cv of 5 
knn_high_acc = cross_val_score(knn_high, X_us, Y_us, cv=5, scoring='accuracy')
knn_high_prec = cross_val_score(knn_high, X_us, Y_us, cv=5, scoring='precision')
knn_high_rec = cross_val_score(knn_high, X_us, Y_us, cv=5, scoring='recall')
knn_high_f1 = cross_val_score(knn_high, X_us, Y_us, cv=5, scoring='f1')

# Get results
print('knn_high accuracy:', '{0:.2%}'.format(np.mean(knn_high_acc)))   
print('knn_high precision', '{0:.2%}'.format(np.mean(knn_high_prec)))
print('knn_high recall', '{0:.2%}'.format(np.mean(knn_high_rec)))
print('knn_high f1', '{0:.2%}'.format(np.mean(knn_high_f1)))

knn_high accuracy: 95.60%
knn_high precision 91.94%
knn_high recall 99.97%
knn_high f1 95.79%


### Final Model

In the end the baseline model *knn_baseline* seems to be the best in terms of prediction power and not overfitting too much.