## Rod Pump Failure: K-Nearest Neighbors Model, Iteration 2
**Improvements:**
* Added grid search for `n_neighbors` hyperparameter
* Added cross validation in model training
* Added a data balancing function to eliminate binning of output values
* Tested classifier on entire dataset to ensure successful implementation

In [None]:
# General imports
from floridaman import data_cleaning
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset stored under the key 'null_transformed' through the
# project's data_cleaning helper (see data_cleaning.load for what the key
# refers to).
raw_data = data_cleaning.load('null_transformed')

In [None]:
# Model-specific imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Build the candidate dataset by cleaning nulls out of raw_data.
# Meaning of the positional arguments, as described by this notebook's notes
# (confirm against data_cleaning.generate_candidate_dataset):
#   0.40 -> drop columns with more than 40% null values
#   0.40 -> row null threshold: rows above 40% nulls are dropped, rows
#           below it are imputed
#   5    -> n_neighbors used when imputing the remaining missing values
candidate_data = data_cleaning.generate_candidate_dataset(raw_data, 0.40, 0.40, 5)

# Training set: candidate data re-balanced so that every observed
# FAILURETYPE occurs equally often.
train_data = data_cleaning.balance(candidate_data)

# Training features (X) and labels (y) as NumPy arrays.
train_feature_cols = data_cleaning.features(train_data)
X_train = np.array(train_data[train_feature_cols])
y_train = np.array(train_data['FAILURETYPE'])

# Evaluation features and labels come from the *entire* candidate dataset.
# NOTE(review): train_data is derived from candidate_data, so these rows
# presumably overlap with the training rows; treat the report below as an
# implementation sanity check, not a held-out performance estimate.
all_feature_cols = data_cleaning.features(candidate_data)
X_test = np.array(candidate_data[all_feature_cols])
y_test = np.array(candidate_data['FAILURETYPE'])

# Base KNN classifier and the hyperparameter grid to search:
# n_neighbors = 1 .. 99 (np.arange excludes the stop value 100).
# NOTE(review): an earlier comment described this range as 1-30 -- confirm
# which range was intended.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 100)}

# Grid search scored with one-vs-one ROC AUC ('roc_auc_ovo'), using 8-fold
# cross validation and 12 parallel jobs.
clf = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='roc_auc_ovo',
    cv=8,
    n_jobs=12,
)

# Run the search on the balanced training data.
clf.fit(X_train, y_train)

# Optional diagnostics -- uncomment to inspect the search results.
#print("Scoring method: roc_auc_ovo")
#print("Classifier training score: " + str(clf.best_score_)) # best cross-validated roc score on the training set
#print("Classifier testing score: " + str(clf.score(X_test, y_test))) # roc score using all data as test
#print("Best parameters: " + str(clf.best_params_)) # best values found for the parameters

# Per-class precision / recall / F1 over the full candidate dataset.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Plot the confusion matrix for the same predictions.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement
# when the environment is upgraded.
conf_mat = plot_confusion_matrix(clf, X_test, y_test)

In [None]:
# Show the cleaned candidate dataset as the cell output for a quick visual check.
candidate_data

In [None]:
# SHAP provides the model-explanation tooling used in the following cell.
import shap
# Load SHAP's JavaScript visualisation code into the notebook so that
# interactive plots can be rendered.
shap.initjs()

In [None]:
import matplotlib.pyplot as plt

# Rebuild a standalone KNN using the n_neighbors value selected by the grid
# search, and fit it on the balanced training data.
best_k = clf.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Produce the SHAP values.
# The Kernel SHAP explainer works on the predicted class probabilities; its
# background data is a single row sampled from X_test.
# NOTE(review): a one-row background keeps the computation cheap but makes
# the baseline depend on that single row -- confirm this is intended.
background = shap.sample(X_test, 1)
knn_explainer = shap.KernelExplainer(knn.predict_proba, background)
knn_shap_values = knn_explainer.shap_values(X_test)

# Save an interactive force plot for output index 0 as a standalone HTML file.
# NOTE(review): the [0] indexing assumes expected_value / shap_values hold one
# entry per model output (presumably per class) -- confirm for the installed
# shap version.
force_plot = shap.force_plot(
    knn_explainer.expected_value[0],
    knn_shap_values[0],
    X_test,
    matplotlib=False,
)
shap.save_html('KNNShap.html', force_plot)