In [39]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc


from regression_module import *
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# Load Data Set
df = pd.read_csv('data/final_df.csv')

In [41]:
# Remove first two unnecessary columns from DF
df = df.iloc[:,2:]

In [42]:
# Assign target variable
y = df.churn
# Drop target variable from independent features DF
X = df.drop('churn', axis = 1)
# Save columns as list of strings for reassign after scaling
cols = X.columns

In [43]:
# Instantiate a scaling object from SKlearn
mm = MinMaxScaler()
# Fit_Transform the independent features DF to the min-max scaler
scaled_X = mm.fit_transform(X)

In [44]:
# Assign scaled dataset to pandas dataframe
scaled_df = pd.DataFrame(scaled_X)
# Reassign columns names to new dataframe
scaled_df.columns = cols

In [45]:
# Perform a train test split, maintaining test size sample and random state from logistic regression notebook
X_train, X_test, y_train, y_test = train_test_split(scaled_df, y, test_size = .25, random_state = 33)

# Baseline KNN Classifier (default parameters)

In [8]:
knn1 = KNeighborsClassifier()
knn1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [12]:
test_preds = knn1.predict(X_test)
knn1.score(X_test, y_test)

0.7529812606473595

In [21]:
print(f"Precision Score: {precision_score(y_test, test_preds)}")
print(f"Recall Score: {recall_score(y_test, test_preds)}")
print(f"Accuracy Score: {accuracy_score(y_test, test_preds)}")
print(f"F1 Score: {f1_score(y_test, test_preds)}")

Precision Score: 0.5488069414316703
Recall Score: 0.5270833333333333
Accuracy Score: 0.7529812606473595
F1 Score: 0.5377258235919234


# Cross Val - KNN Classifier

In [58]:
knn_clf = KNeighborsClassifier()
knn_cv_score = cross_val_score(knn_clf, X_train, y_train, cv = 4)
mean_knn_cv_score = np.mean(knn_cv_score)
print(f"Mean Cross Validation Score: {mean_knn_cv_score :.2%}")

Mean Cross Validation Score: 77.04%


In [59]:
knn2 = KNeighborsClassifier()
param_grid = {'n_neighbors':np.arange(1,20)}
knn2_gscv = GridSearchCV(knn2, param_grid, cv = 4)
knn2_gscv.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [60]:
knn2_gscv.best_params_
knn2_gscv.best_score_

0.7890950397576676

In [61]:
knn2_gscv.predict(X_test)

array([1, 0, 0, ..., 0, 0, 0])

In [62]:
knn2_gscv.score(X_test,y_test)

0.7836456558773425

In [70]:
h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h)).reshape(-1, )
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()

ValueError: query data dimension must match training data dimension