In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc


from regression_module import *
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load Data Set
# Reads data/final_df.csv (path is relative to the notebook's working directory)
# into a pandas DataFrame named df.
df = pd.read_csv('data/final_df.csv')

In [3]:
# Discard the two leading columns of the loaded frame; they are not needed downstream.
# Caution: this rebinds df from itself, so running the cell a second time without
# reloading the CSV strips two further columns.
df = df.drop(columns=df.columns[:2])

In [4]:
# Target vector: the churn column
y = df['churn']
# Feature matrix: every remaining column
X = df.drop(columns='churn')
# Keep the feature labels (a pandas Index) so they can be restored after scaling,
# which returns a bare NumPy array
cols = X.columns

In [5]:
# Learn each feature's min/max from X and keep the fitted scaler in mm
mm = MinMaxScaler().fit(X)
# Rescale every feature with the fitted scaler; the result is a NumPy array
# NOTE(review): the scaler is fitted on all rows before the train/test split performed
# later in the notebook, so test rows influence the scaling statistics — consider
# fitting on the training split only.
scaled_X = mm.transform(X)

In [6]:
# Wrap the scaled array in a DataFrame, restoring the original feature labels
scaled_df = pd.DataFrame(scaled_X, columns=cols)
# Display the scaled frame
scaled_df

Unnamed: 0,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,phoneservice_No,...,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0.013889,0.115423,0.003437,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.472222,0.385075,0.217564,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.027778,0.354229,0.012453,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.625000,0.239303,0.211951,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.027778,0.521891,0.017462,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.333333,0.662189,0.229194,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.000000,0.845274,0.847792,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,0.152778,0.112935,0.039892,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,0.055556,0.558706,0.035303,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
# Hold out 25% of the rows for testing. The test size and random state are kept the
# same as in the logistic regression notebook.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df,
    y,
    test_size=.25,
    random_state=33,
)

# KNN Classifier - Default
This KNN classifier is run with its default parameters to serve as a baseline model for reference.

In [111]:
# Baseline model: a KNeighborsClassifier with default parameters, fitted on the
# training split (fit returns the fitted estimator itself)
clf = KNeighborsClassifier().fit(X_train, y_train)
# Class predictions for the training and test splits, in that order
y_hat_train, y_hat_test = (clf.predict(features) for features in (X_train, X_test))

In [112]:
# Absolute difference between the true and predicted training labels
residuals = (y_train - y_hat_train).abs()

# Counts of each residual value
print(residuals.value_counts())
print('----------------------------------')
# The same counts expressed as proportions of the training rows
print(residuals.value_counts(normalize=True))

0    4393
1     889
Name: churn, dtype: int64
----------------------------------
0    0.831693
1    0.168307
Name: churn, dtype: float64


In [113]:
# Same residual summary as the training cell, now for the test split
residuals = (y_test - y_hat_test).abs()
# Counts of each residual value, then the same counts as proportions
print(residuals.value_counts())
print('---------------------------------')
print(residuals.value_counts(normalize=True))

0    1326
1     435
Name: churn, dtype: int64
---------------------------------
0    0.752981
1    0.247019
Name: churn, dtype: float64


In [114]:
# Report precision, recall, accuracy and F1 for the baseline model on both splits.
# print_metrics is not defined in this notebook; presumably it is provided by the
# `from regression_module import *` line — verify.
print_metrics(y_train, y_hat_train, y_test, y_hat_test)

Training Precision:  0.6929012345679012
Testing Precision:  0.5488069414316703


Training Recall:  0.646508279337653
Testing Recall:  0.5270833333333333


Training Accuracy:  0.8316925407042787
Testing Accuracy:  0.7529812606473595


Training F1-Score:  0.668901303538175
Testing F1-Score:  0.5377258235919234


## Returning Optimal Parameters

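This function iterates over a default KNN classifier, changing only the parameter k across a range of candidate values, and reports the value of k with the highest corresponding F1 score. We will use this value of k moving forward. (Note: the range was originally described here as k = [0, 50], but the best k reported below is 97, so the range actually searched is evidently wider than that.)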

In [115]:
# Search for the k that gives the best F1 score. find_best_k is not defined in this
# notebook; presumably it is provided by the `from regression_module import *` line.
# NOTE(review): the test split is passed in, so k appears to be chosen using test data;
# if so, the test metrics reported afterwards are optimistic — confirm against the helper.
find_best_k(X_train, y_train, X_test, y_test)

Best Value for K: 97
F1-Score: 0.6074866310160427


In [116]:
# Search grid: k is pinned to the value reported by find_best_k, so only the vote
# weighting and the distance metric vary (2 x 2 = 4 candidates).
grid_params = dict(
    n_neighbors=[97],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
# 3-fold cross-validated search over the grid using all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the classifier's
# default score (accuracy), whereas k was chosen by F1 — confirm this is intended.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# fit returns the fitted search object, so gs_results refers to the same object as gs
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    5.8s finished


In [117]:
# Mean cross-validated score of the best parameter combination (the classifier's
# default score, since the search was created without a `scoring` argument)
gs_results.best_score_

0.78833775085195

In [118]:
# The estimator configured with the best-scoring parameter combination
gs_results.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=97, p=2,
                     weights='uniform')

In [119]:
# The parameter combination that achieved the best cross-validated score
gs_results.best_params_

{'metric': 'euclidean', 'n_neighbors': 97, 'weights': 'uniform'}