# Assignment 3: K Nearest Neighbors Classifier

In [53]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from knn import *
from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_similarity_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import  RFECV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [54]:
def print_binary_classif_error_report(preds, y_test):
    print('Accuracy: ' + str(accuracy_score(preds, y_test)))
    print('Precision: ' + str(precision_score(preds, y_test)))
    print('Recall: ' + str(recall_score(preds, y_test)))
    print('F1: ' + str(f1_score(preds, y_test)))

In [55]:
knn = KNN(5, euclidean)

In [56]:
df = pd.read_csv('churn_data.csv')
df.head()

Unnamed: 0,CustID,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,123251,Male,34,Lower,4,16,14,5,Yes
1,188922,Male,20,Lower,5,14,49,1,No
2,145322,Female,30,Lower,4,20,19,4,Yes
3,153729,Female,46,Lower,4,14,15,4,Yes
4,103976,Female,23,Lower,4,16,18,0,No


In [57]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 9 columns):
CustID        128 non-null int64
Gender        128 non-null object
Age           128 non-null int64
Income        128 non-null object
FamilySize    128 non-null int64
Education     128 non-null int64
Calls         128 non-null int64
Visits        128 non-null int64
Churn         128 non-null object
dtypes: int64(6), object(3)
memory usage: 9.1+ KB
None


In [58]:
#This makes the binary responses into dummy variables
df = df.drop(['CustID'], axis=1)
le = preprocessing.LabelEncoder()
df.Gender = le.fit_transform(df.Gender)
df.Income = le.fit_transform(df.Income)
df.Churn = le.fit_transform(df.Churn)

In [59]:
df.head()

Unnamed: 0,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,1,34,0,4,16,14,5,1
1,1,20,0,5,14,49,1,0
2,0,30,0,4,20,19,4,1
3,0,46,0,4,14,15,4,1
4,0,23,0,4,16,18,0,0


In [60]:
#Splitting the training and test data
cs = list(df.columns)
cs.remove('Churn')

data_x = df[cs]
data_y = df['Churn']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size= 0.2, random_state = 4)

In [61]:
#Feature selection based on f score
selector_f = SelectPercentile(f_regression, percentile=25)
selector_f.fit(x_train, y_train)
fscores = []
for name, score, pv in zip(list(df), selector_f.scores_, selector_f.pvalues_):
    if pv <.05:
        fscores.append(name)

for i in df.columns:
    if i not in fscores:
        df = df.drop(i,axis = 1)

In [62]:
#The feature selection removed Calls Visits and Age. This was surprising to me as I would think these variables would have an 
#effect on churn rate.
df.head()

Unnamed: 0,Gender,Income,FamilySize,Education
0,1,0,4,16
1,1,0,5,14
2,0,0,4,20
3,0,0,4,14
4,0,0,4,16


In [63]:
ds = list(df.columns)
data_x = df[ds]


x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size= 0.2, random_state = 4)

In [64]:
print(type(x_train))

<class 'pandas.core.frame.DataFrame'>


In [65]:
#Fitting the data to store the dataframes
knn.fit(x_train, y_train)


In [66]:
preds = knn.pred(x_test)

In [67]:

print_binary_classif_error_report(preds, y_test)

Accuracy: 0.7307692307692307
Precision: 0.7692307692307693
Recall: 0.7142857142857143
F1: 0.7407407407407408


### With a K of 5 and using an euclidean distance function we got an accuracy score of .73 and a recall of .71.

In [68]:
knn2 = KNN(3, euclidean)
knn2.fit(x_train, y_train)
preds2 = knn2.pred(x_test)


In [69]:
print_binary_classif_error_report(preds2, y_test)

Accuracy: 0.7692307692307693
Precision: 0.8461538461538461
Recall: 0.7333333333333333
F1: 0.7857142857142856


### Using a K of 3 improved the accuracy and precision score significantly.

In [70]:
knn23 = KNN(3, jaccard_similarity_score )

In [71]:
knn23.fit(x_train, y_train)

In [72]:
preds23 = knn23.pred(x_test)

In [73]:
print_binary_classif_error_report(preds23, y_test)

Accuracy: 0.4230769230769231
Precision: 0.46153846153846156
Recall: 0.42857142857142855
F1: 0.4444444444444445


### Using a jaccard similarity score yielded poorer results.

In [74]:
#Importing the validation set and performing the same transformations and feature selections to be consistent with our best model.
df2 = pd.read_csv('churn_validation.csv')
df2.Gender = le.fit_transform(df2.Gender)
df2.Income = le.fit_transform(df2.Income)
df2.Churn = le.fit_transform(df2.Churn)
df2.head()

Unnamed: 0,CustID,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,102522,1,54,1,4,18,48,3,1
1,108050,1,21,0,4,19,44,2,1
2,108118,0,22,0,3,16,22,5,1
3,109501,1,27,1,3,13,19,2,1
4,109782,1,18,0,2,14,6,3,0


In [75]:

cols = list(df2.columns)
data_y = df2['Churn']
cols.remove('Education')
cols.remove('Calls')
cols.remove('Visits')
cols.remove('Churn')
cols.remove('CustID')
df2 = df2[cols]

In [76]:
data_x = df2

In [77]:
preds3 = knn2.pred(data_x)

In [78]:
print_binary_classif_error_report(preds3, data_y)

Accuracy: 0.59375
Precision: 0.0
Recall: 0.0
F1: 0.0


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [79]:
preds4 = knn.pred(data_x)

In [80]:
print_binary_classif_error_report(preds4, data_y)

Accuracy: 0.59375
Precision: 0.0
Recall: 0.0
F1: 0.0


### Using our best model with a K of 3 and a euclidean distance function yielded confusing results. Only an accuracy score was returned. This is likely due to issues with my class creation. For this reason, I cannot reccomend this model be used for accurate predictions.