In [107]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')
# Read data/cleaned_data.csv into a DataFrame; the relative path is resolved
# against the current working directory of the kernel.
df = pd.read_csv('data/cleaned_data.csv')

In [108]:
# Show the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,0,1,265.1,110,197.4,99,244.7,91,10.0,3,1,0
1,107,0,1,161.6,123,195.5,103,254.4,103,13.7,3,1,0
2,137,0,0,243.4,114,121.2,110,162.6,104,12.2,5,0,0
3,84,1,0,299.4,71,61.9,88,196.9,89,6.6,7,2,0
4,75,1,0,166.7,113,148.3,122,186.9,121,10.1,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,0,1,156.2,77,215.5,126,279.1,83,9.9,6,2,0
3329,68,0,0,231.1,57,153.4,55,191.3,123,9.6,4,3,0
3330,28,0,0,180.8,109,288.8,58,191.9,91,14.1,6,2,0
3331,184,1,0,213.8,105,159.6,84,139.2,137,5.0,10,2,0


In [109]:
def evaluate(model, dt_test, dt_train, target_train, target_test):
    """Print a confusion matrix and classification report for each split.

    The test split is reported first, followed by a blank separator and then
    the train split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    dt_test, dt_train : feature sets for the test and train splits.
    target_train, target_test : true labels for the train and test splits
        (note the order: train labels come before test labels).

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Predictions for both splits are computed up front, test before train.
    reports = (
        ('Evaluations for test:\n', target_test, model.predict(dt_test)),
        ('Evaluations for train:\n', target_train, model.predict(dt_train)),
    )

    for position, (heading, truth, predicted) in enumerate(reports):
        if position:
            # Blank separator between the two reports.
            print('\n')
        print(heading, confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

In [110]:
# Encode df with pd.get_dummies; the result is the frame the KNN cells below work from.
one_hot_knn = pd.get_dummies(df)
# Preview the first rows of the encoded frame.
one_hot_knn.head()

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,0,1,265.1,110,197.4,99,244.7,91,10.0,3,1,0
1,107,0,1,161.6,123,195.5,103,254.4,103,13.7,3,1,0
2,137,0,0,243.4,114,121.2,110,162.6,104,12.2,5,0,0
3,84,1,0,299.4,71,61.9,88,196.9,89,6.6,7,2,0
4,75,1,0,166.7,113,148.3,122,186.9,121,10.1,3,3,0


In [111]:
# Split the encoded frame into target and features.
# Both names are derived from a fresh encoding of df rather than by dropping
# the column in place, so re-running this cell gives the same result instead
# of failing on an already-removed 'churn' column.
encoded_df = pd.get_dummies(df)
labels = encoded_df['churn']
one_hot_knn = encoded_df.drop(columns='churn')

In [112]:
# train_test_split is already imported in the first cell; the import is repeated here.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split (test_size=0.25); random_state=42
# fixes the shuffle so the same split is produced on every run.
data_train, data_test, target_train, target_test = train_test_split(one_hot_knn, labels, test_size=0.25, random_state=42)

In [121]:
# Standardise the features only after the train/test split, so the test rows
# never contribute to the scaling statistics (avoids information leakage).

from sklearn.preprocessing import StandardScaler

# Fit on the training split alone, then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(data_train)
data_train, data_test = scaler.transform(data_train), scaler.transform(data_test)

# Wrap the scaled training array with the feature names purely for a readable preview.
scaled_df_train = pd.DataFrame(data=data_train, columns=one_hot_knn.columns)
scaled_df_train.head()

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls
0,-1.404508,-0.327448,-0.611418,-1.883677,1.330852,1.037727,0.40134,1.069609,0.00494,2.768109,-0.595235,-0.427903
1,0.366388,-0.327448,-0.611418,0.294083,0.529165,0.516178,0.40134,2.214376,0.670832,1.05197,-1.417963,0.324616
2,0.518179,-0.327448,1.635543,1.056392,-1.875896,0.093407,0.849774,-0.077125,-0.763398,-0.056369,0.227493,1.829653
3,2.010792,-0.327448,-0.611418,-0.679156,1.68159,-0.402459,0.65047,-0.322994,-0.660953,-0.056369,-0.183871,-0.427903
4,0.290493,-0.327448,-0.611418,0.48466,1.080325,-0.718549,-0.296224,-1.186487,0.056162,0.837453,1.050221,1.077134


In [122]:
# Baseline k-nearest-neighbours classifier with scikit-learn's default settings
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
clf.fit(data_train, target_train)

# fit() returns the estimator itself, so both names refer to the same fitted model.
knn_clf = clf

# Keep the test-set predictions for the metric summary further down.
test_preds = knn_clf.predict(data_test)

In [123]:
# Print confusion matrices and classification reports for the fitted baseline,
# test split first and train split second.
evaluate(knn_clf, data_test, data_train, target_train, target_test)

Evaluations for test:
 [[699  10]
 [ 82  43]]
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       709
           1       0.81      0.34      0.48       125

    accuracy                           0.89       834
   macro avg       0.85      0.66      0.71       834
weighted avg       0.88      0.89      0.87       834



Evaluations for train:
 [[2124   17]
 [ 179  179]]
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      2141
           1       0.91      0.50      0.65       358

    accuracy                           0.92      2499
   macro avg       0.92      0.75      0.80      2499
weighted avg       0.92      0.92      0.91      2499



In [124]:
# Evaluating the model
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    """Print the confusion matrix followed by precision, recall, accuracy and F1.

    Parameters
    ----------
    labels : true class labels.
    preds : predicted class labels, aligned with ``labels``.

    Returns
    -------
    None. Everything is written to stdout.
    """
    print(confusion_matrix(labels, preds))

    # (output template, scoring function) pairs, in the order they are reported.
    summary = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, metric in summary:
        print(template.format(metric(labels, preds)))
    
# Summarise the baseline model's predictions on the test split.
print_metrics(target_test, test_preds)

[[699  10]
 [ 82  43]]
Precision Score: 0.8113207547169812
Recall Score: 0.344
Accuracy Score: 0.8896882494004796
F1 Score: 0.4831460674157304


In [125]:
# Tuning the model, finding the optimal number of neighbors to use for the classifier 

def find_best_k(data_train, target_train, data_test, target_test, min_k=1, max_k=25):
    """Find the neighbour count with the highest F1 score and report it.

    For every ``k`` in ``range(min_k, max_k + 1, 2)`` a ``KNeighborsClassifier``
    is fitted on ``data_train`` / ``target_train`` and scored with ``f1_score``
    on ``data_test`` / ``target_test``. Because the step is 2, an odd ``min_k``
    means only odd values of ``k`` are tried.

    NOTE(review): whatever is passed as ``data_test`` / ``target_test`` is used
    to *choose* ``k``. If that is also the split used to report final
    performance, the reported score is optimistic; prefer a validation split.

    Parameters
    ----------
    data_train, target_train : features and labels used to fit each candidate.
    data_test, target_test : features and labels used to score each candidate.
    min_k : smallest neighbour count to try (must be at least 1).
    max_k : largest neighbour count to try (inclusive, must be >= ``min_k``).

    Returns
    -------
    (best_k, best_score) : the winning neighbour count and its F1 score. The
        same two values are also printed. Ties keep the smallest ``k``.

    Raises
    ------
    ValueError
        If the requested range contains no valid candidate.
    """
    if min_k < 1 or max_k < min_k:
        raise ValueError(
            "expected 1 <= min_k <= max_k, got min_k={}, max_k={}".format(min_k, max_k)
        )

    best_k = None
    # Start below any attainable F1 so the first candidate is always recorded;
    # otherwise an all-zero F1 run would report the invalid value k = 0.
    best_score = -1.0
    for k in range(min_k, max_k + 1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(data_train, target_train)
        preds = knn.predict(data_test)
        f1 = f1_score(target_test, preds)
        # Strict comparison: on a tie the smaller (earlier) k is kept.
        if f1 > best_score:
            best_k = k
            best_score = f1

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

In [126]:
# Use the scaled splits and labels defined by the cells above; the previous
# argument names were not defined anywhere in this notebook.
find_best_k(data_train, target_train, data_test, target_test)

Best Value for k: 1
F1-Score: 0.504424778761062


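Tuning the number of neighbors improved the F1 score from 0.48 to 0.50, with the best value found at k=1. This is still lower than we would like, but it is an improvement. Note that k was selected by scoring on the test set, so this figure is likely an optimistic estimate.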