In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
import imblearn
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/cleaned_data.csv')



In [2]:
# Display the loaded frame to check its columns and values.
df

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,0,1,265.1,110,197.4,99,244.7,91,10.0,3,1,0
1,107,0,1,161.6,123,195.5,103,254.4,103,13.7,3,1,0
2,137,0,0,243.4,114,121.2,110,162.6,104,12.2,5,0,0
3,84,1,0,299.4,71,61.9,88,196.9,89,6.6,7,2,0
4,75,1,0,166.7,113,148.3,122,186.9,121,10.1,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,0,1,156.2,77,215.5,126,279.1,83,9.9,6,2,0
3329,68,0,0,231.1,57,153.4,55,191.3,123,9.6,4,3,0
3330,28,0,0,180.8,109,288.8,58,191.9,91,14.1,6,2,0
3331,184,1,0,213.8,105,159.6,84,139.2,137,5.0,10,2,0


In [5]:
def evaluate(model, dt_test, dt_train, target_train, target_test):
    """Print a confusion matrix and classification report for both splits.

    Predictions are made for both splits up front; the test-split results are
    printed first, followed by a blank separator and the train-split results.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    dt_test : features passed to ``model.predict`` for the test split.
    dt_train : features passed to ``model.predict`` for the train split.
    target_train : true labels matching ``dt_train``.
    target_test : true labels matching ``dt_test``.

    Note the argument order: features are given as (test, train) while the
    targets are given as (train, test).

    Returns
    -------
    None. All results are printed.
    """
    test_predictions = model.predict(dt_test)
    train_predictions = model.predict(dt_train)

    sections = (
        ('Evaluations for test:\n', target_test, test_predictions),
        ('Evaluations for train:\n', target_train, train_predictions),
    )
    for position, (heading, truth, predicted) in enumerate(sections):
        if position:
            # Blank separator between the test and train reports.
            print('\n')
        print(heading, confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

In [6]:
# One-hot encode any categorical columns of the loaded frame, then preview
# the first five rows of the result.
one_hot_knn = pd.get_dummies(data=df)
one_hot_knn.head(n=5)

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,0,1,265.1,110,197.4,99,244.7,91,10.0,3,1,0
1,107,0,1,161.6,123,195.5,103,254.4,103,13.7,3,1,0
2,137,0,0,243.4,114,121.2,110,162.6,104,12.2,5,0,0
3,84,1,0,299.4,71,61.9,88,196.9,89,6.6,7,2,0
4,75,1,0,166.7,113,148.3,122,186.9,121,10.1,3,3,0


In [7]:
# Separate the target from the features without mutating the frame in place.
# The previous in-place drop made this cell fail with KeyError: 'churn' when it
# was run a second time, because the column had already been removed.
# 'churn' is carried through get_dummies unchanged (it still appears as a single
# column in the encoded preview), so the labels can be read straight from df.
labels = df['churn']
# errors='ignore' keeps the cell re-runnable once 'churn' has been dropped.
one_hot_knn = one_hot_knn.drop(columns='churn', errors='ignore')

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(
    one_hot_knn,
    labels,
    test_size=0.25,
    random_state=42,
)
data_train, data_test, target_train, target_test = split_parts

In [9]:
# Oversample the training split only, so the test split keeps its original
# class balance.
# NOTE(review): SMOTE runs here on the unscaled features (the StandardScaler
# cell comes afterwards) - confirm that resampling before scaling is intended.
smote_sampler = SMOTE(random_state=42)
data_train_smoted, target_train_smoted = smote_sampler.fit_resample(data_train, target_train)

# Downstream cells read the resampled data through the original names.
data_train, target_train = data_train_smoted, target_train_smoted

In [10]:
# Standardise the features after the train/test split: the scaler learns its
# statistics from the training data only, so nothing about the test split
# leaks into preprocessing.

from sklearn.preprocessing import StandardScaler

# fit() followed by transform() on the same data is what fit_transform() does.
scaler = StandardScaler().fit(data_train)

# Apply the training-set statistics to both splits.
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

# Re-attach the feature names for a readable preview of the scaled training data.
feature_names = one_hot_knn.columns
scaled_df_train = pd.DataFrame(data=data_train, columns=feature_names)
scaled_df_train.head(n=5)

Unnamed: 0,account length,international plan,voice mail plan,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls
0,-1.519452,-0.507383,-0.582492,-1.824218,1.359193,0.957614,0.41171,1.141056,-0.021088,2.856698,-0.564665,-0.582807
1,0.364407,-0.507383,-0.582492,0.113033,0.529269,0.429843,0.41171,2.376974,0.680723,1.039711,-1.457398,0.070857
2,0.525881,-0.507383,1.925132,0.791154,-1.960503,0.002029,0.888895,-0.096985,-0.830871,-0.13376,0.328068,1.378185
3,2.113704,-0.507383,-0.582492,-0.752722,1.722285,-0.499753,0.676813,-0.362432,-0.7229,-0.13376,-0.118299,-0.582807
4,0.28367,-0.507383,-0.582492,0.282563,1.099842,-0.819614,-0.330576,-1.294679,0.032897,0.812588,1.2208,0.724521


In [11]:
# Baseline k-nearest-neighbours classifier with scikit-learn's default settings.
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
clf.fit(data_train, target_train)

# fit() returns the estimator itself, so both names refer to the same fitted model.
knn_clf = clf

# Predictions on the held-out test split, reused by the metric cells below.
test_preds = clf.predict(data_test)

In [12]:
# Report the baseline model on the test split and on the (SMOTE-resampled,
# scaled) training split. Features are passed as (test, train), targets as
# (train, test), matching the signature of evaluate().
evaluate(knn_clf, data_test, data_train, target_train, target_test)

Evaluations for test:
 [[577 132]
 [ 38  87]]
              precision    recall  f1-score   support

           0       0.94      0.81      0.87       709
           1       0.40      0.70      0.51       125

    accuracy                           0.80       834
   macro avg       0.67      0.75      0.69       834
weighted avg       0.86      0.80      0.82       834



Evaluations for train:
 [[1882  259]
 [  12 2129]]
              precision    recall  f1-score   support

           0       0.99      0.88      0.93      2141
           1       0.89      0.99      0.94      2141

    accuracy                           0.94      4282
   macro avg       0.94      0.94      0.94      4282
weighted avg       0.94      0.94      0.94      4282



In [13]:
# Evaluating the model
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    """Print the confusion matrix followed by four summary scores.

    Parameters
    ----------
    labels : true class labels.
    preds : predicted class labels, aligned with ``labels``.

    Prints, in order: confusion matrix, precision, recall, accuracy and F1.
    Returns None.
    """
    print(confusion_matrix(labels, preds))

    # Each line template is paired with the scorer that fills it in.
    report_lines = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(labels, preds)))
    
# Summarise the baseline predictions (test_preds) against the true test labels.
print_metrics(target_test, test_preds)

[[577 132]
 [ 38  87]]
Precision Score: 0.3972602739726027
Recall Score: 0.696
Accuracy Score: 0.7961630695443646
F1 Score: 0.505813953488372


In [14]:
# Tuning the model, finding the optimal number of neighbors to use for the classifier 

def find_best_k(data_train, target_train, data_test, target_test, min_k=1, max_k=25, step=2):
    """Find the number of neighbours that maximises F1 on the evaluation data.

    A KNeighborsClassifier is fitted on the training data for every candidate
    ``k`` in ``range(min_k, max_k + 1, step)`` and scored with ``f1_score`` on
    ``data_test`` / ``target_test``. The best ``k`` and its score are printed
    and also returned so the caller can refit a model with the chosen value.

    Parameters
    ----------
    data_train, target_train : features and labels used to fit each candidate.
    data_test, target_test : features and labels used to score each candidate.
    min_k : smallest candidate ``k`` (must be at least 1).
    max_k : largest candidate ``k`` considered (inclusive, must be >= min_k).
    step : spacing between candidates; the default of 2 combined with
        ``min_k=1`` searches odd values only.

    Returns
    -------
    (best_k, best_score) : the candidate with the highest F1 and that F1 value.
        Ties are resolved in favour of the smallest ``k``.

    Raises
    ------
    ValueError : if the candidate range is empty or contains non-positive k.

    Notes
    -----
    If the evaluation data passed here is also the final hold-out set, the
    reported score is an optimistic estimate, because ``k`` has been selected
    on that same data. Pass a separate validation split to avoid this.
    """
    if min_k < 1:
        raise ValueError("min_k must be at least 1, got {}".format(min_k))
    if max_k < min_k:
        raise ValueError("max_k ({}) must not be smaller than min_k ({})".format(max_k, min_k))
    if step < 1:
        raise ValueError("step must be at least 1, got {}".format(step))

    # Start with no winner so the first candidate is always recorded; the
    # function can then never report the invalid value k=0, even when every
    # candidate scores an F1 of 0.
    best_k = None
    best_score = None
    for k in range(min_k, max_k + 1, step):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(data_train, target_train)
        preds = knn.predict(data_test)
        f1 = f1_score(target_test, preds)
        # Strict comparison keeps the smallest k when several candidates tie.
        if best_score is None or f1 > best_score:
            best_k, best_score = k, f1

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

In [18]:
# Search the default candidate range (odd k from 1 to 25) for the best F1.
# NOTE(review): candidates are scored on data_test/target_test, the same split
# used to report results above - confirm this is intended rather than a
# separate validation split or cross-validation.
find_best_k(data_train, target_train, data_test, target_test)

Best Value for k: 23
F1-Score: 0.5976331360946746


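Tuning the number of neighbors improved the test-set F1 score from 0.51 (default settings) to 0.60, with the best value found at k=23. This is still lower than we would like, but it is an improvement. Note that k was chosen using the test set itself, so the 0.60 figure is likely optimistic.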