In [None]:
# Mount Google Drive at /content/drive so files stored in Drive can be read by
# path from this notebook. Only works inside Google Colab (google.colab is not
# available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

# Library and Data Import

In [None]:
import pandas as pd                                            
import numpy as np
from sklearn.model_selection import train_test_split 
import sklearn.model_selection as ms             
from sklearn.metrics import confusion_matrix 
import statsmodels.api as sm                      

# Location of the churn dataset inside the mounted Drive folder.
file_path = '/content/drive/MyDrive/INFO-614 Data Mining/churn.csv'

# Show every column when a frame is rendered (no "..." truncation).
pd.options.display.max_columns = None

# Load the CSV and preview the first five rows.
df = pd.read_csv(file_path)
df.head(5)

# Remove Rows with a Missing Value

In [None]:
# Report how many missing values each column holds before anything is dropped.
missing_per_column = df.isna().sum()
display(missing_per_column)

# Discard every row that has at least one missing value in ANY column.
# NOTE(review): this runs before the column subset is taken, so rows are also
# dropped for gaps in columns the model never uses -- confirm this is intended.
df = df.dropna(axis=0, how='any')

# Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the quartiles listed are describe()'s defaults.
df_summary = df.describe(percentiles=[0.25, 0.5, 0.75])
df_summary

# Subset the DataFrame

In [None]:
# Normalise the column labels (spaces -> underscores) so every column can be
# referenced by a single identifier-style name. regex=False makes the
# replacement a plain literal substitution.
df.columns = df.columns.str.replace(' ', '_', regex=False)

# Columns kept for the analysis; Customer_Status is placed last.
selected_columns = [
    "Gender",
    "Age",
    "Married",
    "Offer",
    "Number_of_Dependents",
    "Tenure_in_Months",
    "Multiple_Lines",
    "Internet_Service",
    "Paperless_Billing",
    "Total_Charges",
    "Total_Revenue",
    "Customer_Status",
]

# .copy() yields an independent frame: the column assignments made during
# preprocessing then neither raise SettingWithCopyWarning nor depend on
# whether pandas returned a view of the original data.
df = df[selected_columns].copy()

# Preprocessing

In [None]:
def _encode_categories(column, codes):
    """Return `column` with each value found in `codes` replaced by its code.

    Values that are not keys of `codes` pass through unchanged, which is the
    same contract as `Series.replace`. Going through `Series.map` lets pandas
    infer an integer dtype directly when every value was mapped, instead of
    relying on the implicit downcasting of `replace` (deprecated in pandas 2.2).

    Parameters
    ----------
    column : pandas.Series
        Categorical column to encode.
    codes : dict
        Mapping from raw category label to integer code.

    Returns
    -------
    pandas.Series
        Encoded column, same index as `column`.
    """
    return column.map(lambda value: codes.get(value, value))


# Shared binary coding for the Yes/No columns.
yes_no_codes = {'No': 0, 'Yes': 1}

# Integer codes per categorical column; columns are encoded in this order.
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': yes_no_codes,
    'Offer': {
        'None': 0,
        'Offer A': 1,
        'Offer B': 2,
        'Offer C': 3,
        'Offer D': 4,
        'Offer E': 5,
    },
    'Multiple_Lines': yes_no_codes,
    'Internet_Service': yes_no_codes,
    'Paperless_Billing': yes_no_codes,
    # 'Joined' is deliberately left unmapped: those rows are removed below.
    'Customer_Status': {'Stayed': 0, 'Churned': 1},
}

# Columns whose raw category counts are shown just before they are encoded.
columns_to_preview = (
    'Offer',
    'Multiple_Lines',
    'Internet_Service',
    'Paperless_Billing',
    'Customer_Status',
)

# Work on an explicit copy so the assignments below never write into a slice
# of an earlier frame (avoids SettingWithCopyWarning).
df = df.copy()

for column_name, codes in category_codes.items():
    if column_name in columns_to_preview:
        display(df[column_name].value_counts())
    df[column_name] = _encode_categories(df[column_name], codes)

# Remove customers whose status is 'Joined' (neither 'Stayed' nor 'Churned'),
# then make the status column a proper integer column.
df = df[df['Customer_Status'] != 'Joined'].copy()
df['Customer_Status'] = df['Customer_Status'].astype(int)

# Correlation of every column with the churn indicator.
df.corrwith(df["Customer_Status"])

# Attributes for the Model

In [None]:
# Attributes fed to the model. "Offer" is no longer included;
# Customer_Status stays as the last column.
model_columns = [
    "Gender",
    "Age",
    "Married",
    "Number_of_Dependents",
    "Tenure_in_Months",
    "Multiple_Lines",
    "Internet_Service",
    "Paperless_Billing",
    "Total_Charges",
    "Total_Revenue",
    "Customer_Status",
]
df = df.loc[:, model_columns]

# Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column except the last is a feature; the last column is the label.
features = df.iloc[:, :-1]
y = df.iloc[:, -1].astype('int')

# Standardise each feature (zero mean, unit variance). X becomes a NumPy array.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling -- consider fitting on the
# training rows only (e.g. via a Pipeline).
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Set Train and Test Sets

In [None]:
# Hold out 40% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

# Model Fit

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (k = 83) on the training split.
model = KNeighborsClassifier(n_neighbors=83).fit(X_train, y_train)

# Accuracy on the data the model was fitted on.
train_accuracy = model.score(X_train, y_train)

# Predictions for the held-out rows; evaluated in the next cell.
y_pred = model.predict(X_test)

print('Train_Accuracy: ', train_accuracy, '\n')

# Test

In [None]:
import sklearn.metrics as mt

# Score the held-out predictions (y_pred) against the true test labels.
accuracy, recall, precision, f1_score = (
    scorer(y_test, y_pred)
    for scorer in (
        mt.accuracy_score,
        mt.recall_score,
        mt.precision_score,
        mt.f1_score,
    )
)
matrix = mt.confusion_matrix(y_test, y_pred)

# Print each metric rounded to two decimals, followed by a blank line.
for label, value in (
    ('Accuracy: ', accuracy),
    ('Recall: ', recall),
    ('Precision: ', precision),
    ('F1_score: ', f1_score),
):
    print(label, format(value, '.2f'), '\n')

print('Confusion Matrix:', '\n', matrix)

# Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import cross_val_predict

# 10-fold cross-validated predictions over the full (scaled) data set.
# NOTE(review): k = 3 here, whereas the model-fit cell uses k = 83 -- confirm
# the difference is intentional.
model = KNeighborsClassifier(n_neighbors=3)
y_pred_cross = cross_val_predict(model, X, y, cv=10)

# Same metrics as the test cell, computed on the out-of-fold predictions.
accuracy, recall, precision, f1_score = (
    scorer(y, y_pred_cross)
    for scorer in (
        mt.accuracy_score,
        mt.recall_score,
        mt.precision_score,
        mt.f1_score,
    )
)
matrix = mt.confusion_matrix(y, y_pred_cross)

# Print each metric rounded to two decimals, followed by a blank line.
for label, value in (
    ('Accuracy: ', accuracy),
    ('Recall: ', recall),
    ('Precision: ', precision),
    ('F1_score: ', f1_score),
):
    print(label, format(value, '.2f'), '\n')

print('Confusion Matrix:', '\n', matrix)

# Parameter Tuning

In [None]:
import matplotlib.pyplot as plt

# Candidate neighbourhood sizes: k = 11 .. 100 inclusive.
# Rule of thumb noted by the author: k = sqrt(N)
# (N presumably being the number of samples -- confirm).
k_range = range(11, 101)

# Test-split accuracy for each candidate k, in the same order as k_range.
# NOTE(review): accuracy is measured on X_test/y_test, so choosing k from this
# curve tunes the model on the test set; a validation split or cross-validation
# would keep the test set untouched.
accuracy_list = []
for k in k_range:
    # After the loop, `model` and `y_pred` hold the results for the last k (100).
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_list.append(mt.accuracy_score(y_test, y_pred))

# Accuracy as a function of k: blue circle markers joined by a dashed line.
plt.plot(k_range, accuracy_list, marker='o', linestyle='--', color='blue')
plt.xlabel("k")
plt.ylabel("test accuracy")
plt.show()