In [77]:
#import libraries

import imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [78]:
#import file

churnData=pd.read_csv('DATA_Customer-Churn.txt')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [79]:
#check the data types

churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [80]:
churnData['TotalCharges']

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: TotalCharges, Length: 7043, dtype: object

In [81]:
#convert TotalCharges into numeric type

churnData['TotalCharges'] = churnData['TotalCharges'].apply(lambda x : pd.to_numeric(x, errors='coerce'))

In [82]:
#Check for null values in the dataframe

churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [83]:
churnData.loc[churnData['TotalCharges'].isnull()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No


In [84]:
#Since our column is numeric now, we can replace the missing values using K nearest neighbors

#from sklearn.impute import SimpleImputer, KNNImputer 

#to create the real N/A -> np.nan

In [85]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['tenure'] * churnData['MonthlyCharges'])

In [87]:
#Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

Y = churnData["Churn"]

X = churnData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [88]:
#Split the data into a training set and a test set.

from sklearn.model_selection import train_test_split 

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=532)

In [94]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tenure,5634.0,32.103834,24.486396,0.0,9.0,28.0,55.0,72.0
SeniorCitizen,5634.0,0.160987,0.367551,0.0,0.0,0.0,0.0,1.0
MonthlyCharges,5634.0,64.791072,30.01477,18.25,35.825,70.275,89.85,118.6
TotalCharges,5634.0,2263.508395,2256.358225,0.0,393.225,1384.175,3769.925,8684.8


In [95]:
#Scale the features either by using MinMaxScaler or a standard scaler.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
pd.DataFrame(X_train_scaled).head()

Unnamed: 0,0,1,2,3
0,0.013889,1.0,0.618336,0.009246
1,0.069444,0.0,0.326358,0.033023
2,0.5,0.0,0.673642,0.34584
3,0.916667,0.0,0.448929,0.482418
4,0.208333,1.0,0.557549,0.130561


In [96]:
#Fit a Knn Classifier (NOT KnnRegressor please!)model on the training data.

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, Y_train)


KNeighborsClassifier()

In [98]:
X_test_scaled = scaler.transform(X_test)

In [99]:
y_pred = knn.predict(X_test_scaled)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [103]:
Y_test

1409

In [104]:
y_pred

array(['No', 'No', 'No', ..., 'No', 'Yes', 'No'], dtype=object)

In [109]:
#Fit a Decision Tree Classifier on the training data.

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

model = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, max_depth=10)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
#Compare the accuracy, precision, recall for the previous models on both the train and test sets.