# Libraries

In [7]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import tqdm
import hyperopt
import sys
import scipy

import lightgbm
from catboost import Pool, CatBoostClassifier, cv
# ,CatboostIpythonWidget

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from IPython.display import display, HTML

# input data

In [9]:
churn = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [5]:
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
churn.shape
# churn.isnull().sum(axis=0)

(7043, 21)

# Dividing variables into categorical and Numeric

In [12]:
churn = churn.drop(['customerID'],axis=1)
vars=churn.dtypes
categorical=[]
numeric=[]
for i in range(0,len(vars)):
    if vars[i]=="object": 
        categorical.append(churn.columns[i])
    else:
        numeric.append(churn.columns[i]) 

# Variable processing

In [13]:
#lable encoding for categorical variables
df1=churn[categorical].apply(LabelEncoder().fit_transform)
df2=churn[numeric]
df3=pd.concat([df1, df2], axis=1)
df3['TotalCharges']=df3['TotalCharges'].astype(float)

# Split data in Train and Test datsets

In [14]:
train, test = train_test_split(df3, test_size=0.2)
Churn_X =train['Churn'] 
train = train.drop(['Churn'],axis=1)
Churn_Y =test['Churn'] 
test = test.drop(['Churn'],axis=1)

# 1. Xgboost Model

In [15]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train,Churn_X)
predictions = gbm.predict(test)
accuracy_score(Churn_Y, predictions)

0.8147622427253371

In [27]:
gbm.fit(test, Churn_Y)

# 2.Catboost Model

In [16]:
categorical_features_indices = np.where(train.dtypes != np.float)[0]
categorical_features_indices

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(train.dtypes != np.float)[0]


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17])

In [17]:
model = CatBoostClassifier(iterations=1200, learning_rate=0.02, depth=7, loss_function='Logloss', eval_metric='AUC', random_seed=99, od_type='Iter', od_wait=100) 
model.fit(train, Churn_X,cat_features=categorical_features_indices,eval_set=(test, Churn_Y),plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7902047	best: 0.7902047 (0)	total: 54.8ms	remaining: 1m 5s
1:	test: 0.8046166	best: 0.8046166 (1)	total: 58.3ms	remaining: 34.9s
2:	test: 0.8091285	best: 0.8091285 (2)	total: 65.9ms	remaining: 26.3s
3:	test: 0.8106551	best: 0.8106551 (3)	total: 68.9ms	remaining: 20.6s
4:	test: 0.8127962	best: 0.8127962 (4)	total: 75.3ms	remaining: 18s
5:	test: 0.8139671	best: 0.8139671 (5)	total: 81.9ms	remaining: 16.3s
6:	test: 0.8141387	best: 0.8141387 (6)	total: 85.5ms	remaining: 14.6s
7:	test: 0.8128294	best: 0.8141387 (6)	total: 92.1ms	remaining: 13.7s
8:	test: 0.8129872	best: 0.8141387 (6)	total: 96.7ms	remaining: 12.8s
9:	test: 0.8129858	best: 0.8141387 (6)	total: 102ms	remaining: 12.1s
10:	test: 0.8127367	best: 0.8141387 (6)	total: 105ms	remaining: 11.4s
11:	test: 0.8123990	best: 0.8141387 (6)	total: 111ms	remaining: 11s
12:	test: 0.8164459	best: 0.8164459 (12)	total: 118ms	remaining: 10.8s
13:	test: 0.8181482	best: 0.8181482 (13)	total: 124ms	remaining: 10.5s
14:	test: 0.8182700	bes

124:	test: 0.8401031	best: 0.8401086 (123)	total: 862ms	remaining: 7.41s
125:	test: 0.8401501	best: 0.8401501 (125)	total: 869ms	remaining: 7.41s
126:	test: 0.8401667	best: 0.8401667 (126)	total: 877ms	remaining: 7.41s
127:	test: 0.8402567	best: 0.8402567 (127)	total: 884ms	remaining: 7.41s
128:	test: 0.8402401	best: 0.8402567 (127)	total: 891ms	remaining: 7.4s
129:	test: 0.8401571	best: 0.8402567 (127)	total: 898ms	remaining: 7.39s
130:	test: 0.8401958	best: 0.8402567 (127)	total: 905ms	remaining: 7.38s
131:	test: 0.8402373	best: 0.8402567 (127)	total: 914ms	remaining: 7.39s
132:	test: 0.8403674	best: 0.8403674 (132)	total: 921ms	remaining: 7.39s
133:	test: 0.8403148	best: 0.8403674 (132)	total: 928ms	remaining: 7.38s
134:	test: 0.8402761	best: 0.8403674 (132)	total: 934ms	remaining: 7.36s
135:	test: 0.8402872	best: 0.8403674 (132)	total: 941ms	remaining: 7.36s
136:	test: 0.8402789	best: 0.8403674 (132)	total: 948ms	remaining: 7.36s
137:	test: 0.8404256	best: 0.8404256 (137)	total: 95

240:	test: 0.8417182	best: 0.8417778 (231)	total: 1.66s	remaining: 6.62s
241:	test: 0.8416795	best: 0.8417778 (231)	total: 1.67s	remaining: 6.61s
242:	test: 0.8416573	best: 0.8417778 (231)	total: 1.68s	remaining: 6.61s
243:	test: 0.8416435	best: 0.8417778 (231)	total: 1.69s	remaining: 6.62s
244:	test: 0.8417265	best: 0.8417778 (231)	total: 1.7s	remaining: 6.61s
245:	test: 0.8417016	best: 0.8417778 (231)	total: 1.7s	remaining: 6.6s
246:	test: 0.8416435	best: 0.8417778 (231)	total: 1.71s	remaining: 6.59s
247:	test: 0.8416546	best: 0.8417778 (231)	total: 1.72s	remaining: 6.58s
248:	test: 0.8416518	best: 0.8417778 (231)	total: 1.72s	remaining: 6.58s
249:	test: 0.8416850	best: 0.8417778 (231)	total: 1.73s	remaining: 6.57s
250:	test: 0.8417376	best: 0.8417778 (231)	total: 1.74s	remaining: 6.57s
251:	test: 0.8417404	best: 0.8417778 (231)	total: 1.75s	remaining: 6.57s
252:	test: 0.8417847	best: 0.8417847 (252)	total: 1.75s	remaining: 6.57s
253:	test: 0.8417459	best: 0.8417847 (252)	total: 1.76

374:	test: 0.8416449	best: 0.8418359 (350)	total: 2.66s	remaining: 5.85s
375:	test: 0.8416338	best: 0.8418359 (350)	total: 2.67s	remaining: 5.84s
376:	test: 0.8416172	best: 0.8418359 (350)	total: 2.67s	remaining: 5.84s
377:	test: 0.8416477	best: 0.8418359 (350)	total: 2.69s	remaining: 5.84s
378:	test: 0.8416477	best: 0.8418359 (350)	total: 2.69s	remaining: 5.83s
379:	test: 0.8415951	best: 0.8418359 (350)	total: 2.7s	remaining: 5.82s
380:	test: 0.8415785	best: 0.8418359 (350)	total: 2.71s	remaining: 5.82s
381:	test: 0.8415093	best: 0.8418359 (350)	total: 2.71s	remaining: 5.81s
382:	test: 0.8415093	best: 0.8418359 (350)	total: 2.71s	remaining: 5.79s
383:	test: 0.8414290	best: 0.8418359 (350)	total: 2.72s	remaining: 5.79s
384:	test: 0.8414151	best: 0.8418359 (350)	total: 2.73s	remaining: 5.78s
385:	test: 0.8415148	best: 0.8418359 (350)	total: 2.74s	remaining: 5.77s
386:	test: 0.8415148	best: 0.8418359 (350)	total: 2.74s	remaining: 5.75s
387:	test: 0.8415148	best: 0.8418359 (350)	total: 2.

<catboost.core.CatBoostClassifier at 0x7fbfebec10a0>

In [18]:
preds_class = model.predict(test)
preds_proba = model.predict_proba(test)

In [19]:
print("class = ", preds_class)

class =  [0 0 0 ... 1 1 0]


In [20]:
print("proba = ", preds_proba)

proba =  [[0.79347984 0.20652016]
 [0.78600389 0.21399611]
 [0.5610552  0.4389448 ]
 ...
 [0.48170005 0.51829995]
 [0.49858908 0.50141092]
 [0.53367841 0.46632159]]


In [21]:
accuracy_score(Churn_Y, preds_class)

0.8204400283889283

# 3.Litegbm Model

In [22]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       num_class=1,early_stopping = 50,num_iteration=10000,num_leaves=31,
                       is_enable_sparse='true',tree_learner='data',min_data_in_leaf=400,max_depth=8,
                       learning_rate=0.1, n_estimators=100, max_bin=255, subsample_for_bin=50000, 
                       min_split_gain=5, min_child_weight=5, min_child_samples=10, subsample=0.995, 
                       subsample_freq=1, colsample_bytree=1, reg_alpha=0, 
                       reg_lambda=0, seed=0, nthread=-1, silent=True)

In [23]:
lgbm.fit(train, Churn_X, eval_set=[(test, Churn_Y)],eval_metric='l1',
        early_stopping_rounds=5)

[1]	valid_0's l1: 0.371194	valid_0's binary_logloss: 0.529813
[2]	valid_0's l1: 0.362365	valid_0's binary_logloss: 0.511777
[3]	valid_0's l1: 0.354408	valid_0's binary_logloss: 0.496869
[4]	valid_0's l1: 0.347087	valid_0's binary_logloss: 0.484469
[5]	valid_0's l1: 0.340589	valid_0's binary_logloss: 0.474382
[6]	valid_0's l1: 0.334664	valid_0's binary_logloss: 0.46579
[7]	valid_0's l1: 0.329287	valid_0's binary_logloss: 0.458199
[8]	valid_0's l1: 0.324355	valid_0's binary_logloss: 0.45222
[9]	valid_0's l1: 0.319991	valid_0's binary_logloss: 0.446371
[10]	valid_0's l1: 0.315862	valid_0's binary_logloss: 0.441255
[11]	valid_0's l1: 0.31223	valid_0's binary_logloss: 0.437654
[12]	valid_0's l1: 0.308597	valid_0's binary_logloss: 0.433633
[13]	valid_0's l1: 0.305457	valid_0's binary_logloss: 0.430656
[14]	valid_0's l1: 0.30271	valid_0's binary_logloss: 0.428378
[15]	valid_0's l1: 0.300039	valid_0's binary_logloss: 0.425694
[16]	valid_0's l1: 0.297556	valid_0's binary_logloss: 0.423378
[17]	



In [24]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
y_pred = lgbm.predict(test, num_iteration=lgbm.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(Churn_Y, y_pred) ** 0.5)

The rmse of prediction is: 0.42374517296492203


In [25]:
print('Feature importances:', list(lgbm.feature_importances_))

Feature importances: [0, 0, 2, 0, 11, 5, 16, 8, 0, 9, 1, 6, 34, 9, 15, 5, 5, 67, 60]
