In [1]:
# Importing Pandas and NumPy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
churn_data = pd.read_csv("churn_data_cleaned.csv")
churn_data.columns

Index(['mobile_number', 'arpu_6', 'arpu_7', 'arpu_8', 'onnet_mou_6',
       'onnet_mou_7', 'onnet_mou_8', 'offnet_mou_6', 'offnet_mou_7',
       'offnet_mou_8',
       ...
       'fb_7_2.0', 'fb_8_1.0', 'fb_8_2.0', 'total_data_amt_6',
       'total_data_amt_7', 'total_data_amt_8', 'churn',
       'days_forrech_before_month_end_6', 'days_forrech_before_month_end_7',
       'days_forrech_before_month_end_8'],
      dtype='object', length=144)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Putting feature variable to X
X = churn_data.drop(['churn','mobile_number'], axis=1)

X.head()

Unnamed: 0,arpu_6,arpu_7,arpu_8,onnet_mou_6,onnet_mou_7,onnet_mou_8,offnet_mou_6,offnet_mou_7,offnet_mou_8,roam_ic_mou_6,...,fb_7_1.0,fb_7_2.0,fb_8_1.0,fb_8_2.0,total_data_amt_6,total_data_amt_7,total_data_amt_8,days_forrech_before_month_end_6,days_forrech_before_month_end_7,days_forrech_before_month_end_8
0,1069.18,1349.85,3171.48,57.84,54.68,52.29,453.43,567.16,325.91,16.23,...,0,1,0,1,0.0,0.0,0.0,3.0,34.0,65.0
1,378.721,492.223,137.362,413.69,351.03,35.08,94.66,80.63,136.48,0.0,...,1,0,1,0,0.0,354.0,207.0,5.0,36.0,67.0
2,514.453,597.753,637.76,102.41,132.11,85.14,757.93,896.68,983.39,0.0,...,0,1,0,1,0.0,0.0,0.0,0.0,31.0,62.0
3,74.35,193.897,366.966,48.96,50.66,33.58,85.41,89.36,205.89,0.0,...,1,0,1,0,0.0,712.0,540.0,12.0,43.0,74.0
4,977.02,2362.833,409.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,5285.0,20424.0,455.0,0.0,31.0,62.0


In [5]:
# Putting response variable to y
y = churn_data['churn']

y.head()

0    1
1    0
2    0
3    0
4    0
Name: churn, dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [7]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.7, test_size=0.3, random_state=100)

In [8]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(ratio='auto')
X_train_balance, y_train_balance = smt.fit_sample(X_train,y_train)

In [9]:
### Checking the Churn Rate
churn = (sum(churn_data['churn'])/len(churn_data['churn'].index))*100
churn

5.599214145383105

## Model Building


In [10]:
import statsmodels.api as sm

In [11]:
# Logistic regression model
logm1 = sm.GLM(y_train_balance,(sm.add_constant(X_train_balance)), family = sm.families.Binomial())
logm1.fit().summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,37664
Model:,GLM,Df Residuals:,37526
Model Family:,Binomial,Df Model:,137
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-15356.
Date:,"Mon, 13 May 2019",Deviance:,30712.
Time:,20:12:46,Pearson chi2:,8.91e+04
No. Iterations:,100,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.6808,0.026,-64.105,0.000,-1.732,-1.629
x1,-0.1817,0.066,-2.747,0.006,-0.311,-0.052
x2,0.4209,0.079,5.354,0.000,0.267,0.575
x3,1.2147,0.094,12.881,0.000,1.030,1.400
x4,-6.1407,1.590,-3.863,0.000,-9.256,-3.025
x5,1.9294,1.289,1.497,0.134,-0.597,4.456
x6,1.8155,1.465,1.239,0.215,-1.056,4.688
x7,-4.9840,1.641,-3.038,0.002,-8.200,-1.768
x8,0.2462,1.289,0.191,0.849,-2.280,2.772


##  Feature Selection Using RFE

In [12]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [13]:
from sklearn.feature_selection import RFE
rfe = RFE(logreg, 30)             # running RFE with 13 variables as output
rfe = rfe.fit(X_train_balance, y_train_balance)









In [14]:
rfe.support_

array([False,  True,  True,  True, False, False, False,  True,  True,
       False, False, False, False,  True, False,  True, False, False,
       False,  True, False,  True,  True, False, False, False, False,
       False, False,  True,  True, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False,  True,  True, False, False,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False,

In [15]:
list(zip(X.columns, rfe.support_, rfe.ranking_))

[('arpu_6', False, 45),
 ('arpu_7', True, 1),
 ('arpu_8', True, 1),
 ('onnet_mou_6', True, 1),
 ('onnet_mou_7', False, 5),
 ('onnet_mou_8', False, 97),
 ('offnet_mou_6', False, 109),
 ('offnet_mou_7', True, 1),
 ('offnet_mou_8', True, 1),
 ('roam_ic_mou_6', False, 35),
 ('roam_ic_mou_7', False, 77),
 ('roam_ic_mou_8', False, 108),
 ('roam_og_mou_6', False, 59),
 ('roam_og_mou_7', True, 1),
 ('roam_og_mou_8', False, 20),
 ('loc_og_t2t_mou_6', True, 1),
 ('loc_og_t2t_mou_7', False, 54),
 ('loc_og_t2t_mou_8', False, 4),
 ('loc_og_t2m_mou_6', False, 24),
 ('loc_og_t2m_mou_7', True, 1),
 ('loc_og_t2m_mou_8', False, 95),
 ('loc_og_t2f_mou_6', True, 1),
 ('loc_og_t2f_mou_7', True, 1),
 ('loc_og_t2f_mou_8', False, 91),
 ('loc_og_t2c_mou_6', False, 23),
 ('loc_og_t2c_mou_7', False, 55),
 ('loc_og_t2c_mou_8', False, 57),
 ('loc_og_mou_6', False, 47),
 ('loc_og_mou_7', False, 64),
 ('loc_og_mou_8', True, 1),
 ('std_og_t2t_mou_6', True, 1),
 ('std_og_t2t_mou_7', False, 6),
 ('std_og_t2t_mou_8', Fa

In [16]:
##Choosing only the rfe selected feature as the set of columns

col = X.columns[rfe.support_]

##### Assessing the model with StatsModels

In [17]:
X_train_rfe = pd.DataFrame(data=X_train_balance).iloc[:, rfe.support_]
y_train_rfe = y_train_balance

In [18]:
X_train_sm = sm.add_constant(X_train_rfe)
logm2 = sm.GLM(y_train_rfe,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,37664
Model:,GLM,Df Residuals:,37633
Model Family:,Binomial,Df Model:,30
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-16493.
Date:,"Mon, 13 May 2019",Deviance:,32987.
Time:,20:19:14,Pearson chi2:,1.90e+05
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.5931,0.025,-64.511,0.000,-1.642,-1.545
1,0.5745,0.062,9.200,0.000,0.452,0.697
2,0.6124,0.075,8.203,0.000,0.466,0.759
3,-0.4190,0.136,-3.075,0.002,-0.686,-0.152
7,-1.0308,0.216,-4.766,0.000,-1.455,-0.607
8,-0.5154,0.038,-13.425,0.000,-0.591,-0.440
13,0.4806,0.033,14.397,0.000,0.415,0.546
15,0.4763,0.073,6.552,0.000,0.334,0.619
19,0.6565,0.109,6.042,0.000,0.444,0.869


In [19]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

0    0.143403
1    0.148379
2    0.749978
3    0.782227
4    0.160934
5    0.037214
6    0.739097
7    0.140984
8    0.031555
9    0.043683
dtype: float64

In [20]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

array([0.14340279, 0.14837914, 0.74997751, 0.78222694, 0.16093387,
       0.03721397, 0.73909734, 0.14098393, 0.0315555 , 0.04368279])

##### Creating a dataframe with the actual churn flag and the predicted probabilities

In [21]:
y_train_pred_final = pd.DataFrame({'churn':y_train_rfe, 'Churn_Prob':y_train_pred})
y_train_pred_final.head()

Unnamed: 0,churn,Churn_Prob
0,0,0.143403
1,0,0.148379
2,0,0.749978
3,0,0.782227
4,0,0.160934


##### Creating new column 'predicted' with 1 if Churn_Prob > 0.5 else 0

In [22]:
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

Unnamed: 0,churn,Churn_Prob,predicted
0,0,0.143403,0
1,0,0.148379,0
2,0,0.749978,1
3,0,0.782227,1
4,0,0.160934,0


In [23]:
from sklearn import metrics

In [24]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.churn, y_train_pred_final.predicted )
print(confusion)

[[14731  4101]
 [ 3004 15828]]


In [25]:
# Predicted     not_churn    churn
# Actual
# not_churn        14927    3903
# churn            3107       15723  

In [26]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.churn, y_train_pred_final.predicted))

0.811358326253186


In [27]:
X_test = pd.DataFrame(data=X_test).iloc[:, rfe.support_]
X_test.head()

Unnamed: 0,1,2,3,7,8,13,15,19,21,22,...,82,91,92,94,95,101,110,113,116,120
0,-0.651262,-0.623588,-0.578246,-0.775681,-0.721164,-0.210483,-0.375524,-0.691663,-0.290483,-0.297826,...,-0.108913,-0.946395,-0.894135,-0.635677,-0.577404,-0.656413,-0.385289,-0.425133,1.286999,1.421897
1,-1.016426,-0.830254,-0.578246,-0.775681,-0.721164,-0.210483,-0.375524,-0.691663,-0.290483,-0.297826,...,-0.108913,-0.52402,-0.142532,-0.994602,-0.783631,-0.482236,-0.385289,2.307925,-0.336358,-0.909653
2,-0.864972,-0.986036,-0.434823,-0.443625,-0.65834,-0.210483,-0.091884,-0.021638,-0.290483,-0.297826,...,-0.108913,-0.946395,-1.001507,-1.016927,-0.940757,-0.4474,-0.385289,-0.425133,-0.336358,1.730972
3,0.081439,0.123816,0.0608,-0.477767,-0.2266,-0.210483,0.222712,-0.340328,-0.290483,-0.278592,...,-0.108913,2.115824,1.682789,0.013478,0.139483,-0.4474,-0.385289,-0.425133,-0.336358,-0.439816
4,-0.319728,-0.289937,4.533992,0.420569,0.54605,-0.210483,9.734719,1.160909,-0.276736,-0.297826,...,-0.108913,-0.840801,-0.786763,-0.312817,-0.351535,0.249308,-0.385289,-0.425133,-0.336358,1.701932


In [28]:
X_test_sm = sm.add_constant(X_test)

## Making predictions on the test set

In [29]:
y_test_pred = res.predict(X_test_sm)

In [30]:
y_test_pred[:10]

0    0.009867
1    0.377061
2    0.628647
3    0.115134
4    0.059572
5    0.049363
6    0.686649
7    0.023962
8    0.254687
9    0.106289
dtype: float64

In [31]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [32]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [33]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [34]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [35]:
y_pred_final.head()

Unnamed: 0,churn,0
0,0,0.009867
1,0,0.377061
2,0,0.628647
3,0,0.115134
4,0,0.059572


In [36]:
#Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Churn_Prob'})

In [37]:
y_pred_final['final_predicted'] = y_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.55 else 0)

In [38]:
y_pred_final.head()

Unnamed: 0,churn,Churn_Prob,final_predicted
0,0,0.009867,0
1,0,0.377061,0
2,0,0.628647,1
3,0,0.115134,0
4,0,0.059572,0


In [39]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.churn, y_pred_final.final_predicted)

0.8180542563143124

In [40]:
confusion2 = metrics.confusion_matrix(y_pred_final.churn, y_pred_final.final_predicted )
confusion2

array([[6619, 1455],
       [ 101,  377]], dtype=int64)

In [41]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [42]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

0.7887029288702929

In [43]:
# Let us calculate specificity
TN / float(TN+FP)

0.8197919246965568