In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

## Data Loading and preview

In [32]:
churn = pd.read_csv("customer-churn.csv")
churn.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Preview the data

In [33]:
churn.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


## format columns and categorica;s

In [34]:
churn.columns = churn.columns.str.lower().str.replace(" ", "_")

categorical_columns = churn.dtypes[churn.dtypes == "O"].index

for c in categorical_columns:
    churn[c] = churn[c].str.lower().str.replace(" ", "_")

churn.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


## clean total charges

In [35]:
churn.totalcharges = pd.to_numeric(churn.totalcharges, errors="coerce")
churn.totalcharges

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: totalcharges, Length: 7043, dtype: float64

## change the target variable to numeric

In [36]:
churn.churn = churn.churn.apply(lambda churn: churn == "yes" ).astype(int)
churn.churn.value_counts()

churn
0    5174
1    1869
Name: count, dtype: int64

In [37]:
df_full_train, df_test =  train_test_split(churn, test_size=0.2, random_state=1)
df_full_train.shape, df_test.shape

((5634, 21), (1409, 21))

In [38]:
df_train, df_val =  train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train.shape, df_val.shape

((4225, 21), (1409, 21))

In [39]:
df_train = df_train.reset_index()
df_test = df_test.reset_index()
df_val = df_val.reset_index()

In [40]:
y_train = df_train.churn
y_val = df_val.churn
y_test = df_test.churn

In [41]:
del df_train["churn"]
del df_val["churn"]
del df_test["churn"]

## EDA

In [42]:
df_full_train = df_full_train.reset_index()
df_full_train.isnull().sum()

index               0
customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        8
churn               0
dtype: int64

In [43]:
df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [44]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

0.27

In [45]:


numerical = ["tenure", "monthlycharges", "totalcharges"]

# list(set(df_full_train.columns) - set(numerical))

categorical = ['onlinesecurity', 'deviceprotection', 'gender', 'partner', 'paymentmethod', 'paperlessbilling', 'seniorcitizen', 'techsupport', 'phoneservice', 'dependents', 'onlinebackup', 'contract', 'multiplelines', 'streamingmovies', 'internetservice', 'streamingtv']

df_full_train[categorical].nunique()

onlinesecurity      3
deviceprotection    3
gender              2
partner             2
paymentmethod       4
paperlessbilling    2
seniorcitizen       2
techsupport         3
phoneservice        2
dependents          2
onlinebackup        3
contract            3
multiplelines       3
streamingmovies     3
internetservice     3
streamingtv         3
dtype: int64

## Churn rate in Different Group - feature importance 

In [46]:
global_churn_rate = df_full_train.churn.mean()
global_churn_rate

0.26996805111821087

In [47]:
gender_churn_rate = df_full_train.groupby("gender")["churn"].mean()

gender_churn_rate

gender
female    0.276824
male      0.263214
Name: churn, dtype: float64

In [48]:
partner_churn_rate = df_full_train.groupby("partner")["churn"].mean()
partner_churn_rate

partner
no     0.329809
yes    0.205033
Name: churn, dtype: float64

## Risk Ratio - feature importance 

divide group churn rate by global > 1, more likely to churn and < 1, less likely to churn

In [49]:
gender_churn_rate / global_churn_rate

gender
female    1.025396
male      0.974980
Name: churn, dtype: float64

In [50]:
partner_churn_rate / global_churn_rate

partner
no     1.221659
yes    0.759472
Name: churn, dtype: float64

## combined feature importance risk ratio and diff 

In [51]:
from IPython.display import display
for col in categorical:
    print(col)
    print("*" * 20)
    df_group = df_full_train.groupby(col).churn.agg(["mean", "count"])
    df_group["diff"] = df_group["mean"] - global_churn_rate
    df_group["risk"] = df_group["mean"] / global_churn_rate
    display(df_group)
    print("*" * 20)

onlinesecurity
********************


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757


********************
deviceprotection
********************


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348


********************
gender
********************


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498


********************
partner
********************


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472


********************
paymentmethod
********************


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121


********************
paperlessbilling
********************


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256


********************
seniorcitizen
********************


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208


********************
techsupport
********************


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239


********************
phoneservice
********************


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412


********************
dependents
********************


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651


********************
onlinebackup
********************


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466


********************
contract
********************


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473


********************
multiplelines
********************


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948


********************
streamingmovies
********************


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182


********************
internetservice
********************


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201


********************
streamingtv
********************


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328


********************


## Mutual Information 

tells us how much we learn about a variable from another variable

In [52]:
from sklearn.metrics import mutual_info_score

In [53]:
def mutual_info(series):
    return mutual_info_score(series, df_full_train.churn)

In [55]:
mi = df_full_train[categorical].apply(mutual_info)

mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

## Feature importance correlation 

In [56]:
df_full_train[numerical].corrwith(df_full_train["churn"])

['tenure', 'monthlycharges', 'totalcharges', 'churn']