Libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore", UserWarning)

Ingestion

In [19]:
path = r'../data/processed/cleaned_data.csv'
df_telco = pd.read_csv(path)
df = df_telco.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,1,5575-gnvde,male,0,no,no,34,yes,no,dsl,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,2,3668-qpybk,male,0,no,no,2,yes,no,dsl,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


In [20]:
df.iloc[:,-1].head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

Experimentation

Train Test Split

In [22]:
from sklearn.model_selection import train_test_split
df_trainfull, df_test = train_test_split(df, test_size = 0.2, random_state=1)
df_train, df_val = train_test_split(df_trainfull, test_size=0.33, random_state=11)

X_train, X_test, X_val = df_train.iloc[:,1:-1] , df_test.iloc[:,1:-1] , df_val.iloc[:,1:-1]
y_train, y_test, y_val = df_train.iloc[:,-1] , df_test.iloc[:,-1] , df_val.iloc[:,-1]

EDA

In [23]:
df_trainfull['churn'].value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [24]:
categorical_features = ['gender', 'seniorcitizen', 'partner', 'dependents', #'customerid',
                        'phoneservice', 'multiplelines', 'internetservice', #'tenure',
                        'onlinesecurity', 'onlinebackup', 'deviceprotection', 
                        'techsupport', 'streamingtv', 'streamingmovies', 
                        'contract', 'paperlessbilling', 'paymentmethod', 
                        #'monthlycharges', 'totalcharges', 
                        # 'churn'
                        ]
                    
numerical_features = ['tenure', 'monthlycharges', 'totalcharges']

In [25]:
df_trainfull[categorical_features].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [26]:
print('Global_mean = {:.3f}'.format(df_trainfull['churn'].mean() ))
print('Female_mean = {:.3f}'.format(df_trainfull[df['gender']=='female']['churn'].mean() ))
print('Male_mean   = {:.3f}'.format(df_trainfull[df['gender']=='male']['churn'].mean() ))
print('Partner Yes = {:.3f}'.format(df_trainfull[df['partner']=='yes']['churn'].mean() ))
print('Partner No  = {:.3f}'.format(df_trainfull[df['partner']=='no']['churn'].mean() ))

Global_mean = 0.270
Female_mean = 0.277
Male_mean   = 0.263
Partner Yes = 0.205
Partner No  = 0.330


Risks Ratios

In [27]:
global_churn = df['churn'].mean()

churn_metrics = {'FEATURE' : ['Global'],
                 'CATEGORY' : ['Global'],
                 'CHURN_RATE' : [round(global_churn,3)],
                 'CHURN_DIFF' : [0],
                 'RISK_RATIO' : [1]
                 }

for feature in categorical_features:
    for category in df[feature].unique():
        
        churn_rate  = df['churn'][df[feature]==category].mean()
        risk_ratio  = churn_rate / global_churn
        churn_diff  = churn_rate - global_churn
        
        churn_metrics['FEATURE'].append(feature)
        churn_metrics['CATEGORY'].append(category)
        churn_metrics['CHURN_RATE'].append(round(churn_rate,3))
        churn_metrics['CHURN_DIFF'].append(round(churn_diff,3))
        churn_metrics['RISK_RATIO'].append(round(risk_ratio,3))

df_riskratio = pd.DataFrame(churn_metrics)
df_riskratio#.sort_values(#by='CHURN_RATE',
                         #by='CHURN_DIFF',
                         #by='RISK_RATIO', 
                         #ascending=False
                         #)

Unnamed: 0,FEATURE,CATEGORY,CHURN_RATE,CHURN_DIFF,RISK_RATIO
0,Global,Global,0.265,0.0,1.0
1,gender,female,0.269,0.004,1.014
2,gender,male,0.262,-0.004,0.986
3,seniorcitizen,0,0.236,-0.029,0.89
4,seniorcitizen,1,0.417,0.151,1.571
5,partner,yes,0.197,-0.069,0.741
6,partner,no,0.33,0.064,1.242
7,dependents,no,0.313,0.047,1.179
8,dependents,yes,0.155,-0.111,0.582
9,phoneservice,no,0.249,-0.016,0.939


Mutual Information Score

In [28]:
from sklearn.metrics import mutual_info_score

def calc_mi(data):
    return mutual_info_score(data, df_trainfull['churn'])

df_mi = df_trainfull[categorical_features].apply(calc_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


Correlation coeffcient

In [29]:
df_cc = df_trainfull[numerical_features].corrwith(df_trainfull['churn'])
df_cc = df_cc.sort_values(ascending=False).to_frame(name='CC')
df_cc

Unnamed: 0,CC
monthlycharges,0.196805
totalcharges,-0.196353
tenure,-0.351885
