In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [3]:
# !wget $data -O data-week-3.csv

In [4]:
df = pd.read_csv('data-week-3.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [7]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

In [8]:
df.churn = (df.churn == 'yes').astype(int)

# Setting up the validation framework

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [11]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [12]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [13]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [15]:
y_train, y_val, y_test

(array([0, 0, 1, ..., 1, 0, 1], shape=(4225,)),
 array([0, 0, 0, ..., 0, 1, 1], shape=(1409,)),
 array([0, 0, 0, ..., 0, 0, 1], shape=(1409,)))

In [43]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

KeyError: 'churn'

# EDA

In [17]:
df_full_train = df_full_train.reset_index(drop=True)

In [18]:
df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [19]:
df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [20]:
global_churn_rate = round(df_full_train.churn.mean(), 2)
global_churn_rate

np.float64(0.27)

In [21]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [22]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [23]:
categorical = ['gender', 'seniorcitizen', 'partner',
       'dependents', 'phoneservice', 'multiplelines',
       'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
       'techsupport', 'streamingtv', 'streamingmovies', 'contract',
       'paperlessbilling', 'paymentmethod']

In [24]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [25]:
df_full_train.paymentmethod.value_counts()

paymentmethod
electronic_check             1893
mailed_check                 1305
bank_transfer_(automatic)    1219
credit_card_(automatic)      1217
Name: count, dtype: int64

### Feature importance: Churn rate and risk ratio

In [26]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [27]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female

np.float64(0.27682403433476394)

In [28]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male

np.float64(0.2632135306553911)

In [29]:
global_churn = df_full_train.churn.mean()
global_churn

np.float64(0.26996805111821087)

In [30]:
df_full_train.partner.value_counts()

partner
no     2932
yes    2702
Name: count, dtype: int64

In [31]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_partner

np.float64(0.20503330866025166)

In [32]:
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_no_partner

np.float64(0.3298090040927694)

In [33]:
from IPython.display import display

In [34]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count', 'sum'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)
    print('-------')

gender


Unnamed: 0_level_0,mean,count,sum,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.276824,2796,774,0.006856,1.025396
male,0.263214,2838,747,-0.006755,0.97498


-------
seniorcitizen


Unnamed: 0_level_0,mean,count,sum,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.24227,4722,1144,-0.027698,0.897403
1,0.413377,912,377,0.143409,1.531208


-------
partner


Unnamed: 0_level_0,mean,count,sum,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.329809,2932,967,0.059841,1.221659
yes,0.205033,2702,554,-0.064935,0.759472


-------
dependents


Unnamed: 0_level_0,mean,count,sum,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.31376,3968,1245,0.043792,1.162212
yes,0.165666,1666,276,-0.104302,0.613651


-------
phoneservice


Unnamed: 0_level_0,mean,count,sum,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.241316,547,132,-0.028652,0.89387
yes,0.273049,5087,1389,0.003081,1.011412


-------
multiplelines


Unnamed: 0_level_0,mean,count,sum,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.257407,2700,695,-0.012561,0.953474
no_phone_service,0.241316,547,132,-0.028652,0.89387
yes,0.290742,2387,694,0.020773,1.076948


-------
internetservice


Unnamed: 0_level_0,mean,count,sum,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dsl,0.192347,1934,372,-0.077621,0.712482
fiber_optic,0.425171,2479,1054,0.155203,1.574895
no,0.077805,1221,95,-0.192163,0.288201


-------
onlinesecurity


Unnamed: 0_level_0,mean,count,sum,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.420921,2801,1179,0.150953,1.559152
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.153226,1612,247,-0.116742,0.56757


-------
onlinebackup


Unnamed: 0_level_0,mean,count,sum,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.404323,2498,1010,0.134355,1.497672
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.217232,1915,416,-0.052736,0.80466


-------
deviceprotection


Unnamed: 0_level_0,mean,count,sum,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.395875,2473,979,0.125907,1.466379
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.230412,1940,447,-0.039556,0.85348


-------
techsupport


Unnamed: 0_level_0,mean,count,sum,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.418914,2781,1165,0.148946,1.551717
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.159926,1632,261,-0.110042,0.59239


-------
streamingtv


Unnamed: 0_level_0,mean,count,sum,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.342832,2246,770,0.072864,1.269897
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.302723,2167,656,0.032755,1.121328


-------
streamingmovies


Unnamed: 0_level_0,mean,count,sum,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.338906,2213,750,0.068938,1.255358
no_internet_service,0.077805,1221,95,-0.192163,0.288201
yes,0.307273,2200,676,0.037305,1.138182


-------
contract


Unnamed: 0_level_0,mean,count,sum,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
month-to-month,0.431701,3104,1340,0.161733,1.599082
one_year,0.120573,1186,143,-0.149395,0.446621
two_year,0.028274,1344,38,-0.241694,0.10473


-------
paperlessbilling


Unnamed: 0_level_0,mean,count,sum,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,0.172071,2313,398,-0.097897,0.637375
yes,0.338151,3321,1123,0.068183,1.25256


-------
paymentmethod


Unnamed: 0_level_0,mean,count,sum,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bank_transfer_(automatic),0.168171,1219,205,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,200,-0.10563,0.608733
electronic_check,0.45589,1893,863,0.185922,1.688682
mailed_check,0.19387,1305,253,-0.076098,0.718121


-------


### Feature importance: Mutual information
'Mutual information' concept

In [44]:
from sklearn.metrics import mutual_info_score

In [48]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [49]:
mutual_info_score(df_full_train.gender, df_full_train.churn)

0.0001174846211139946

In [50]:
mutual_info_score(df_full_train.churn, df_full_train.partner)

0.009967689095399745

In [45]:
for c in categorical:
    print(c, ':', mutual_info_score(df_full_train[c], df_full_train.churn))

gender : 0.0001174846211139946
seniorcitizen : 0.009410216144208144
partner : 0.009967689095399745
dependents : 0.012345815445534689
phoneservice : 0.00022871269738296285
multiplelines : 0.0008574478744731856
internetservice : 0.055867945893496467
onlinesecurity : 0.06308524972985574
onlinebackup : 0.0469234640537918
deviceprotection : 0.04345286925268559
techsupport : 0.06103245991777444
streamingtv : 0.03185333110086085
streamingmovies : 0.03158089669519908
contract : 0.0983203874041556
paperlessbilling : 0.01758882715925275
paymentmethod : 0.043210027531582915


In [46]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.churn)

In [47]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

### Feature importance: Correlation
'Correlation Coefficient' concept

In [52]:
df_full_train.tenure.max()

(0       12
 1       42
 2       71
 3       71
 4       30
         ..
 5629     9
 5630    60
 5631    28
 5632     2
 5633    16
 Name: tenure, Length: 5634, dtype: int64,
 np.int64(72))

In [56]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [69]:
df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64

In [70]:
df_full_train[df_full_train.tenure <= 2].churn.mean()

np.float64(0.5953420669577875)

In [71]:
df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure <= 12)].churn.mean()

np.float64(0.3994413407821229)

In [72]:
df_full_train[df_full_train.tenure > 12].churn.mean()

np.float64(0.17634908339788277)

In [73]:
df_full_train[df_full_train.monthlycharges <= 20].churn.mean()

np.float64(0.08795411089866156)

In [74]:
df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50)].churn.mean()

np.float64(0.18340943683409436)

In [75]:
df_full_train[df_full_train.monthlycharges > 50].churn.mean()

np.float64(0.32499341585462205)

### One-hot encoding

In [76]:
from sklearn.feature_extraction import DictVectorizer

In [82]:
df_train[['gender', 'contract']].iloc[:10]

Unnamed: 0,gender,contract
0,female,two_year
1,male,month-to-month
2,female,month-to-month
3,female,month-to-month
4,female,two_year
5,male,month-to-month
6,male,month-to-month
7,female,month-to-month
8,female,two_year
9,female,month-to-month


In [92]:
dicts = df_train[['gender', 'contract']].iloc[:10].to_dict(orient = 'records')
dicts

[{'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'}]

In [93]:
dv = DictVectorizer(sparse=False)

In [94]:
dv.fit(dicts)

0,1,2
,"dtype  dtype: dtype, default=np.float64 The type of feature values. Passed to Numpy array/scipy.sparse matrix constructors as the dtype argument.",<class 'numpy.float64'>
,"separator  separator: str, default=""="" Separator string used when constructing new features for one-hot coding.",'='
,"sparse  sparse: bool, default=True Whether transform should produce scipy.sparse matrices.",False
,"sort  sort: bool, default=True Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting.",True
