In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

!wget $data -O data-week-3.csv


In [None]:

df = pd.read_csv('data-week-3.csv')

# transpose the data frame
df.head().T

<H2> DATA PREPARATION</H2>

In [None]:
# Normalise the column headers: lower-case, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(" ","_")

# Names of every column stored with the generic `object` dtype
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same lower-case / underscore normalisation to the values of those columns
for column_name in categorical_columns:
    lowered = df[column_name].str.lower()
    df[column_name] = lowered.str.replace(" ","_")

In [None]:
df.head().T

In [None]:
# errors='coerce': values that cannot be parsed as numbers become NaN instead of raising.
# NOTE: the converted values are only held in `tc`; this line does not modify `df.totalcharges`.
tc= pd.to_numeric(df.totalcharges,errors = 'coerce')

In [None]:
# Convert `totalcharges` to numbers and write the result back to the frame.
# errors='coerce' turns unparseable entries into NaN, which are then replaced by 0.
# Without writing the conversion back, `fillna` would run on the unconverted column.
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)

In [None]:
df.churn # the target column before encoding

# Encode the target as integers: 'yes' -> 1, every other value -> 0.
# NOTE: not idempotent - re-running this cell once the column is already 0/1 yields all zeros.
df.churn = (df.churn == 'yes').astype(int)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_full_train , df_test =train_test_split(df,test_size=0.2,random_state=1) # Using 20% test Size


In [None]:
len(df_full_train),len(df_test)

In [None]:
# Carve the validation set out of the full-train set.
# The validation set should be 20% of the whole data set; full train is 80% of it,
# so the fraction to take is 20/80 = 1/4 = 25% of the full-train set.

df_train , df_val = train_test_split(df_full_train,test_size=0.25,random_state=1) # test_size=0.25 of full train


In [None]:
len(df_train) , len(df_val) , len (df_test) # refer to img 1

In [None]:
# Renumber the rows of each split from 0 (cosmetic only: not necessary for models to work)
df_train, df_val, df_test = [
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
]

In [None]:
# Pull the churn target out of each split as a plain array of values
y_train, y_val, y_test = [
    split.churn.values for split in (df_train, df_val, df_test)
]

In [None]:
# Remove the target column from the three feature frames
del df_train['churn']
del df_val['churn']
del df_test['churn']

# NOTE: `churn` is only deleted from df_train, df_val and df_test above; nothing here
# removes it from df_full_train, which is why that frame still has the column.

<h1> 3.4 EDA
</h1>

<li>Check missing values
 </li>
<li>Look at the target variable (churn)
 </li>
<li>Look at numerical and categorical variables
 </li>

In [None]:
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
df_full_train.isnull().sum()

In [None]:
df_full_train.churn.value_counts(normalize=True) # normalize=True returns the share of each value instead of the raw count
# Churn rate (share of churn == 1) is about 0.27, i.e. roughly 27%

In [None]:
df_full_train.churn.mean()

In [None]:
df_full_train.churn.value_counts()

In [None]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate,2)

In [None]:
df_full_train.dtypes

In [None]:
# The three numerical features: tenure, monthly charges and total charges.
# The names must match the lower-cased column headers exactly, otherwise
# selecting df[numerical] raises a KeyError.

numerical = ['tenure','monthlycharges','totalcharges']
df_full_train.columns

In [None]:
# Names of the categorical feature columns, grouped by theme.
# The order is kept as-is because it determines the order of per-column reports.
categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone and internet subscription
    'phoneservice', 'multiplelines', 'internetservice',
    # internet add-ons
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]


In [None]:
# Number of distinct values in each categorical column
df_full_train[categorical].nunique() # most of them are binary

In [None]:
df_full_train.head()


In [None]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female

In [None]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male

In [None]:
global_churn = df_full_train.churn.mean()
global_churn

In [None]:
df_full_train.partner.value_counts()

In [None]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_partner # 5% less than the global rate

In [None]:
churn_partner_no = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_partner_no # 10% more than the global rate

In [None]:
global_churn - churn_partner

In [None]:
global_churn - churn_partner_no

In [None]:
# Gender barely changes the churn rate, but whether the customer has a partner does.

In [None]:
churn_partner_no / global_churn # Number that's higher than 1, people without a partner are more likely to churn



In [None]:
churn_partner/ global_churn

$$ RISK = \frac{GROUP}{GLOBAL} $$
 <li> &gt; 1: more likely to churn </li>
 <li> &lt; 1: less likely to churn </li>

SQL code to translate to python:
~~~~sql
SELECT
    gender,
    AVG(churn),
    AVG(churn) - global_churn AS diff,
    AVG(churn) / global_churn AS risk
FROM 
    data
GROUP by
    gender;
~~~~



In [None]:
# Churn rate and group size per gender, plus two comparisons against the global rate:
#   diff = group rate - global rate
#   risk = group rate / global rate
df_group = (
    df_full_train
    .groupby('gender')
    .churn
    .agg(['mean','count'])
    .assign(
        diff=lambda stats: stats['mean'] - global_churn,
        risk=lambda stats: stats['mean'] / global_churn,
    )
)
df_group

In [None]:
# Build the same churn-rate / diff / risk table for every categorical column
for c in categorical:
    print(c)
    df_group = (
        df_full_train
        .groupby(c)
        .churn
        .agg(['mean','count'])
        .assign(
            diff=lambda stats: stats['mean'] - global_churn,  # group rate minus global rate
            risk=lambda stats: stats['mean'] / global_churn,  # group rate relative to global rate
        )
    )
    display(df_group)
    



<h1> 3.6 Feature Importance - Mutual information</h1>

Mutual information is a concept from information theory: it tells us how much we can learn about one variable if we know the value of another.



In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
mutual_info_score(df_full_train.churn,df_full_train.contract) # mutual information score

In [None]:
mutual_info_score(df_full_train.totalcharges,df_full_train.churn) # mutual information score

In [None]:
def mutual_info_churn_score(series):
    """Return the mutual information score between `series` and the churn column.

    The churn values are read from the notebook-level `df_full_train`, so the
    function takes a single argument and can be passed to `DataFrame.apply`
    to score one column at a time.
    """
    churn = df_full_train.churn
    return mutual_info_score(series, churn)

In [None]:
df_full_train[categorical].apply(mutual_info_churn_score)

In [90]:
# Mutual information of every categorical column with churn, sorted in descending order (highest score first)

mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64