In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# URL of the raw Telco customer-churn CSV; split across two literals only to
# keep the line short (adjacent string literals are joined into one string).
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/'
    'chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

In [3]:
!wget $data

--2023-09-11 03:57:53--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: 'WA_Fn-UseC_-Telco-Customer-Churn.csv.1'

     0K .......... .......... .......... .......... ..........  5%  391K 2s
    50K .......... .......... .......... .......... .......... 10% 1.28M 1s
   100K .......... .......... .......... .......... .......... 15%  756K 1s
   150K .......... .......... .......... .......... .......... 20% 2.21M 1s
   200K .......... .......... .......... .......... .......... 26%  748K 1s
   250K .......... .......... .......... .......... .......... 31% 1.79M 1s
   300K .......... ...

In [4]:
# Load the CSV downloaded in the previous cell.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Normalise column names: lowercase, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Encode the target as an integer: 1 where churn is 'yes', 0 otherwise.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
df_train_full, df_test = train_test_split(
    df,
    random_state=1,
    test_size=0.2,
)

In [7]:
# Split the remaining rows again: 33% of df_train_full becomes the validation
# set, the rest is used for training.
df_train, df_val = train_test_split(
    df_train_full,
    random_state=11,
    test_size=0.33,
)

In [8]:
# Pull the target column out of each split as a plain array.
y_train = df_train['churn'].values
y_val = df_val['churn'].values

In [9]:
# Remove the target from the feature frames so it cannot be used as an input.
# drop(..., errors='ignore') returns a new frame and is a no-op when the column
# is already gone, so re-running this cell no longer raises KeyError the way
# `del df_train['churn']` did.
df_train = df_train.drop(columns=['churn'], errors='ignore')
df_val = df_val.drop(columns=['churn'], errors='ignore')

In [10]:
# Count missing values per column (isna is the same check as isnull).
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [11]:
# How many rows fall into each churn class (0 / 1).
df_train_full['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [12]:
# churn is a 0/1 column, so its mean is the share of rows with churn == 1.
global_mean = df_train_full['churn'].mean()
round(global_mean, 3)

0.27

In [13]:
# Names of the columns treated as categorical features.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Names of the columns treated as numerical features.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [14]:
# Number of distinct values in each categorical column.
df_train_full.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature importance

In [15]:
# Mean of the churn column restricted to rows where gender is 'female'.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
print('gender ==female:', round(female_mean, 3))

# Same computation for rows where gender is 'male'.
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
print('gender ==male:', round(male_mean, 3))

gender ==female: 0.277
gender ==male: 0.263
