# Churn Prediction

# Loading & Preparation

In [1]:
import os

import numpy as np
import pandas as pd
import wget

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
# Source CSV: Telco customer churn dataset hosted in the mlbookcamp-code repository.
link = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Download only when the file is not already on disk. An unguarded re-run of this
# cell fetches the file again and (wget does not overwrite an existing file) stores
# it under a new, renamed copy, while pd.read_csv below keeps reading the first one.
filename = os.path.basename(link)
if not os.path.exists(filename):
    wget.download(link)

# echo the local file name as the cell output
filename

  0% [                                                                            ]      0 / 977501  0% [                                                                            ]   8192 / 977501  1% [.                                                                           ]  16384 / 977501  2% [.                                                                           ]  24576 / 977501  3% [..                                                                          ]  32768 / 977501  4% [...                                                                         ]  40960 / 977501  5% [...                                                                         ]  49152 / 977501  5% [....                                                                        ]  57344 / 977501  6% [.....                                                                       ]  65536 / 977501  7% [.....                                                                       ]  73728 / 977501

'WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [2]:
# load the downloaded CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

# preview the first five rows
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# show every column of a wide frame instead of eliding the middle ones with '...'
# (the registered option key is 'display.max_columns'; the dotted spelling
# 'display.max.columns' only resolved through pandas' regex lookup of option names)
pd.set_option("display.max_columns", None)

# another way to see every column: transpose the preview so columns become rows
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [4]:
# inspect the dtype pandas inferred for every column
# (object usually means the column was read as strings)

df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# normalise the headers: lower-case them and turn spaces ' ' into underscores '_'

df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# collect the names of the object-dtype columns: string methods can only be
# applied to string values, so the numeric columns have to be left out

string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
string_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

In [7]:
# normalise the values of every string column the same way as the headers:
# lower-case them and turn spaces ' ' into underscores '_'

df[string_columns] = df[string_columns].apply(
    lambda column: column.str.lower().str.replace(' ', '_')
)

df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,yes,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,no,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,yes,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,no,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [8]:
# encode the target: churn 'yes' -> 1, anything else -> 0
# The dtype guard keeps the cell idempotent: without it a second run would compare
# the already-encoded integers with 'yes' and silently turn every label into 0.

if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [9]:
# checking the totalcharges field, which is object, but apparently should be a number
# this happens when some values are not numeric, e.g. spaces (' '), underscores ('_') and so forth

df.totalcharges.head()

0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: object

In [10]:
# errors='coerce' turns every value that cannot be parsed as a number into NaN,
# so counting the NaN's afterwards tells us how many entries are not numeric

pd.to_numeric(df['totalcharges'], errors='coerce').isna().sum()

11

In [11]:
# convert to numbers and replace the resulting NaN's with zeros
# (alternatively we could fill them with the mean value)

df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# sanity check: no NaN's should be left
df['totalcharges'].isna().sum()

0

# Setting up the validation framework

In [12]:
# import the library

from sklearn.model_selection import train_test_split

In [15]:
# train_test_split only produces two parts per call, so it is applied twice
# to obtain train / validation / test sets

# first split: hold out 20% of the rows as the test set
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# second split: 25% of the remaining 80% is 20% of the original data,
# which gives a 60% / 20% / 20% train / validation / test partition
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
# checking the length of each dataset (train, validation, test)

tuple(len(part) for part in (df_train, df_val, df_test))

(4225, 1409, 1409)

In [17]:
# give every split a fresh 0..n-1 index, discarding the shuffled original one

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [18]:
# pop() removes the 'churn' column from each feature frame ('X') and returns it;
# .values turns the returned Series into a NumPy array, which becomes our 'y'

y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

In [21]:
# checking the training features: the 'churn' column should no longer be present

df_train

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,6776-tlwoi,male,0,no,no,3,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,month-to-month,no,mailed_check,19.85,64.55
1,0036-ihmot,female,0,yes,yes,55,yes,no,fiber_optic,no,yes,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),103.70,5656.75
2,2843-cqmeg,male,0,no,no,24,yes,no,dsl,no,yes,no,no,no,no,month-to-month,yes,mailed_check,49.70,1218.25
3,3247-mhjkm,male,0,no,no,1,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,month-to-month,no,mailed_check,20.20,20.20
4,1194-bhjyc,male,0,yes,no,61,no,no_phone_service,dsl,yes,no,yes,yes,yes,yes,two_year,yes,mailed_check,62.15,3778.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4220,1271-sjbgz,male,1,no,no,12,no,no_phone_service,dsl,no,no,yes,yes,no,yes,month-to-month,yes,electronic_check,43.65,526.95
4221,5977-ckhon,female,0,yes,yes,43,yes,yes,fiber_optic,no,no,yes,no,no,yes,month-to-month,yes,bank_transfer_(automatic),92.55,4039.00
4222,3635-jbpsg,female,0,no,no,15,no,no_phone_service,dsl,no,yes,no,no,no,yes,two_year,yes,mailed_check,38.80,603.00
4223,4475-nvtlu,male,0,yes,yes,45,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,electronic_check,19.20,903.70


# Exploratory data analysis

In [25]:
# EDA is done on the combined train + validation data (df_full_train)

# start by replacing its shuffled index with a fresh 0..n-1 one

df_full_train = df_full_train.reset_index(drop=True)
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,4223-bkeor,female,0,no,yes,21,yes,no,dsl,yes,no,yes,no,no,yes,one_year,no,mailed_check,64.85,1336.8,0
1,6035-riiom,female,0,no,no,54,yes,yes,fiber_optic,no,yes,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),97.2,5129.45,0
2,3797-vtidr,male,0,yes,no,1,no,no_phone_service,dsl,no,no,no,no,no,no,month-to-month,yes,electronic_check,23.45,23.45,1
3,2568-brgyx,male,0,no,no,4,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.2,237.95,1
4,2775-sefee,male,0,no,yes,0,yes,yes,dsl,yes,yes,no,yes,no,no,two_year,yes,bank_transfer_(automatic),61.9,0.0,0


In [27]:
# number of missing values per column

df_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [29]:
# how many customers fall into each class of the target variable

df_full_train['churn'].value_counts()

0    4138
1    1496
Name: churn, dtype: int64

In [31]:
# normalize=True reports the share of each class instead of the raw count
# the share of class 1 is the global churn rate: 26.55%

df_full_train['churn'].value_counts(normalize=True)

0    0.734469
1    0.265531
Name: churn, dtype: float64

In [34]:
# the mean of a 0/1 column is the fraction of ones, i.e. the global churn rate
df_full_train['churn'].mean().round(2)

0.27

In [35]:
# names of the numerical feature columns

numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [41]:
# names of the categorical feature columns, grouped by theme

categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and add-on services
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [42]:
# how many distinct values each categorical variable takes

df_full_train.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64