In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [33]:
df = pd.read_csv('./data/Customer-Churn.csv')

In [34]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [35]:
df.shape

(7043, 21)

In [36]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [78]:
def data_prep(df):
    
    df1 = df.copy()
    
    df1.columns = df1.columns.str.lower().str.replace(' ', '_')

    categorical_columns = list(df1.dtypes[df1.dtypes == 'object'].index)

    for col in categorical_columns:
        df1[col] = df1[col].str.lower().str.replace(' ', '_')
        
    y = np.where(df1.churn == 'yes', 1, 0)
    
    df1['totalcharges'] = pd.to_numeric(df1.totalcharges, errors='coerce')
    df1['totalcharges'] = df1['totalcharges'].fillna(0)
    df1['gender'] = np.where(df1['gender'] == 'female', 1, 0)
    df1['partner'] = np.where(df1['partner'] == 'yes', 1, 0)
    df1['dependents'] = np.where(df1['dependents'] == 'yes', 1, 0)
    df1['phoneservice'] = np.where(df1['phoneservice'] == 'yes', 1, 0)
    df1['multiplelines'] = np.where(df1['multiplelines'] == 'yes', 1, 0)
    df1['internetservicefiber'] = np.where(df1['internetservice'] == 'fiber_optic', 1, 0)
    df1['internetservicedsl'] = np.where(df1['internetservice'] == 'dsl', 1, 0)
    df1['onlinesecurity'] = np.where(df1['onlinesecurity'] == 'yes', 1, 0)
    df1['onlinebackup'] = np.where(df1['onlinebackup'] == 'yes', 1, 0)
    df1['deviceprotection'] = np.where(df1['deviceprotection'] == 'yes', 1, 0)
    df1['techsupport'] = np.where(df1['techsupport'] == 'yes', 1, 0)
    df1['streamingtv'] = np.where(df1['streamingtv'] == 'yes', 1, 0)
    df1['streamingmovies'] = np.where(df1['streamingmovies'] == 'yes', 1, 0)
    df1['paperlessbilling'] = np.where(df1['paperlessbilling'] == 'yes', 1, 0)
    
    
    columns_to_drop = ['churn', 'customerid', 'internetservice']
    df1.drop(columns=columns_to_drop, inplace=True)
    
    return df1, y

In [79]:
df1, y = data_prep(df)

In [80]:
df1.dtypes

gender                    int64
seniorcitizen             int64
partner                   int64
dependents                int64
tenure                    int64
phoneservice              int64
multiplelines             int64
onlinesecurity            int64
onlinebackup              int64
deviceprotection          int64
techsupport               int64
streamingtv               int64
streamingmovies           int64
contract                 object
paperlessbilling          int64
paymentmethod            object
monthlycharges          float64
totalcharges            float64
internetservicefiber      int64
internetservicedsl        int64
dtype: object

In [77]:
df1.dependents.value_counts()

dependents
no     4933
yes    2110
Name: count, dtype: int64

In [91]:
np.sum(y) / y.shape

array([0.26536987])

In [88]:
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [92]:
X_train 

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,internetservicefiber,internetservicedsl
6157,0,0,0,0,3,1,0,0,0,0,0,0,0,month-to-month,0,mailed_check,19.85,64.55,0,0
817,1,0,1,1,55,1,0,0,1,1,1,1,1,one_year,1,bank_transfer_(automatic),103.70,5656.75,1,0
6977,0,0,0,0,24,1,0,0,1,0,0,0,0,month-to-month,1,mailed_check,49.70,1218.25,0,1
1932,0,0,0,0,1,1,0,0,0,0,0,0,0,month-to-month,0,mailed_check,20.20,20.20,0,0
4168,0,0,1,0,61,0,0,1,0,1,1,1,1,two_year,1,mailed_check,62.15,3778.85,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,0,1,0,0,12,0,0,0,0,1,1,0,1,month-to-month,1,electronic_check,43.65,526.95,0,1
1409,1,0,1,1,43,1,1,0,0,1,0,0,1,month-to-month,1,bank_transfer_(automatic),92.55,4039.00,1,0
6834,1,0,0,0,15,0,0,0,1,0,0,0,1,two_year,1,mailed_check,38.80,603.00,0,1
5685,0,0,1,1,45,1,0,0,0,0,0,0,0,two_year,0,electronic_check,19.20,903.70,0,0
