In [151]:
import pandas as pd
import numpy as np
import plotly.express as px 
from plotly.offline import iplot

from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.combine import SMOTETomek

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

import warnings


# set Default
pd.set_option("display.max_columns", None)
pd.options.display.float_format = '{:,.2f}'.format
warnings.filterwarnings("ignore")
pd.options.mode.copy_on_write = "warn"

# Used Color
used_color = ["#ED1C24", "#FAF7F1", "#C0C0C0", "#330072", "#F1ECDF"]

In [134]:
def update_layout(
    title_font_size = 28,
    hover_font_size = 16,
    hover_bgcolor = '#111',
    showlegend = False,
    fig = None
):
    """Apply the notebook's shared title and hover-label styling to a figure.

    Parameters
    ----------
    title_font_size : int
        Font size of the figure title.
    hover_font_size : int
        Font size of the hover label text.
    hover_bgcolor : str
        Background colour of the hover label.
    showlegend : bool
        Whether the legend is displayed.
    fig : object exposing ``update_layout(**kwargs)``, optional
        Figure to style. When omitted, the module-level variable ``fig`` is
        used, so existing ``update_layout()`` calls keep working.

    Returns
    -------
    None
        The figure is modified in place.

    Raises
    ------
    NameError
        If ``fig`` is not passed and no module-level ``fig`` exists.
    """
    if fig is None:
        # Backward-compatible fallback: style the figure most recently bound
        # to the global name `fig` in the notebook namespace.
        try:
            fig = globals()['fig']
        except KeyError:
            raise NameError(
                "update_layout() needs a figure: pass fig=... or define a global `fig` first"
            ) from None

    fig.update_layout(
        showlegend = showlegend,
        title = {
            'font' : {
                'size' : title_font_size,
                # Font family must be a plain family name; bold markup such as
                # '<b>' belongs in the title text, not in the family string.
                'family' : 'poppins'
            }
        },

        hoverlabel = {
            'bgcolor' : hover_bgcolor,
            'font_size' : hover_font_size,
            'font_family' : 'poppins'
        }
    )

In [135]:
def count_viz_finc(
    data_frame,
    column_name,
    title = 'Chart Title',
    title_font_size = 30,
    x_label = 'X',
    y_label = 'Y',
    showlegend = False,
    hover_template = 'None'
):
    """Build a bar chart of the percentage share of each value in a column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Column whose value frequencies are plotted.
    title : str
        Chart title.
    title_font_size : int
        Font size of the chart title.
    x_label, y_label : str
        Axis labels.
    showlegend : bool
        Whether the legend is displayed.
    hover_template : str or None
        Plotly hover template applied to the bars. ``None`` or the legacy
        default string ``'None'`` leaves the hover text generated by
        ``px.bar`` untouched.

    Returns
    -------
    The figure object returned by ``px.bar``, with styling applied.
    """
    # Use the frame that was passed in (not a global) so the helper works for any DataFrame.
    value_counts = data_frame[column_name].value_counts(normalize=True) * 100

    fig = px.bar(
        data_frame = value_counts,
        x = value_counts.index,
        y = value_counts,
        color = value_counts.index,
        color_discrete_sequence = used_color,
        title = title,
        labels = {column_name:x_label, 'y': y_label},
        template = 'plotly_dark',
        text = value_counts.apply(lambda x: f'{x:.1f}%')
    )

    fig.update_layout(
        showlegend = showlegend,
        title = {
            'font' : {
                'size' : title_font_size,
                # Plain family name; bold markup belongs in the title text itself.
                'family' : 'poppins'
            }
        },

        hoverlabel = {
            'bgcolor' : '#222',
            'font_size' : 16,
            'font_family' : 'poppins'
        }
    )

    trace_style = {
        'textfont' : {
            'size' : 18,
            'family' : 'consolas',
            'color' : '#222'
        },
        'marker' : dict(line=dict(color = '#222', width=1)),
    }
    # The default is the *string* 'None'; passing it on would make every bar
    # show the literal text "None" on hover, so treat it as "not supplied".
    if hover_template is not None and hover_template != 'None':
        trace_style['hovertemplate'] = hover_template

    fig.update_traces(**trace_style)

    return fig

### __About Dataset__
* `Senior Citizen`: Indicates if the customer is 65 or older: 1 (Yes), 0 (No)
* `Dependents`: Indicates if the customer lives with any dependents: Yes, No. Dependents could be children, parents, grandparents, etc
* `Tenure in Months`: Indicates the total amount of months that the customer has been with the company
* `Internet Service`: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable.
* `Contract`: Indicates the customer’s current contract type: Month-to-Month, One Year, Two Year.
* `Payment Method`: Indicates how the customer pays their bill: Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic).
* `Monthly Charge`: Indicates the customer’s current total monthly charge for all their services from the company.
* `Total Charges`: Indicates the customer’s total charges, calculated to the end of the quarter specified above.
* `Churn`: Indicates whether the customer left the company this quarter: Yes = the customer left the company, No = the customer remained with the company.

In [136]:
# Read the Telco customer-churn CSV from the current working directory
df = pd.read_csv(filepath_or_buffer='./WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [137]:
# Report the size of the loaded frame: rows first, then columns
print(f'Number of Records: {len(df):.0f}')
print(f'Number of Features: {len(df.columns)}')

Number of Records: 7043
Number of Features: 21


In [138]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### __Data Cleaning & Wrangling__

In [139]:
# Count missing values per column
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [140]:
# Number of rows that repeat an earlier row exactly
df.duplicated(keep='first').sum()

0

In [141]:
# Number of rows whose customerID already appeared in an earlier row
df.duplicated(subset=['customerID']).sum()

0

__The data is almost clean. Because `customerID` is unique for each customer, it is only an identifier and tells us nothing about customer behaviour, so we remove the `customerID` column.__

In [142]:
# Drop the identifier column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op rather than a KeyError.
df = df.drop(columns='customerID', errors='ignore')

In [143]:
# Summary statistics, one row per described column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.16,0.37,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.37,24.56,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.76,30.09,18.25,35.5,70.35,89.85,118.75


### __Dig Deeper Into Each Column__

##### __Gender Column__

In [144]:
# Share of each gender value as a percentage of all rows
gender = df["gender"].value_counts(normalize=True).mul(100)
gender

gender
Male     50.48
Female   49.52
Name: proportion, dtype: float64

In [152]:
# Bar chart of the gender split, in percent
fig = count_viz_finc(
    data_frame=df,
    column_name='gender',
    title='<b>Gender Distributions',
    x_label='Gender',
    y_label='Frequency in PCT (%)',
    hover_template='Gender: %{x}<br>Frequency PCT (%): %{y:.0f}',
)

iplot(fig)

In [146]:
# Percentage of rows whose Churn value is 'Yes'
len(df.loc[df['Churn'].eq('Yes')]) / len(df) * 100

26.536987079369588

##### __Senior Citizen Column__

In [147]:
# Boolean mask for month-to-month contracts, then the payment method of those rows
f = df['Contract'].eq('Month-to-month')
df.loc[f, 'PaymentMethod']

0                Electronic check
2                    Mailed check
4                Electronic check
5                Electronic check
6         Credit card (automatic)
                  ...            
7033      Credit card (automatic)
7034      Credit card (automatic)
7035    Bank transfer (automatic)
7040             Electronic check
7041                 Mailed check
Name: PaymentMethod, Length: 3875, dtype: object

In [148]:
# Share of each SeniorCitizen flag in percent; the numeric series is kept for plotting
senior_citizen = df['SeniorCitizen'].value_counts(normalize=True).mul(100)
# Display-only version formatted as percentage strings
senior_citizen.map(lambda pct: f'{pct:.2f}%')

SeniorCitizen
0    83.79%
1    16.21%
Name: proportion, dtype: object

In [169]:
# Age-group label for each SeniorCitizen flag, built once and reused for both the
# x axis and the colour grouping. The wording follows the dataset definition of
# the flag (1 = the customer is 65 or older).
senior_labels = ['65 or Older' if flag == 1 else 'Under 65' for flag in senior_citizen.index]

fig = px.bar(
    data_frame= senior_citizen,
    x= senior_labels,
    y= senior_citizen,
    color= senior_labels,
    color_discrete_sequence=used_color,
    title = '<b>Senior Citizen Distributions',
    labels = {'x': 'Is Senior Citizen', 'y': 'Frequency in PCT (%)'},
    template = 'plotly_dark',
    text = senior_citizen.apply(lambda x: f'{x:.1f}%')
)

# Applies the shared title / hover styling to the global `fig` created just above
update_layout()

fig.update_traces(
    textfont = {
        'size': 18,
        'family' : 'consolas',
        'color' : '#222'
    },

    marker = dict(line = dict(color = '#C0C0C0', width=3))
)
iplot(fig)

##### __Dependents Column__

In [172]:
# Share of each Dependents value, formatted as percentage strings
dependents = (
    df['Dependents']
    .value_counts(normalize=True)
    .mul(100)
    .map(lambda pct: f'{pct:.2f}%')
)
dependents

Dependents
No     70.04%
Yes    29.96%
Name: proportion, dtype: object