In [517]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import pandas_profiling

In [518]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [519]:
# pandas_profiling.ProfileReport(df)

In [520]:
# Count number of rows and columns
df.shape

(7043, 21)

In [521]:
# Check Columns
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [522]:
# Describe Data
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [523]:
# Validating Data Types
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [524]:
df.columns.to_series().groupby(df.dtypes).groups

{int64: ['SeniorCitizen', 'tenure'], float64: ['MonthlyCharges'], object: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']}

In [525]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [526]:
# Convert Total Charges to Numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].astype("float")

In [527]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


TotalCharges has successfully converted to float datatype

In [528]:
# Check for missing values
df.isna().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges         True
Churn               False
dtype: bool

There are missing values in TotalCharges

In [529]:
# Check count of null values
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [530]:
# Calculate average and fill missing values
na_cols = df.isna().any()
na_cols = na_cols[na_cols == True].reset_index()
na_cols = na_cols["index"].tolist()

for col in df.columns[1:]:
    if col in na_cols:
        if df[col].dtype != 'object':
            df[col] = df[col].fillna(df[col].mean()).round(0)

In [531]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [532]:
# Label Encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Label encoding for columns with 2 or less unique

le_count = 0
for col in df.columns[1:]:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <=2:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            le_count +=1
print('{} columns label encoded'.format(le_count))

6 columns label encoded


Data Evaluation

In [468]:
df2 = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges']]
fig = plt.figure(figsize=(15, 10))

for i in range(df2.shape[1]):
    plt.subplot(6, 3, i+1)
    f=plt.gca()
    f.set_title(df2.columns.values[i])

vals = np.size(df2.iloc[:, i].unique())
if vals >= 100:
    vals = 100

plt.hist(df2.iloc[:, i], bins=vals, color = '#f39519')
plt.tight_layout()

In [469]:
df.corr(method='pearson')

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn
gender,1.0,-0.001874,-0.001808,0.010517,0.005106,-0.006488,-0.011754,-0.014569,4.8e-05,-0.008612
SeniorCitizen,-0.001874,1.0,0.016479,-0.211185,0.016567,0.008576,0.15653,0.220173,0.102395,0.150889
Partner,-0.001808,0.016479,1.0,0.452676,0.379697,0.017706,-0.014877,0.096848,0.318811,-0.150448
Dependents,0.010517,-0.211185,0.452676,1.0,0.159712,-0.001762,-0.111377,-0.11389,0.064534,-0.164221
tenure,0.005106,0.016567,0.379697,0.159712,1.0,0.008448,0.006152,0.2479,0.824757,-0.352229
PhoneService,-0.006488,0.008576,0.017706,-0.001762,0.008448,1.0,0.016505,0.247398,0.112854,0.011942
PaperlessBilling,-0.011754,0.15653,-0.014877,-0.111377,0.006152,0.016505,1.0,0.35215,0.157674,0.191825
MonthlyCharges,-0.014569,0.220173,0.096848,-0.11389,0.2479,0.247398,0.35215,1.0,0.650469,0.193356
TotalCharges,4.8e-05,0.102395,0.318811,0.064534,0.824757,0.112854,0.157674,0.650469,1.0,-0.199426
Churn,-0.008612,0.150889,-0.150448,-0.164221,-0.352229,0.011942,0.191825,0.193356,-0.199426,1.0


In [470]:
# Correlation

plt.figure(figsize=(15,8))
df.corr()['Churn'].sort_values(ascending = False).plot(kind='bar')

<AxesSubplot:>

In [471]:
# Pie Chart
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objects as go
import cufflinks as cf
init_notebook_mode(connected=True)

In [472]:
# Churn Rate by Gender
plot_by_gender = df.groupby('gender').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_gender['gender'],
        y=plot_by_gender['Churn'],
       width = [0.8],
        marker = dict(
            color=['#ed7071', '#ffa14a']
        )
    )
]

layout=go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title="Churn Rate by Gender",
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig)

In [533]:
# Churn Rate by Internet Service
plot_by_payment = df.groupby('InternetService').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_payment['InternetService'],
        y=plot_by_payment['Churn'],
       width = [0.8],
        marker = dict(
            color=['#ed7071','#ffa14a', '#9da4d8', '#21ced2']
        )
    )
]

layout=go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title="Churn Rate by Internet Service",
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig)

In [473]:
# Churn Rate by Contract Duration
plot_by_contract = df.groupby('Contract').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_contract['Contract'],
        y=plot_by_contract['Churn'],
        width = [0.8],
        marker = dict(
            color=['#ffa14a', '#9da4d8', '#21ced2']
        )
    )
]

layout=go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title="Churn Rate by Contract Duration",
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig)

In [552]:
# Churn Rate by Payment Method
plot_by_payment = df.groupby('PaymentMethod').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_payment['PaymentMethod'],
        y=plot_by_payment['Churn'],
        width = [0.8],
        marker = dict(
            color=['#ed7071','#ffa14a', '#9da4d8', '#21ced2']
        )
    )
]

layout=go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title="Churn Rate by Payment Method",
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig)

In [475]:
# Relationship of Tenure and Churn Rate
plot_by_tenure = df.groupby('tenure').Churn.mean().reset_index()
plot_data = [
    go.Scatter(
        x=plot_by_tenure['tenure'],
        y=plot_by_tenure['Churn'],
        mode = "markers",
        name = "Low",
        marker = dict(
            size = 5,
            line = dict(width=0.8),
            color='green'
        ),
    )
]

layout=go.Layout(
    yaxis={"title": "Churn Rate"},
    xaxis={"title": "Tenure"},
    title="Churn Rate and Tenure Relationship",
    plot_bgcolor = 'white',
    paper_bgcolor = 'white',
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig)

In [476]:
#labels
lab = df["SeniorCitizen"].value_counts().keys().tolist()
#values
val = df["SeniorCitizen"].value_counts().values.tolist()
trace = go.Pie(labels=lab, 
                values=val, 
                marker=dict(colors=['#9da4d8', '#21ced2']),
                hole = 0.4,
                # Seting values to 
                hoverinfo="value")
data = [trace]

layout = go.Layout(dict(title="% of Senior Citizens",
              plot_bgcolor = "white",
              paper_bgcolor = "white",))
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [477]:
# Histogram - Dependents
# defining data
trace = go.Histogram(x=df['Dependents'],nbinsx=3,
                marker = dict(color = '#ed7071'))
data = [trace]
# defining layout
layout = go.Layout(title="Dependents Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [478]:
# Histogram - Partner 
# defining data
trace = go.Histogram(
    x=df['Partner'],
    nbinsx=3,
    marker = dict(color = '#9da4d8')
    )
data = [trace]
# defining layout
layout = go.Layout(title="Partner Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [479]:
# Histogram - Churn
# defining data
trace = go.Histogram(x=df['Churn'],nbinsx=3,marker = dict(color = '#ffa14a'))
data = [trace]
# defining layout
layout = go.Layout(title="Churn Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [536]:
# Histogram - Device Protection
# defining data
trace = go.Histogram(x=df['DeviceProtection'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Device Protection Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [539]:
# Histogram - Multiple Lines
# defining data
trace = go.Histogram(x=df['MultipleLines'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Multiple Lines Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [540]:
# Histogram - Online Backup
# defining data
trace = go.Histogram(x=df['OnlineBackup'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Online Backup Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [541]:
# Histogram - Online Security
# defining data
trace = go.Histogram(x=df['OnlineSecurity'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Online Security Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [542]:
# Histogram - Paperless Billing
# defining data
trace = go.Histogram(x=df['PaperlessBilling'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Paperless Billing Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [543]:
# Histogram - Phone Service
# defining data
trace = go.Histogram(x=df['PhoneService'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Phone Service Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [544]:
# Histogram - Streaming Movies
# defining data
trace = go.Histogram(x=df['StreamingMovies'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Streaming Movies Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [545]:
# Histogram - Streaming TV
# defining data
trace = go.Histogram(x=df['StreamingTV'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Streaming TV Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [546]:
# Histogram - Tech Support
# defining data
trace = go.Histogram(x=df['TechSupport'],nbinsx=3,marker = dict(color = '#6D9886'))
data = [trace]
# defining layout
layout = go.Layout(title="Technical Support Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [481]:
# Correlation
plt.figure(figsize=(10,5))
df.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')

<AxesSubplot:>

In [482]:
import seaborn as sns # For creating plots
ax = sns.distplot(df['tenure'], hist=True, kde=False, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('# of Customers')
ax.set_xlabel('Tenure (months)')
ax.set_title('# of Customers by their tenure')


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).



Text(0.5, 1.0, '# of Customers by their tenure')

In [483]:
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
           'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

fig, axes = plt.subplots(nrows = 3,ncols = 3,figsize = (15,12))
for i, item in enumerate(services):
    if i < 3:
        ax = df[item].value_counts().plot(kind = 'bar',ax=axes[i,0],rot = 0)
        
    elif i >=3 and i < 6:
        ax = df[item].value_counts().plot(kind = 'bar',ax=axes[i-3,1],rot = 0)
        
    elif i < 9:
        ax = df[item].value_counts().plot(kind = 'bar',ax=axes[i-6,2],rot = 0)
    ax.set_title(item)

ALGORITHMS

In [484]:
# independent variable - all columns aside from 'Churn'
X = df.iloc[:,:-1].values
X

array([['7590-VHVEG', 0, 0, ..., 'Electronic check', 29.85, 30.0],
       ['5575-GNVDE', 1, 0, ..., 'Mailed check', 56.95, 1890.0],
       ['3668-QPYBK', 1, 0, ..., 'Mailed check', 53.85, 108.0],
       ...,
       ['4801-JZAZL', 0, 0, ..., 'Electronic check', 29.6, 346.0],
       ['8361-LTMKD', 1, 1, ..., 'Mailed check', 74.4, 307.0],
       ['3186-AJIEK', 1, 0, ..., 'Bank transfer (automatic)', 105.65,
        6844.0]], dtype=object)

In [485]:
# dependent variable - Churn
y = df.iloc[:,20]
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int32

In [486]:
# Remove customer ID
df2 = df.iloc[:,1:]

In [487]:
# Convert predictor variables in a binary numeric variable
df2['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df2['Churn'].replace(to_replace='No', value=0, inplace=True)

In [488]:
# Converting categorical variables into dummy variables
df_dummies = pd.get_dummies(df2)
df_dummies.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,30.0,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34,1,0,56.95,1890.0,0,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,2,1,1,53.85,108.0,1,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,45,0,0,42.3,1841.0,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,2,1,1,70.7,152.0,1,...,1,0,0,1,0,0,0,0,1,0


In [489]:
#Perform One Hot Encoding using get_dummies method
df= pd.get_dummies(df, columns = ['Contract','Dependents','DeviceProtection','gender',
                                                        'InternetService','MultipleLines','OnlineBackup',
                                                        'OnlineSecurity','PaperlessBilling','Partner',
                                                        'PaymentMethod','PhoneService','SeniorCitizen',
                                                        'StreamingMovies','StreamingTV','TechSupport'],
                              drop_first=True)

In [490]:
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,Dependents_1,DeviceProtection_No internet service,DeviceProtection_Yes,...,PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneService_1,SeniorCitizen_1,StreamingMovies_No internet service,StreamingMovies_Yes,StreamingTV_No internet service,StreamingTV_Yes,TechSupport_No internet service,TechSupport_Yes
0,7590-VHVEG,1,29.85,30.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,5575-GNVDE,34,56.95,1890.0,0,1,0,0,0,1,...,0,1,1,0,0,0,0,0,0,0
2,3668-QPYBK,2,53.85,108.0,1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,7795-CFOCW,45,42.3,1841.0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,9237-HQITU,2,70.7,152.0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [491]:
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
columns_for_fit_scaling = ['tenure', 'MonthlyCharges', 'TotalCharges']
df_dummies[columns_for_fit_scaling] = standardscaler.fit_transform(df_dummies[columns_for_fit_scaling])

In [492]:
df_dummies.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,-1.277445,0,1,-1.160323,-0.994903,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0.066327,1,0,-0.259629,-0.173652,0,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,-1.236724,1,1,-0.36266,-0.960464,1,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0.514251,0,0,-0.746535,-0.195287,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,-1.236724,1,1,0.197365,-0.941036,1,...,1,0,0,1,0,0,0,0,1,0


In [493]:
churners_number = len(df[df['Churn'] == 1])
print("Number of churners", churners_number)

churners = (df[df['Churn'] == 1])

non_churners = df[df['Churn'] == 0].sample(n=churners_number)
print("Number of non-churners", len(non_churners))
df2 = churners.append(non_churners)

Number of churners 1869
Number of non-churners 1869


In [494]:
try:
    customer_id = df2['customerID'] # Store this as customer_id variable
    del df2['customerID'] # Don't need in ML DF
except:
    print("already removed customerID")

In [495]:
# Correlation
plt.figure(figsize=(15,8))
df_dummies.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')

<AxesSubplot:>

In [496]:
# Splitting Data into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions X_train dataset: ", y_train.shape)
print("Number transactions X_train dataset: ", X_test.shape)
print("Number transactions X_train dataset: ", y_test.shape)

Number transactions X_train dataset:  (5634, 20)
Number transactions X_train dataset:  (5634,)
Number transactions X_train dataset:  (1409, 20)
Number transactions X_train dataset:  (1409,)


In [497]:
y = df_dummies['Churn'].values
X = df_dummies.drop(columns = ['Churn'])

# Scaling all the variables to a range of 0 to 1
from sklearn.preprocessing import MinMaxScaler
features = X.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features

In [498]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [499]:
# Running logistic regression model
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
logmodel = LogisticRegression(random_state=50)
result = logmodel.fit(X_train, y_train)

Xnew = X_test.values
pred = logmodel.predict(X_test)

logmodel_accuracy = round(metrics.accuracy_score(y_test, pred)*100, 2)
print (logmodel_accuracy)

80.13



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [500]:
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,Dependents_1,DeviceProtection_No internet service,DeviceProtection_Yes,...,PaymentMethod_Electronic check,PaymentMethod_Mailed check,PhoneService_1,SeniorCitizen_1,StreamingMovies_No internet service,StreamingMovies_Yes,StreamingTV_No internet service,StreamingTV_Yes,TechSupport_No internet service,TechSupport_Yes
0,7590-VHVEG,1,29.85,30.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,5575-GNVDE,34,56.95,1890.0,0,1,0,0,0,1,...,0,1,1,0,0,0,0,0,0,0
2,3668-QPYBK,2,53.85,108.0,1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,7795-CFOCW,45,42.3,1841.0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,9237-HQITU,2,70.7,152.0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [501]:
proba = logmodel.predict_proba(Xnew)[:,1]


X does not have valid feature names, but LogisticRegression was fitted with feature names



In [502]:
for i in range(len(Xnew)):
	df['Churn Probability'] = print(proba[i])

0.028302793102380985
0.4294604926403142
0.1862893151740407
0.2889838492175024
0.7886084157595115
0.05019091330943099
0.616361406383436
0.10401957683082166
0.04615401840183331
0.3561820723764173
0.013893332566361964
0.4054656476589537
0.10581134485999143
0.14088238646594173
0.04486403856502725
0.48858615735348626
0.3271692019158535
0.009241539499774902
0.06473887123273844
0.24163658112216638
0.7006857456146837
0.0736199336098462
0.7783648342291286
0.18366285719436465
0.05311263185097446
0.7763547932751473
0.21892662360266948
0.16437784974011688
0.0816694363940566
0.06211140778969029
0.011091795192053451
0.7306797541000322
0.225687528107335
0.7502476791943271
0.004391961340197886
0.03022990456324537
0.2420462827068541
0.003770308174011013
0.2589668991765835
0.21005664401287583
0.03414808491556128
0.27872766414422256
0.004337767708739424
0.37253773495091996
0.012308343443194838
0.3316597952294555
0.0019728942581323395
0.16762161088333452
0.657684297196039
0.5725058870078876
0.169136186243

In [503]:
for i in range(len(Xnew)):
	df['Churn Probability'][i] = proba[i]

In [504]:
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Contract_One year,Contract_Two year,Dependents_1,DeviceProtection_No internet service,DeviceProtection_Yes,...,PaymentMethod_Mailed check,PhoneService_1,SeniorCitizen_1,StreamingMovies_No internet service,StreamingMovies_Yes,StreamingTV_No internet service,StreamingTV_Yes,TechSupport_No internet service,TechSupport_Yes,Churn Probability
0,7590-VHVEG,1,29.85,30.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.028303
1,5575-GNVDE,34,56.95,1890.0,0,1,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0.42946
2,3668-QPYBK,2,53.85,108.0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0.186289
3,7795-CFOCW,45,42.3,1841.0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0.288984
4,9237-HQITU,2,70.7,152.0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.788608


In [505]:
# Create a Dataframe showcasing probability of Churn of each customer
df[['customerID','Churn Probability']]

Unnamed: 0,customerID,Churn Probability
0,7590-VHVEG,0.028303
1,5575-GNVDE,0.42946
2,3668-QPYBK,0.186289
3,7795-CFOCW,0.288984
4,9237-HQITU,0.788608
...,...,...
7038,6840-RESVB,
7039,2234-XADUH,
7040,4801-JZAZL,
7041,8361-LTMKD,


In [506]:
predicted = df[['customerID','Churn Probability']]

In [507]:
weights = pd.Series(logmodel.coef_[0],
                 index=X.columns.values)
print (weights.sort_values(ascending = False)[:10].plot(kind='bar'))

AxesSubplot(0.125,0.125;0.775x0.755)


In [508]:
from sklearn.tree import DecisionTreeClassifier
dtmodel = DecisionTreeClassifier(criterion = 'gini', random_state=50)
dtmodel.fit(X_train, y_train)

dt_pred = dtmodel.predict(X_test)

dt_accuracy = round(metrics.accuracy_score(y_test, dt_pred)*100,2)

In [509]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
rfmodel = RandomForestClassifier(n_estimators=1000 , oob_score = True, n_jobs = -1,
                                  random_state =50, max_features = "auto",
                                  max_leaf_nodes = 30)
rfmodel.fit(X_train, y_train)

# Make predictions
rf_pred = rfmodel.predict(X_test)
print (metrics.accuracy_score(y_test, rf_pred))

rf_accuracy = round(metrics.accuracy_score(y_test, rf_pred)*100,2)


X does not have valid feature names, but RandomForestClassifier was fitted with feature names



0.7998580553584103


In [510]:
importances = rfmodel.feature_importances_
weights = pd.Series(importances,
                 index=X.columns.values)
weights.sort_values()[-10:].plot(kind = 'barh')

<AxesSubplot:>

In [511]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)
xgb_pred = model.predict(X_test)
metrics.accuracy_score(y_test, xgb_pred)

xgb_accuracy = round(metrics.accuracy_score(y_test, xgb_pred)*100,2)







In [512]:
Model_Comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost'],
    'Score': [logmodel_accuracy, dt_accuracy, rf_accuracy, xgb_accuracy]})
Model_Comparison_df = Model_Comparison.sort_values(by='Score', ascending=False)
Model_Comparison_df - Model_Comparison_df.set_index('Score')
Model_Comparison_df.reset_index()

Unnamed: 0,index,Model,Score
0,0,Logistic Regression,80.13
1,2,Random Forest,79.99
2,3,XGBoost,77.79
3,1,Decision Tree,71.89


In [513]:
y_hat_train = logmodel.predict(X_train)
y_hat_test = logmodel.predict(X_test)

svc_accuracy = round(metrics.accuracy_score(y_test, y_hat_test)* 100, 2)

In [514]:
from sklearn.metrics import confusion_matrix
conf_mat_logmodel = confusion_matrix(y_test,y_hat_test)
conf_mat_logmodel

array([[934,  92],
       [188, 195]], dtype=int64)

In [515]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test,y_hat_test )) 
print(accuracy_score(y_test, y_hat_test ))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1026
           1       0.68      0.51      0.58       383

    accuracy                           0.80      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409

0.801277501774308


In [516]:
predicted.to_csv("predicted.csv", index="true")