# Logistic Regression

Logistic Regression on a Telecom Data Set for Customer Churn Prediction

will the customer churn. Yes or NO. 
Will the customer leave the Telecom Provider

In [161]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from pylab import rcParams
%matplotlib inline



In [162]:
df = pd.read_csv('1Telco-Customer-Churn1.csv')

In [163]:
df.shape

(7043, 21)

In [164]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [165]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

TotalCharges is of 'Object' datatype. Looking at the data set it should be of 'float' data type

Convert 'TotalCharges' to numeric data type

In [166]:
pd.to_numeric(df['TotalCharges'])

ValueError: Unable to parse string " " at position 488

ValueError: Unable to parse string " " at position 488. 

In [167]:
df.iloc[488]

customerID                         4472-LVYGI
gender                                 Female
SeniorCitizen                               0
Partner                                   Yes
Dependents                                Yes
tenure                                      0
PhoneService                               No
MultipleLines                No phone service
InternetService                           DSL
OnlineSecurity                            Yes
OnlineBackup                               No
DeviceProtection                          Yes
TechSupport                               Yes
StreamingTV                               Yes
StreamingMovies                            No
Contract                             Two year
PaperlessBilling                          Yes
PaymentMethod       Bank transfer (automatic)
MonthlyCharges                          52.55
TotalCharges                                 
Churn                                      No
Name: 488, dtype: object

The error is because of whitespace in the 'TotalCharges' column. If there is a missing observation pandas would have filled with NaN but since there is a whitespace character the entire feature is converted to string data type.

Replace the whitespace with 'NaN'

In [168]:
df = df.replace('^\s*$',np.nan, regex = True)

In [169]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

Now we are able to see 11 observations as NaN

In [170]:
df.shape

(7043, 21)

In [171]:
df.dropna(axis = 0 ,inplace = True)
df.shape #11 missing observations are removed

(7032, 21)

In [172]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

There are no null values in the data now.

In [173]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [174]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [175]:
df['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

In [176]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [177]:
col = ['Partner','Dependents','PhoneService','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']

Using .value_counts() we can find the number of categories inside a categorical variable.

col_2 indicate variables with two categories and col_3 indicate variables with 3 categories

In [178]:
col_2 = ['Partner','Dependents','PhoneService','PaperlessBilling','Churn']

In [179]:
df['Partner'].value_counts()

No     3639
Yes    3393
Name: Partner, dtype: int64

In [180]:
col_3 = ['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

In [181]:
df['StreamingMovies'].value_counts()

No                     2781
Yes                    2731
No internet service    1520
Name: StreamingMovies, dtype: int64

for all the variables in col_3 the three categories remain same 'Yes','No','No internet service'

In [182]:
df['gender'].replace(('Male','Female'),(1,0),inplace = True)

In [183]:
df_2 = df[col_2]

In [184]:
df_2.head()

Unnamed: 0,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,Yes,No,No,Yes,No
1,No,No,Yes,No,No
2,No,No,Yes,Yes,Yes
3,No,No,No,No,No
4,No,No,Yes,Yes,Yes


In [185]:
for i,j in enumerate(df_2.columns):
    df[j] = df[j].replace(('Yes','No'),(1,0))
    #print(df[i],df[j])

In [186]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [187]:
print(pd.DataFrame(enumerate(df.columns)))

     0                 1
0    0        customerID
1    1            gender
2    2     SeniorCitizen
3    3           Partner
4    4        Dependents
5    5            tenure
6    6      PhoneService
7    7     MultipleLines
8    8   InternetService
9    9    OnlineSecurity
10  10      OnlineBackup
11  11  DeviceProtection
12  12       TechSupport
13  13       StreamingTV
14  14   StreamingMovies
15  15          Contract
16  16  PaperlessBilling
17  17     PaymentMethod
18  18    MonthlyCharges
19  19      TotalCharges
20  20             Churn


In [188]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [189]:
df_3 = df[col_3]

In [190]:
df_3.head()

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No
2,Yes,Yes,No,No,No,No
3,Yes,No,Yes,Yes,No,No
4,No,No,No,No,No,No


In [191]:
for i,j in enumerate(df_3.columns):
    df[j] = df[j].replace(('No internet service','No','Yes'),(0,1,2))

In [192]:
df['MultipleLines'].value_counts()

No                  3385
Yes                 2967
No phone service     680
Name: MultipleLines, dtype: int64

In [193]:
df['MultipleLines'].replace(('No phone service','No','Yes'),(0,1,2),inplace = True)

In [194]:
df['InternetService'].value_counts()

Fiber optic    3096
DSL            2416
No             1520
Name: InternetService, dtype: int64

Fiber optic networks can deliver speeds up to 1 Gpbs (1000 Mbps), whereas DSL speeds typically top out around 6 Mbps. Some domain understanding will help in encoding.

In [195]:
df['InternetService'].replace(('No','DSL','Fiber optic'),(0,1,2),inplace = True)

In [196]:
df['Contract'].value_counts()

Month-to-month    3875
Two year          1685
One year          1472
Name: Contract, dtype: int64

In [197]:
df['Contract'].replace(('Month-to-month','One year','Two year'),(1,2,3),inplace = True)

In [198]:
df['PaymentMethod'].value_counts()

Electronic check             2365
Mailed check                 1604
Bank transfer (automatic)    1542
Credit card (automatic)      1521
Name: PaymentMethod, dtype: int64

We need to do One Hot Encoding for the 'PaymentMethod' for which we will use pd.get_dummies. 'CustomerID' is not useful in predicting the target so we can remove it from the list of independent variables.

In [199]:
del df['customerID']

In [200]:
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [201]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,1,1,2,1,1,1,1,1,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,1,1,2,1,2,1,1,1,2,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,1,1,2,2,1,1,1,1,1,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,0,1,2,1,2,2,1,1,2,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,1,2,1,1,1,1,1,1,1,1,Electronic check,70.7,151.65,1


In [202]:
df.dtypes

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [203]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,1,1,2,1,1,1,1,1,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,1,1,2,1,2,1,1,1,2,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,1,1,2,2,1,1,1,1,1,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,0,1,2,1,2,2,1,1,2,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,1,2,1,1,1,1,1,1,1,1,Electronic check,70.7,151.65,1


In [204]:
df_final = pd.get_dummies(df)

In [205]:
df_final.shape

(7032, 23)

In [206]:
df_final.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [207]:
df_final.iloc[:,15:]

Unnamed: 0,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,0,0,0,1,0
1,0,56.95,1889.50,0,0,0,0,1
2,1,53.85,108.15,1,0,0,0,1
3,0,42.30,1840.75,0,1,0,0,0
4,1,70.70,151.65,1,0,0,1,0
...,...,...,...,...,...,...,...,...
7038,1,84.80,1990.50,0,0,0,0,1
7039,1,103.20,7362.90,0,0,1,0,0
7040,1,29.60,346.45,0,0,0,1,0
7041,1,74.40,306.60,1,0,0,0,1


In [208]:
y = df_final['Churn']
X = df_final.drop('Churn', axis = 1)

import libraries for machine learning

In [209]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score,f1_score,confusion_matrix, accuracy_score

In [210]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size =0.2, shuffle = True)

In [211]:
print(X_train.shape)
print(y_train.shape)

(5625, 22)
(5625,)


logistic regression

In [212]:
lr = LogisticRegression().fit(X_train,y_train)
lr_pred = lr.predict(X_test)

In [213]:
y_test.head(2) 

2648    0
6695    0
Name: Churn, dtype: int64

In [214]:

lr_precision = precision_score(y_test,lr_pred)
lr_recall = recall_score(y_test,lr_pred)
lr_f1 = f1_score(y_test,lr_pred)
lr_ac = accuracy_score(y_test,lr_pred)

In [215]:
print('LR Precision',lr_precision)
print('LR recall', lr_recall)
print('LR F1 score',lr_f1)
print('LR Accuracy', lr_ac)


LR Precision 0.6688311688311688
LR recall 0.5464190981432361
LR F1 score 0.6014598540145984
LR Accuracy 0.8059701492537313
