In [104]:
#Import the required libraries and modules that you would need.

import pandas as pd
import numpy as np
import warnings
import statsmodels.api as sm
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler


pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [105]:
#read that data into Python and call the dataframe churnData

churndata = pd.read_csv('Customer-Churn.csv')
churndata

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [106]:
#Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type. 
#Convert this column into numeric type using pd.to_numeric function.

In [107]:
churndata.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [108]:
churndata['totalcharges'] = pd.to_numeric(churndata['TotalCharges'], errors='coerce')

In [109]:
churndata.dtypes #sanity check

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
totalcharges        float64
dtype: object

In [110]:
#Check for null values in the dataframe. Replace the null values.
churndata.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges         0
Churn                0
totalcharges        11
dtype: int64

In [113]:
#totalcharges has 11 nan values

In [114]:
# creating bool series True for NaN values 
bs = pd.isnull(churndata["totalcharges"]) 
    
# filtering data to show total charges = NaN 
churndata[bs] 

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn,totalcharges
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No,
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No,
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No,
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No,
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No,
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No,
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No,
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No,
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No,
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No,


In [59]:
#Comment: I check the original values on the dataframe for totalcharges column. 
#It seems that tenure multiplied by monthly fees is how it is calculated the total charge, since all of this rows have 0 tenure, I'll fill the nans with 0


In [115]:
churndata['totalcharges'] = churndata['totalcharges'].fillna(0)

In [116]:
churndata.isnull().values.sum()

0

* Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:
    * Scale the features either by using normalizer or a standard scaler.
    * Split the data into a training set and a test set.
    * Fit a logistic regression model on the training data.
    * Check the accuracy on the test data.

In [118]:
churndata.select_dtypes(np.number)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,totalcharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7038,0,24,84.80,1990.50
7039,0,72,103.20,7362.90
7040,0,11,29.60,346.45
7041,1,4,74.40,306.60


In [119]:
churndata['tenure'].dtypes

dtype('int64')

In [120]:
#as tenure apears as int, it has to be coverted to float
churndata['tenure'] = churndata['tenure'].astype(float)

In [121]:
churndata['tenure'].dtypes

dtype('float64')

In [122]:
#Split the data

y = churndata['Churn']

num = churndata.select_dtypes(np.number)

In [123]:
num

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,totalcharges
0,0,1.0,29.85,29.85
1,0,34.0,56.95,1889.50
2,0,2.0,53.85,108.15
3,0,45.0,42.30,1840.75
4,0,2.0,70.70,151.65
...,...,...,...,...
7038,0,24.0,84.80,1990.50
7039,0,72.0,103.20,7362.90
7040,0,11.0,29.60,346.45
7041,1,4.0,74.40,306.60


In [124]:
transformer = StandardScaler().fit(num)
X_stn = transformer.transform(num)
X_stn.shape

(7043, 4)

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_stn, y, test_size=0.3, random_state=42)

In [126]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)
predictions = classification.predict(X_test)
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



In [127]:
classification.score(X_test, y_test)

0.7941315664931378

* Note: So far we have not balanced the data.

* Managing imbalance in the dataset

    * Check for the imbalance.
    * Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
    * Each time fit the model and see how the accuracy of the model is.

In [128]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [None]:
#upsampling

In [129]:
churndata.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'totalcharges'],
      dtype='object')

In [130]:
y_no = churndata[churndata['Churn'] == 'No']
y_yes = churndata[churndata['Churn'] == 'Yes']

y_yes = y_yes.sample(len(y_no), replace=True) # sampling y_yes values to the lenght of y_no values 
(y_yes.shape)


(5174, 17)

In [131]:
data = pd.concat([y_no, y_yes], axis=0)

#shuffling the data
data = data.sample(frac=1)
print(data['Churn'].value_counts())

#Now data is balance

No     5174
Yes    5174
Name: Churn, dtype: int64


In [132]:
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn,totalcharges
2724,Male,0,Yes,Yes,62.0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.95,1244.8,No,1244.80
663,Male,0,Yes,Yes,65.0,Yes,Yes,Yes,No,No,No,Yes,Two year,69.55,4459.15,No,4459.15
1791,Female,0,Yes,No,44.0,Yes,Yes,Yes,Yes,No,No,No,One year,61.50,2722.2,No,2722.20
6557,Female,0,No,No,33.0,Yes,No,Yes,No,No,Yes,Yes,Month-to-month,100.00,3320.6,No,3320.60
1615,Female,0,Yes,No,49.0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,20.05,923.1,No,923.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6687,Female,1,Yes,No,2.0,Yes,No,No,Yes,No,No,No,Month-to-month,79.20,172.85,Yes,172.85
3175,Male,1,Yes,No,17.0,Yes,No,No,No,No,No,Yes,Month-to-month,81.50,1329.2,Yes,1329.20
3258,Male,0,No,No,1.0,No,No,No,No,No,No,No,Month-to-month,24.40,24.4,No,24.40
1780,Female,0,No,Yes,18.0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,20.15,390.85,Yes,390.85


In [133]:
y = data['Churn']

num = data.select_dtypes(np.number)

In [134]:
transformer = StandardScaler().fit(num)
X_stan = transformer.transform(num)
X_stan.shape

(10348, 4)

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X_stan, y, test_size=0.3, random_state=42)

In [136]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)
predictions = classification.predict(X_test)
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.74      0.72      0.73      1576
         Yes       0.72      0.74      0.73      1529

    accuracy                           0.73      3105
   macro avg       0.73      0.73      0.73      3105
weighted avg       0.73      0.73      0.73      3105



In [137]:
classification.score(X_test, y_test)

0.7294685990338164

In [None]:
#Comment: with upsampling strategy the model have lower scores but higher precision when calculating yes

In [None]:
#downsampling

In [146]:
y_no = churndata[churndata['Churn'] == 'No']
y_yes = churndata[churndata['Churn'] == 'Yes']


y_no = y_no.sample(len(y_yes)) # sampling y_no values to the lenght of y_yes values 
print(y_no.shape)
print(y_yes.shape)

print("------------------")

data2 = pd.concat([y_no, y_yes], axis=0)

#shuffling the data

data2 = data2.sample(frac=1)
data2['Churn'].value_counts()

(1869, 17)
(1869, 17)
------------------


No     1869
Yes    1869
Name: Churn, dtype: int64

In [147]:
data2.isnull().values.sum()

0

In [148]:
y = data2['Churn']


num = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'totalcharges']
num = data2[num]


In [149]:
transformer = StandardScaler().fit(num)
X_stan = transformer.transform(num)
X_stan.shape

(3738, 4)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_stan, y, test_size=0.3, random_state=42)

In [151]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)
predictions = classification.predict(X_test)
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.75      0.77      0.76       553
         Yes       0.77      0.75      0.76       569

    accuracy                           0.76      1122
   macro avg       0.76      0.76      0.76      1122
weighted avg       0.76      0.76      0.76      1122



In [152]:
classification.score(X_test, y_test)

0.7566844919786097

In [None]:
#Comment: comparing the results from upsampling vs downsampling we see
#There is a difference of 0.01 in precision, recall and f1-score between upsampling and downsampling.

#In the classification.score we get 0.729 when upsampling, and 0.756 when downsampling. 