In [48]:
import pandas as pd                   
import numpy as np                   
import matplotlib.pyplot as plt       
import seaborn as sns               
import statsmodels.api as sm         
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler

# 1. load the dataset

1.1 Dataset reading

In [49]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


1.2. Getting data types 

In [50]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

# 2. change 'object' to numerical where required 

since 'total charges' is an object-type column, we want to convert it to numerical (float)

In [51]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [52]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

we can see now that 'total charges' is a numeric (float 64) data type 

# 3. Deal with null values 

In [53]:
# check for null values
print(churnData.isnull().sum())

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


we see that there are initially 11 null values in 'TotalCharges' (7032 are non-null)

In [54]:
# this way we replace null values with cero

churnData['TotalCharges'].fillna(0, inplace=True)

# check for null values after replacement
print(churnData.isnull().sum())

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [55]:
# we see that after replacement there are no 'null values'

# 4. machine learning 

4.1. normalising

we scale and normalise the selectec features, 'tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges'

In [56]:
selected_features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[selected_features]

# scale with normalizer
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

# create the normalised dataset 
churnData_normalized = pd.DataFrame(X_normalized, columns=selected_features)

# check for values in the normalised dataset 
print("Normalized Data:")
print(churnData_normalized.head())

Normalized Data:
     tenure  SeniorCitizen  MonthlyCharges  TotalCharges
0  0.023682            0.0        0.706908      0.706908
1  0.017983            0.0        0.030122      0.999384
2  0.016552            0.0        0.445662      0.895048
3  0.024433            0.0        0.022967      0.999438
4  0.011952            0.0        0.422512      0.906279


4.2. scaling

In [57]:
selected_features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[selected_features]

# apply standard scaler 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# create dataset with scaled variables 
churnData_scaled = pd.DataFrame(X_scaled, columns=selected_features)

#check for dataset with scaled variables 
print("\nScaled Data:")
print(churnData_scaled.head())


Scaled Data:
     tenure  SeniorCitizen  MonthlyCharges  TotalCharges
0 -1.277445      -0.439916       -1.160323     -0.992611
1  0.066327      -0.439916       -0.259629     -0.172165
2 -1.236724      -0.439916       -0.362660     -0.958066
3  0.514251      -0.439916       -0.746535     -0.193672
4 -1.236724      -0.439916        0.197365     -0.938874


4.3. train-test 

In [58]:
# we create the separate dataframes for the target and predictor variables

selected_features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[selected_features]
y = churnData['Churn']

# scale features with standard scaler 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# split into train and test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# print train and test shape 
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5634, 4)
y_train shape: (5634,)
X_test shape: (1409, 4)
y_test shape: (1409,)


4.4. logistic regression and test model accuracy 

In [59]:
# we create a logistic regression model and fit into the training data 

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [60]:
# assess the accuracy of the test model 

y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8062455642299503


Test model is highly accurate from the analysis that we have performed 

# 5. handling imbalanced model 

5.1. checking for imbalance

In [61]:
# count occurrences of each class ('yes' and 'no')
class_counts = churnData['Churn'].value_counts()

# print class counts
print(class_counts)

No     5174
Yes    1869
Name: Churn, dtype: int64


From the class counts, we can tell that there is an important imbalance between 'yes' and 'no' in the "churn" variable. By applying oversampling to the minority class and undersampling to the majority class we might achieve a more balanced model 

5.2. data resample

In [62]:
# create SMOTE pipeline for over-sampling 
pipeline = make_pipeline(
    SMOTE(random_state=42),
    RandomUnderSampler(random_state=42),
    LogisticRegression()
)

# fit pipeline on the training data
pipeline.fit(X_train, y_train)

# make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7430801987224982


In this case, applying SMOTE for over-sampling does not improve the accuracy score of the model 
(0.7430801987224982 vs. 0.8062455642299503) 

In [64]:
# apply random under sampling 
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# create a logistic regression model
logreg = LogisticRegression()

# fit the model on the undersampled data
logreg.fit(X_train_resampled, y_train_resampled)

# make predictions on the test set
y_pred = logreg.predict(X_test)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7452093683463449


5.3. balanced model accuracy assessment 

In this case, applying under-sampling does not improve the accuracy score of the model 
(0.7452093683463449 vs. 0.8062455642299503), eventhough is slightly higher compared to over-sampling accuracy score (0.7430801987224982). 