In [39]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, f1_score, cohen_kappa_score


In [40]:
# Load the customer churn dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='customer_churn.csv')

In [41]:
# Normalise the header to lowercase so columns can be referenced consistently,
# then show the resulting column index.
lowercase_names = data.columns.str.lower()
data.columns = lowercase_names
print(data.columns)

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')


In [42]:
# Select only the columns required for the exercise.

# Numerical feature columns used as model inputs
numerical_cols = ['tenure', 'seniorcitizen', 'monthlycharges']

# Target column designation
target_col = 'churn'

# Build the column selection from the two definitions above so the feature
# list is declared in exactly one place.
data = data[numerical_cols + [target_col]]

# Keep the preview as the last expression of the cell: Jupyter only renders the
# value of a cell's final expression, so a mid-cell `data.head()` shows nothing.
data.head()

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Split into features (X) and target (y), reusing the target column name
# declared earlier instead of repeating the literal.
X = data.drop(columns=[target_col])
y = data[target_col]

In [44]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified on y; consider stratify=y so both
# partitions keep the same churn ratio (this would change the reported metrics).
split_parts = train_test_split(X, y, test_size=0.2, random_state=52)
X_train, X_test, y_train, y_test = split_parts


In [45]:
# Min-max scaling of the numerical features.
# The scaler is fitted on the training features only; the same fitted ranges
# are then applied to both the training and the test features.
numerical_transformer = MinMaxScaler()
numerical_transformer.fit(X_train[numerical_cols])

# The scaler returns numpy arrays, so wrap them back into DataFrames that keep
# the original column names and row index.
X_train_num = pd.DataFrame(
    numerical_transformer.transform(X_train[numerical_cols]),
    columns=numerical_cols,
    index=X_train.index,
)
X_test_num = pd.DataFrame(
    numerical_transformer.transform(X_test[numerical_cols]),
    columns=numerical_cols,
    index=X_test.index,
)

# Only numerical features are used here, so the preprocessed feature sets are
# simply copies of the scaled frames.
X_train_preprocessed = X_train_num.copy()
X_test_preprocessed = X_test_num.copy()

In [46]:
# Baseline model: logistic regression fitted on the scaled training features
# (iteration cap raised to 600).
log_reg = LogisticRegression(max_iter=600)
log_reg.fit(X=X_train_preprocessed, y=y_train)

In [47]:
# Predict churn labels for the held-out test features with the baseline model.
y_pred = log_reg.predict(X_test_preprocessed)

# Print the per-class report first, then the confusion matrix.
for metric_fn in (classification_report, confusion_matrix):
    print(metric_fn(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.83      0.91      0.87      1027
         Yes       0.67      0.48      0.56       382

    accuracy                           0.79      1409
   macro avg       0.75      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409

[[934  93]
 [197 185]]


In [48]:
# Performance on the "Yes" class is clearly weaker than on "No": precision 0.67 vs 0.83, recall 0.48 vs 0.91 and F1 0.56 vs 0.87. This is consistent with the class imbalance (382 "Yes" vs 1027 "No" in the test set): the model favours the majority class "No".

In [52]:
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=56)

# Resample the preprocessed (scaled) training features, not the raw X_train.
resampled_pair = smote.fit_resample(X_train_preprocessed, y_train)
X_train_smote, y_train_smote = resampled_pair


In [53]:
# Second logistic regression, same settings as the baseline, fitted on the
# SMOTE-resampled training data.
smote_model = LogisticRegression(max_iter=600)
smote_model.fit(X=X_train_smote, y=y_train_smote)

In [54]:
# Predict on the same scaled test features used for the baseline model.
y_pred_smote = smote_model.predict(X=X_test_preprocessed)


In [55]:
# Print the per-class report followed by the confusion matrix for the SMOTE model.
print(
    classification_report(y_test, y_pred_smote),
    confusion_matrix(y_test, y_pred_smote),
    sep="\n",
)

              precision    recall  f1-score   support

          No       0.89      0.74      0.81      1027
         Yes       0.52      0.75      0.62       382

    accuracy                           0.75      1409
   macro avg       0.71      0.75      0.71      1409
weighted avg       0.79      0.75      0.76      1409

[[763 264]
 [ 95 287]]


In [None]:
# With SMOTE, precision for "Yes" decreased (0.67 -> 0.52) and recall for "No" dropped (0.91 -> 0.74), but recall (0.48 -> 0.75) and F1 (0.56 -> 0.62) for "Yes" improved, although the indicators are still weak.
# The model now gives more balanced predictions across the two classes, but it should be improved further to obtain balanced indicators and a confusion matrix with fewer false positives and false negatives.