In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
churnData = pd.read_csv('files_for_lab/Customer-Churn.csv')
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
churnData['TotalCharges'] = churnData['TotalCharges'].replace(' ', np.nan) 

In [4]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'])

In [5]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].median())

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

### Apply SMOTE for upsampling the data

In [8]:
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData[['Churn']]

In [9]:
y['Churn'] = np.where(y['Churn'] == 'No', 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Churn'] = np.where(y['Churn'] == 'No', 0, 1)


In [12]:
sm = SMOTE(k_neighbors=3)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [13]:
scalar = StandardScaler()

In [14]:
y_train_SMOTE.value_counts()/len(y_train_SMOTE)

Churn
0        0.5
1        0.5
dtype: float64

### Use logistic regression to fit the model and compute the accuracy of the model.

In [15]:
model = LogisticRegression()
model.fit(X_train_SMOTE, y_train_SMOTE)

  y = column_or_1d(y, warn=True)


In [16]:
pred_train_SMOTE = model.predict(X_train_SMOTE)
pred_test_SMOTE = model.predict(X_test)
print(classification_report(y_train_SMOTE, pred_train_SMOTE))
print(classification_report(y_test, pred_test_SMOTE))

              precision    recall  f1-score   support

           0       0.75      0.66      0.71      4138
           1       0.70      0.78      0.74      4138

    accuracy                           0.72      8276
   macro avg       0.73      0.72      0.72      8276
weighted avg       0.73      0.72      0.72      8276

              precision    recall  f1-score   support

           0       0.91      0.67      0.78      1036
           1       0.48      0.83      0.60       373

    accuracy                           0.71      1409
   macro avg       0.70      0.75      0.69      1409
weighted avg       0.80      0.71      0.73      1409



### Use decision tree classifier to fit the model and compute the accuracy of the model.

In [19]:
model = DecisionTreeClassifier()
model.fit(X_train_SMOTE, y_train_SMOTE)

In [20]:
pred_train_SMOTE = model.predict(X_train_SMOTE)
pred_test_SMOTE = model.predict(X_test)
print(classification_report(y_train_SMOTE, pred_train_SMOTE))
print(classification_report(y_test, pred_test_SMOTE))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4138
           1       1.00      0.99      0.99      4138

    accuracy                           0.99      8276
   macro avg       0.99      0.99      0.99      8276
weighted avg       0.99      0.99      0.99      8276

              precision    recall  f1-score   support

           0       0.82      0.74      0.78      1036
           1       0.43      0.54      0.48       373

    accuracy                           0.69      1409
   macro avg       0.62      0.64      0.63      1409
weighted avg       0.72      0.69      0.70      1409



### Compare the accuracies of the two models.

In [39]:
# Overfitting: decision tree has a great performance in the train set but low in the test set
# Logistic Regression has a slightly better accuracy and fits better between train and test sets

### Apply TomekLinks for downsampling

In [24]:
tl = TomekLinks()

X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [25]:
y_train_tl.value_counts()/len(y_train_tl)

Churn
0        0.710694
1        0.289306
dtype: float64

In [26]:
model = LogisticRegression()
model.fit(X_train_tl, y_train_tl)

  y = column_or_1d(y, warn=True)


In [27]:
pred_train_tl = model.predict(X_train_tl)
pred_test_tl = model.predict(X_test)
print(classification_report(y_train_tl, pred_train_tl))
print(classification_report(y_test, pred_test_tl))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      3675
           1       0.67      0.52      0.59      1496

    accuracy                           0.79      5171
   macro avg       0.75      0.71      0.72      5171
weighted avg       0.78      0.79      0.78      5171

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1036
           1       0.62      0.56      0.59       373

    accuracy                           0.79      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.79      0.79      0.79      1409



### Use decision tree classifier to fit the model and compute the accuracy of the model.

In [34]:
tl = TomekLinks()

X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [35]:
y_train_tl.value_counts()/len(y_train_tl)

Churn
0        0.710694
1        0.289306
dtype: float64

In [36]:
model = DecisionTreeClassifier()
model.fit(X_train_tl, y_train_tl)

In [37]:
pred_train_tl = model.predict(X_train_tl)
pred_test_tl = model.predict(X_test)
print(classification_report(y_train_tl, pred_train_tl))
print(classification_report(y_test, pred_test_tl))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3675
           1       0.99      0.97      0.98      1496

    accuracy                           0.99      5171
   macro avg       0.99      0.98      0.99      5171
weighted avg       0.99      0.99      0.99      5171

              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1036
           1       0.49      0.55      0.52       373

    accuracy                           0.73      1409
   macro avg       0.66      0.67      0.66      1409
weighted avg       0.74      0.73      0.73      1409



### Compare the accuracies of the two models.

In [40]:
# Overfitting: decision tree has a great performance in the train set but significantly lower in the test set
# Logistic Regression has hugher accuracy and fits better between train and test sets

### You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [42]:
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [43]:
y_train_tl.value_counts()/len(y_train_tl)

Churn
0        0.710694
1        0.289306
dtype: float64

In [44]:
### Distribution didn´t change if applying again