In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('files_for_lab/Customer-Churn.csv')
data.columns = [columns_name.lower() for columns_name in data.columns]
data.columns = [columns_name.replace(' ', '_') for columns_name in data.columns]
data

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


### Part 1 - Applying SMOTE

In [3]:
X = data.drop('churn', axis=1)
y = data['churn']
X = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)

In [4]:
smote = SMOTE()

X_sm_train, X_sm_test, y_sm_train, y_sm_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_sm_train, y_sm_train = smote.fit_resample(X_sm_train, y_sm_train)

In [5]:
# LogisticRegression

classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_sm_train, y_sm_train)

predictions = classification.predict(X_sm_test)
print(classification_report(y_sm_test, predictions))

              precision    recall  f1-score   support

          No       0.87      0.84      0.85      1035
         Yes       0.59      0.65      0.62       374

    accuracy                           0.79      1409
   macro avg       0.73      0.74      0.74      1409
weighted avg       0.80      0.79      0.79      1409



In [6]:
# DecisionTreeClassifier

classification = DecisionTreeClassifier(random_state=42)
classification.fit(X_sm_train, y_sm_train)

predictions = classification.predict(X_sm_test)
print(classification_report(y_sm_test, predictions))

              precision    recall  f1-score   support

          No       0.84      0.84      0.84      1035
         Yes       0.55      0.55      0.55       374

    accuracy                           0.76      1409
   macro avg       0.69      0.69      0.69      1409
weighted avg       0.76      0.76      0.76      1409



#### Comparisons

The results show that, after using SMOTE, LogisticRegression has been more accurate than DecisionTreeClassifier in this ocassion

### Part 2 - Applying TomekLinks

In [7]:
tomek = TomekLinks()

X_tl_train, X_tl_test, y_tl_train, y_tl_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_tl_train, y_tl_train = tomek.fit_resample(X_tl_train, y_tl_train)

In [8]:
# LogisticRegression

classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_tl_train, y_tl_train)

predictions = classification.predict(X_tl_test)
print(classification_report(y_tl_test, predictions))

              precision    recall  f1-score   support

          No       0.86      0.85      0.86      1035
         Yes       0.60      0.62      0.61       374

    accuracy                           0.79      1409
   macro avg       0.73      0.74      0.73      1409
weighted avg       0.79      0.79      0.79      1409



In [9]:
# DecisionTreeClassifier

classification = DecisionTreeClassifier(random_state=42)
classification.fit(X_tl_train, y_tl_train)

predictions = classification.predict(X_tl_test)
print(classification_report(y_tl_test, predictions))

              precision    recall  f1-score   support

          No       0.85      0.83      0.84      1035
         Yes       0.55      0.58      0.57       374

    accuracy                           0.76      1409
   macro avg       0.70      0.71      0.70      1409
weighted avg       0.77      0.76      0.77      1409



#### Comparisons

Again, the results show that, after using TomekLinks, LogisticRegression has been more accurate than DecisionTreeClassifier in this ocassion