In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
#Balancing the data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

Instructions:
    
    * Load the dataset and explore the variables.

In [5]:
df = pd.read_csv("customer_churn.csv")

In [6]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [7]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [8]:
df.Churn.value_counts()/len(df)
# Yes is the minority class

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [9]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


* We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [11]:
df.tenure.value_counts()

1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64

In [12]:
df.SeniorCitizen.value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [13]:
df.MonthlyCharges.value_counts()

20.05     61
19.85     45
19.95     44
19.90     44
20.00     43
          ..
23.65      1
114.70     1
43.65      1
87.80      1
78.70      1
Name: MonthlyCharges, Length: 1585, dtype: int64

* Extract the target variable.

In [14]:
y = df.Churn

* Extract the independent variables and scale them.

In [15]:
X = df.loc[: , ['tenure', 'SeniorCitizen','MonthlyCharges']]

In [16]:
X.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.3
4,2,0,70.7


In [17]:
y.value_counts() 

No     5174
Yes    1869
Name: Churn, dtype: int64

In [18]:
#Transform the target variable to a boolean type

y = pd.get_dummies (y, drop_first = True)

In [19]:
y.value_counts() 
#0 is No
#1 is Yes

Yes
0      5174
1      1869
dtype: int64

In [20]:
#Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [21]:
#Scale the variables

In [22]:
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

* Build the logistic regression model.

In [23]:
from sklearn.metrics import confusion_matrix, classification_report

In [25]:
model = LogisticRegression()
model.fit(X_train, y_train)

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
print(classification_report(y_test, pred_test))
print(classification_report(y_train, pred_train))


              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1298
           1       0.61      0.46      0.52       463

    accuracy                           0.78      1761
   macro avg       0.72      0.68      0.69      1761
weighted avg       0.77      0.78      0.77      1761

              precision    recall  f1-score   support

           0       0.82      0.91      0.87      3876
           1       0.65      0.47      0.54      1406

    accuracy                           0.79      5282
   macro avg       0.74      0.69      0.71      5282
weighted avg       0.78      0.79      0.78      5282



  y = column_or_1d(y, warn=True)


In [None]:
* Evaluate the model:
    ** Our model has higher scores for the two label on the train than on the test, what means that the model it's too simple: it's not learning from the data and can't predict our target variable.
    ** Altough the scores are higher when predicting 0 than 1, this could be because of the imbalance of the data. 

* Even a simple model will give us more than 70% accuracy. Why?
    ** Accuracy is just counting the corrected predictions among all the data. This can lead us to think that the predictions are better than they really are
    

* Synthetic Minority Oversampling Technique (SMOTE) is an over sampling technique based on nearest neighbors that adds new points between existing points. 
* Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

In [34]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(k_neighbors=3)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [35]:
print(X_train.shape)
print(X_train_SMOTE.shape)

(5282, 3)
(7752, 3)


In [37]:
y_train.value_counts()/len(y_train)

Yes
0      0.733813
1      0.266187
dtype: float64

In [38]:
y_train_SMOTE.value_counts()/len(y_train_SMOTE)

Yes
0      0.5
1      0.5
dtype: float64

In [36]:
model = LogisticRegression()
model.fit(X_train_SMOTE, y_train_SMOTE)

pred_train_SMOTE = model.predict(X_train_SMOTE)
pred_test_SMOTE = model.predict(X_test)
print(classification_report(y_test, pred_test_SMOTE))
print(classification_report(y_train_SMOTE, pred_train_SMOTE))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79      1298
           1       0.48      0.72      0.57       463

    accuracy                           0.72      1761
   macro avg       0.68      0.72      0.68      1761
weighted avg       0.77      0.72      0.73      1761

              precision    recall  f1-score   support

           0       0.74      0.73      0.74      3876
           1       0.74      0.74      0.74      3876

    accuracy                           0.74      7752
   macro avg       0.74      0.74      0.74      7752
weighted avg       0.74      0.74      0.74      7752



  y = column_or_1d(y, warn=True)


In [None]:
* Evaluate the model:
    ** In this case, precision and f1-score of test are better than train when predicting label 0. But recall is higher on train than on test, which means among all the churn, the model couldn't predict the 0s.
    ** There is still a big difference between train vs test when predicting label 1. The model can't predict this label.



* Tomek links are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. Apply imblearn.under_sampling.TomekLinks to the dataset. 
* Build and evaluate the logistic regression model. Is it there any improvement?

In [39]:
from imblearn.under_sampling import TomekLinks

In [40]:
y_train

Unnamed: 0,Yes
3296,0
6397,0
6043,1
5309,0
3000,1
...,...
4931,0
3264,0
1653,0
2607,1


In [44]:
X= X_train
y= y_train

In [45]:
tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [46]:
display(X_train_tl.shape)
X_train.shape

(4921, 3)

(5282, 3)

In [47]:
y_train_tl.value_counts()/len(y_train_tl)

Yes
0      0.714286
1      0.285714
dtype: float64

In [48]:
y_train.value_counts()

Yes
0      3876
1      1406
dtype: int64

In [49]:
model = LogisticRegression()
model.fit(X_train_tl, y_train_tl)

pred_train_tl = model.predict(X_train_tl)
pred_test_tl = model.predict(X_test)
print(classification_report(y_test, pred_test_tl))
print(classification_report(y_train_tl, pred_train_tl))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1298
           1       0.56      0.52      0.54       463

    accuracy                           0.77      1761
   macro avg       0.70      0.69      0.69      1761
weighted avg       0.76      0.77      0.76      1761

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      3515
           1       0.68      0.53      0.60      1406

    accuracy                           0.79      4921
   macro avg       0.75      0.72      0.73      4921
weighted avg       0.79      0.79      0.79      4921



  y = column_or_1d(y, warn=True)


In [None]:
** When applying TomekLinks the metrics for train are still higher than for test, now even for label 0.
** The model is still basic to be able to learn or predict from the data. 
