##Lab - Imbalanced Data

In [10]:
#We will try to predict variable Churn using a logistic regression on variables 
#*tenure*, 
#*SeniorCitizen*,
#*MonthlyCharges*.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [11]:
#Load the dataset and explore the variables.
data = pd.read_csv('/Users/caitlinsanderson/Documents/ironhack_course_work/Sian_repo/Notes/unit_3/fri_6_nov/customer_churn.csv')

In [12]:
data.shape

(7043, 21)

In [13]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [20]:
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [23]:
data['Churn'].value_counts().to_frame()

Unnamed: 0,Churn
No,5174
Yes,1869


In [24]:
data['tenure'].value_counts().to_frame()

Unnamed: 0,tenure
1,613
72,362
2,238
3,200
4,176
...,...
28,57
39,56
44,51
36,50


In [25]:
data['tenure'].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39])

In [26]:
data['SeniorCitizen'].value_counts().to_frame()

Unnamed: 0,SeniorCitizen
0,5901
1,1142


In [27]:
data['MonthlyCharges'].value_counts().to_frame()

Unnamed: 0,MonthlyCharges
20.05,61
19.85,45
19.95,44
19.90,44
20.00,43
...,...
114.75,1
103.60,1
113.40,1
57.65,1


In [29]:
data['MonthlyCharges'].unique()

array([29.85, 56.95, 53.85, ..., 63.1 , 44.2 , 78.7 ])

In [37]:
#Extract the target variable.
y = pd.DataFrame(data=data, columns=['Churn'])

In [38]:
#Extract the independent variables and scale them.
X = data[['tenure', 'SeniorCitizen','MonthlyCharges']]

In [39]:
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)

In [43]:
#Build the logistic regression model.
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(scaled_x, data['Churn'])

#Evaluate the model.
#Even a simple model will give us more than 70% accuracy. Why?
model.score(scaled_x, data['Churn'])

0.7911401391452506

In [44]:
#Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors 
    #that adds new points between existing points. 
#Apply imblearn.over_sampling.SMOTE to the dataset. 
#Build and evaluate the logistic regression model. Is it there any improvement?

from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_sm, y_sm = smote.fit_sample(scaled_x, y)
y_sm.value_counts()

1    5174
0    5174
Name: Churn, dtype: int64

In [45]:
model = LogisticRegression(random_state=0, solver='lbfgs',multi_class='ovr').fit(X_sm, y_sm)

In [46]:
model.score(scaled_x, y)

0.7306545506176345

In [47]:
from sklearn.metrics import confusion_matrix

y_pred=model.predict(X_sm)
confusion_matrix(y_pred, y_sm)

y_pred=model.predict(scaled_x)
confusion_matrix(y_pred, y)

array([[3775,  498],
       [1399, 1371]])

In [None]:
#Tomek links are pairs of very close instances, but of opposite classes. 
#Removing the instances of the majority class of each pair increases the space between the two classes, 
    #facilitating the classification process. 
#Apply imblearn.under_sampling.TomekLinks to the dataset. 
#Build and evaluate the logistic regression model. Is it there any improvement?

In [48]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(X, y)
y_tl.value_counts()

0    4711
1    1869
Name: Churn, dtype: int64

In [49]:
model = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_tl, y_tl)

In [50]:
model.score(X_tl, y_tl)

0.7908814589665654

In [51]:
y_pred=model.predict(X_tl)
confusion_matrix(y_pred, y_tl)

array([[4232,  897],
       [ 479,  972]])

In [52]:
model.score(scaled_x, y)

0.7346301292063041

In [53]:
y_pred=model.predict(scaled_x)
confusion_matrix(y_pred, y)

array([[5174, 1869],
       [   0,    0]])