In [15]:
import imblearn
import pandas as pd
import numpy as np

In [16]:
#Load the dataset and explore the variables.
data = pd.read_csv('customer_churn.csv')

In [17]:
data.sample(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2621,8766-PAFNE,Male,0,Yes,No,71,Yes,No,DSL,No,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),79.1,5564.85,No
3008,8084-OIVBS,Female,0,No,No,11,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,20.0,211.95,No
2883,7225-IILWY,Male,0,Yes,Yes,68,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,1686.15,No
953,0851-DFJKB,Female,0,No,No,15,Yes,No,DSL,Yes,...,Yes,Yes,No,No,Month-to-month,No,Electronic check,58.95,955.15,No
1879,3494-JCHRQ,Male,0,No,No,1,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.05,70.05,Yes
3127,1432-FPAXX,Female,0,No,No,29,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,30.6,856.35,Yes
2063,7833-PKIHD,Male,0,Yes,Yes,51,Yes,No,DSL,Yes,...,No,Yes,Yes,Yes,One year,Yes,Bank transfer (automatic),76.4,3966.3,No
4390,0377-JBKKT,Male,0,Yes,Yes,22,Yes,No,DSL,Yes,...,No,Yes,No,No,Month-to-month,Yes,Mailed check,57.95,1271.8,No
6055,6599-SFQVE,Female,0,No,No,6,Yes,No,DSL,No,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.0,340.4,No
1260,9992-UJOEL,Male,0,No,No,2,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,50.3,92.75,No


In [18]:
#We will try to predict variable Churn using a 
#logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.
data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


In [19]:
data['Churn'] = pd.get_dummies(data['Churn'])
data['Churn']

0       1
1       1
2       0
3       1
4       0
       ..
7038    1
7039    1
7040    1
7041    0
7042    1
Name: Churn, Length: 7043, dtype: uint8

In [20]:
#Extract the target variable.
#Extract the independent variables and scale them.
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data[['Churn']]

In [21]:
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X)
X_scaled = pd.DataFrame(transformer.transform(X),columns=X.columns)
X_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.277445,-0.439916,-1.160323
1,0.066327,-0.439916,-0.259629
2,-1.236724,-0.439916,-0.36266
3,0.514251,-0.439916,-0.746535
4,-1.236724,-0.439916,0.197365


In [22]:
#Build the logistic regression model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0, solver='saga')
LR.fit(X, y)

  return f(**kwargs)


LogisticRegression(random_state=0, solver='saga')

In [24]:
#Evaluate the model.
predictions = LR.predict(X_test)
LR.score(X_test, y_test)

0.8041163946061036

In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))


precision:  0.8327495621716288
recall:  0.917953667953668
f1:  0.8732782369146005


In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[182, 191],
       [ 85, 951]], dtype=int64)

In [None]:
#Even a simple model will give us more than 70% accuracy. Why?

In [32]:
#Synthetic Minority Oversampling Technique (SMOTE) is an over sampling technique based on
#nearest neighbors that adds new points between existing points. 
#Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic 
#regression model. Is it there any improvement?
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, sampling_strategy ='minority',k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_sample(X_train,y_train)

In [33]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.8970760233918129
recall:  0.7403474903474904
f1:  0.8112109994711794


  return f(**kwargs)


In [34]:
confusion_matrix(y_test,pred)

array([[285,  88],
       [269, 767]], dtype=int64)

In [None]:
##### VERDICT - There is no improvement with the new model, it predicted less correct answers and it had a higher rate of 
# 'false negative' results, which is the opposite of what we wanted to predict (which clients will churn).