In [11]:
import imblearn
import pandas as pd
import numpy as np

In [12]:
#Load the dataset and explore the variables.
data = pd.read_csv('customer_churn.csv')

In [13]:
data.sample(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
863,9944-HKVVB,Female,0,No,No,3,Yes,No,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.1,307.4,Yes
5747,9730-DRTMJ,Male,0,Yes,No,32,Yes,Yes,DSL,Yes,...,No,Yes,Yes,No,One year,Yes,Credit card (automatic),72.8,2333.05,No
1940,4890-VMUAV,Male,0,No,No,63,Yes,Yes,DSL,Yes,...,Yes,Yes,No,No,One year,No,Electronic check,71.5,4576.3,No
5148,6196-HBOBZ,Male,0,Yes,No,65,Yes,Yes,Fiber optic,Yes,...,Yes,No,Yes,No,Two year,Yes,Electronic check,99.35,6347.55,No
5569,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,Fiber optic,No,...,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,Yes
4605,3796-ENZGF,Male,0,Yes,No,63,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Two year,No,Mailed check,67.25,4234.15,No
6433,2346-LOCWC,Female,0,Yes,Yes,58,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),20.5,1191.4,No
2765,9506-UXUSK,Male,0,No,No,13,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.15,931.75,No
2478,7878-JGDKK,Male,0,No,No,4,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,No,Mailed check,44.55,220.75,No
1682,9225-BZLNZ,Male,0,Yes,No,72,Yes,Yes,DSL,Yes,...,Yes,No,Yes,Yes,Two year,Yes,Electronic check,85.25,6083.1,No


In [49]:
#We will try to predict variable Churn using a 
#logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.
data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


In [36]:
data['Churn'] = pd.get_dummies(data['Churn'])
data['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: uint8

In [37]:
#Extract the target variable.
#Extract the independent variables and scale them.
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data[['Churn']]

In [38]:
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X)
X_scaled = pd.DataFrame(transformer.transform(X),columns=X.columns)
X_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.277445,-0.439916,-1.160323
1,0.066327,-0.439916,-0.259629
2,-1.236724,-0.439916,-0.36266
3,0.514251,-0.439916,-0.746535
4,-1.236724,-0.439916,0.197365


In [39]:
#Build the logistic regression model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0, solver='saga')
LR.fit(X, y)

  return f(**kwargs)


LogisticRegression(random_state=0, solver='saga')

In [41]:
#Evaluate the model.
predictions = LR.predict(X_test)
LR.score(X_test, y_test)

0.8041163946061036

In [42]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))


precision:  0.6816479400749064
recall:  0.4879356568364611
f1:  0.56875


In [None]:
#Even a simple model will give us more than 70% accuracy. Why?

In [46]:
#Synthetic Minority Oversampling Technique (SMOTE) is an over sampling technique based on
#nearest neighbors that adds new points between existing points. 
#Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic 
#regression model. Is it there any improvement?
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=100, sampling_strategy ='minority',k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_sample(X_train,y_train)

In [47]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.5145454545454545
recall:  0.7587131367292225
f1:  0.6132177681473456


  return f(**kwargs)
