In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from numpy import arange
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from fpdf import FPDF
from sklearn.pipeline import Pipeline
import pickle
from datetime import datetime as dt
from datetime import timedelta
from imblearn.over_sampling import SMOTE

In [2]:
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
df_churn = pd.read_pickle('./data/training/churn_practise.pickle')

In [4]:
df_churn.head()

Unnamed: 0,SENIOR_CITIZEN,TENURE,MONTHLY_CHARGES,CHURN_STATUS,GENDER_F,GENDER_M
0,0,1,29.85,0,1,0
1,0,34,56.95,0,0,1
2,0,2,53.85,1,0,1
3,0,45,42.3,0,0,1
4,0,2,70.7,1,1,0


In [5]:
df = df_churn.copy()

In [6]:
df.shape

(7043, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SENIOR_CITIZEN   7043 non-null   int64  
 1   TENURE           7043 non-null   int64  
 2   MONTHLY_CHARGES  7043 non-null   float64
 3   CHURN_STATUS     7043 non-null   int64  
 4   GENDER_F         7043 non-null   uint8  
 5   GENDER_M         7043 non-null   uint8  
dtypes: float64(1), int64(3), uint8(2)
memory usage: 288.9 KB


In [8]:
df.columns

Index(['SENIOR_CITIZEN', 'TENURE', 'MONTHLY_CHARGES', 'CHURN_STATUS',
       'GENDER_F', 'GENDER_M'],
      dtype='object')

In [9]:
df.describe()

Unnamed: 0,SENIOR_CITIZEN,TENURE,MONTHLY_CHARGES,CHURN_STATUS,GENDER_F,GENDER_M
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,0.26537,0.495244,0.504756
std,0.368612,24.559481,30.090047,0.441561,0.500013,0.500013
min,0.0,0.0,18.25,0.0,0.0,0.0
25%,0.0,9.0,35.5,0.0,0.0,0.0
50%,0.0,29.0,70.35,0.0,0.0,1.0
75%,0.0,55.0,89.85,1.0,1.0,1.0
max,1.0,72.0,118.75,1.0,1.0,1.0


In [10]:
X = df.drop('CHURN_STATUS', axis= 1)
y = df['CHURN_STATUS'].values

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.2, random_state= 0)
X_train,X_val,y_train,y_val =train_test_split (X_train,y_train, test_size = 0.25, random_state=0)

In [12]:
df.CHURN_STATUS.value_counts()

0    5174
1    1869
Name: CHURN_STATUS, dtype: int64

In [13]:
sm = SMOTE()

In [14]:
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

In [15]:
X_train.shape

(4225, 5)

In [16]:
X_train_smote.shape

(6178, 5)

In [17]:
y_train_smote.shape

(6178,)

In [18]:
(unique, counts) = np.unique(y_train_smote, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[   0 3089]
 [   1 3089]]


In [19]:
start = dt.now()
print(" Starting ------- Logistic Regression")
pipline = Pipeline([
    ('logistic', LogisticRegression())
])
param_grid ={
    'logistic__penalty':('l1', 'l2', 'elasticnet', 'none'),
    'logistic__solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'logistic__max_iter':[50,100,500]
}
model = GridSearchCV ( estimator= pipline,
                      param_grid = param_grid,
                      scoring = 'roc_auc',
                      n_jobs = -1,
                      pre_dispatch = '2*n_jobs',
                      cv = 5,
                      verbose = 1,
                      return_train_score = False)
model.fit(X_train,y_train)
pkl_filename = './models/logistic_regression_practise.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
X_val_np = X_val.to_numpy()
predicted = model.predict(X_val_np)
proba = model.predict_proba(X_val_np)
accuracy = accuracy_score(y_val,predicted)
CM = confusion_matrix(y_val, predicted)
(TN,FN,TP,FP) = (CM[0][0],CM[1][0],CM[1][1],CM[0][1])
FPR = FP/(FP + TN)
precision = TP / (TP + FP)
recall = TP / (TP+FN)
F1_Score = 2 * (precision * recall) / (precision + recall)
end = dt.now()
print("Best Params-")
print(model.best_params_)
print("\n")
print("Algorithm: ", 'Logistic Regression')
print("Accuracy: ",accuracy)
print ("Precision:", precision)
print ("Recall", recall)
print ("F1-support:", F1_Score)
print ("FPR", FPR)
print( "Runtime:", (end - start))
print("Best Params-")
print(model.best_params_)

 Starting ------- Logistic Regression
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Params-
{'logistic__max_iter': 50, 'logistic__penalty': 'none', 'logistic__solver': 'newton-cg'}


Algorithm:  Logistic Regression
Accuracy:  0.7792760823278921
Precision: 0.6071428571428571
Recall 0.4191780821917808
F1-support: 0.4959481361426256
FPR 0.09482758620689655
Runtime: 0:00:02.987874
Best Params-
{'logistic__max_iter': 50, 'logistic__penalty': 'none', 'logistic__solver': 'newton-cg'}


 0.8203287  0.82034011 0.81615115 0.80916106        nan        nan
        nan        nan        nan 0.82041843 0.82040845        nan
 0.81624372 0.80917389        nan        nan 0.82035148        nan
 0.81610124 0.82037711 0.82038852 0.82034011 0.81934997 0.8161369
        nan        nan        nan        nan        nan 0.82041843
 0.82041843        nan 0.81933857 0.81618244        nan        nan
 0.82035719        nan 0.82031303 0.82037711 0.82038852 0.82034011
 0.8203344  0.82032732        nan        nan        nan        nan
        nan 0.82041843 0.82041843        nan 0.82040424 0.82031589]


In [20]:
log_reg = LogisticRegression(penalty="l2", solver= 'newton-cg', n_jobs= -1, max_iter= 50,
                             verbose= 1 )

In [21]:
log_reg.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(max_iter=50, n_jobs=-1, solver='newton-cg', verbose=1)

In [22]:
predicted = log_reg.predict(X_val_np)
accuracy = accuracy_score(y_val,predicted)
print('Accuracy:', accuracy)

Accuracy: 0.7792760823278921
