In [52]:
import numpy as np
from numpy import mean,std
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,train_test_split,cross_val_score,cross_validate

In [53]:
X= pd.read_csv('../data/processed/X_train', header=0, index_col=0)
X_final_test = pd.read_csv('../data/processed/X_final_test', header=0, index_col=0)
y = pd.read_csv('../data/processed/y_train', header=0, index_col=0)
y_final_test = pd.read_csv('../data/processed/y_final_test', header=0, index_col=0)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Feature Scaling

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train)
print(X_test)

[[ 0.37296609 -1.48438197  0.84757578 ...  0.66040066 -0.74523022
  -0.94497498]
 [ 0.04357081 -1.48438197 -1.20616553 ... -0.93980094 -0.74523022
  -0.94497498]
 [-0.06622762  0.67368105 -1.20616553 ... -0.93980094 -0.74523022
   1.12575281]
 ...
 [-0.94461505 -1.48438197 -0.17929488 ...  2.26060226 -0.74523022
  -0.94497498]
 [-1.4936072  -1.48438197 -0.17929488 ... -0.93980094 -0.74523022
  -0.94497498]
 [ 0.59256295  0.67368105  0.84757578 ...  0.66040066  0.36381318
   1.12575281]]
[[-0.39562291  0.67368105 -2.23303618 -0.43213029 -0.75419858 -0.45993311
   1.02960061 -1.05950594  1.27475488  0.26715397 -0.93980094  0.36381318
  -0.94497498]
 [ 0.59256295  0.67368105  0.84757578  0.62580526  0.67787825 -0.45993311
   1.02960061 -0.31801333  1.27475488  1.422202    0.66040066  1.47285659
   1.12575281]
 [ 0.59256295 -1.48438197  0.84757578  0.89028914  0.17976457 -0.45993311
   1.02960061  0.33624486 -0.78446454  1.25719513  0.66040066  1.47285659
   1.12575281]
 [-0.39562291 -1.48

# LogisticRegression Model Building

In [57]:
# Training the Logistic Regression model on the Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred)*100,'%')
print(classification_report(y_test,y_pred))

[[24  4]
 [ 7 13]]
Accuracy: 77.08333333333334 %
              precision    recall  f1-score   support

           0       0.77      0.86      0.81        28
           1       0.76      0.65      0.70        20

    accuracy                           0.77        48
   macro avg       0.77      0.75      0.76        48
weighted avg       0.77      0.77      0.77        48



  y = column_or_1d(y, warn=True)


# Cross Fold Validation

In [58]:
cv = KFold(n_splits=10, random_state=42, shuffle=True)
model = LogisticRegression()
print(cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1).mean())
#print(cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1).mean())
#print("Test set:", cross_val_score(y_test, model.predict(X_test),scoring='accuracy'))

0.8257309941520468


In [59]:
#Need to be clear
#
import sklearn
model = sklearn.base.clone(model)
model.fit(X_train, y_train)
print("Test set RMSE:", mean_squared_error(y_test, model.predict(X_test), squared=False))
print("Mean validation RMSE:", -scores["test_score"].mean())

Test set RMSE: 0.47871355387816905
Mean validation RMSE: 0.4343729928243839


  y = column_or_1d(y, warn=True)


# Hyperparameter tuning for logistic regression model
Here, we are using grid searching for parameter tuning. The following hyperparameter are chosen for tuning:
solver[‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’]
There are five solver used in sklearn. Each solver uses to find the parameter weights that minimize the cost function.
penalty[‘none’, ‘l1’, ‘l2’, ‘elasticnet’]
Regularization addresses the problem of over-fitting by penalizing the parameters. We set the four penalties.
C [100, 10, 1.0, 0.1, 0.01]
The C parameter controls the penalty strength, which can also be effective.
max_iteration [20, 50, 100, 200, 500, 1000]
We set some random values in max_iteration to find out which iteration will be best for the solver to converge.

In [60]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [61]:
# define models and parameters
model = LogisticRegression()
solvers = ['lbfgs','newton-cg','liblinear','sag','saga']
penalty = ['l1', 'l2', 'elasticnet', 'none']
c_values = [100, 10, 1.0, 0.1, 0.01]
max_iteration= [20, 50, 100, 200, 500, 1000]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values,max_iter=max_iteration)
cv = KFold(n_splits=10, random_state=42, shuffle=True)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.847076 using {'C': 0.1, 'max_iter': 20, 'penalty': 'l1', 'solver': 'saga'}
0.000000 (0.000000) with: {'C': 100, 'max_iter': 20, 'penalty': 'l1', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 100, 'max_iter': 20, 'penalty': 'l1', 'solver': 'newton-cg'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'max_iter': 20, 'penalty': 'l1', 'solver': 'sag'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l1', 'solver': 'saga'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l2', 'solver': 'lbfgs'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l2', 'solver': 'newton-cg'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l2', 'solver': 'liblinear'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l2', 'solver': 'sag'}
0.825731 (0.099615) with: {'C': 100, 'max_iter': 20, 'penalty': 'l2', 'solver': 'saga'}
0.000000 (0.00000

2700 fits failed out of a total of 6000.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\PycharmProjects\Heart-Disease-Machine-Learning-Exploration\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Asus\PycharmProjects\Heart-Disease-Machine-Learning-Exploration\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Asus\PycharmProjects\Heart-Disease-Machine-Learning-Exploration\venv\lib\site-packages\sklearn\linear_model\_logistic.py

# Result Summary:

First, we find 77.08% accuracy of our logistic regression model for our dataset using training_test_split method.
Secondly, we apply 10-fold cross validation for our model and find accuracy of 82.57%.
Finally, we use hyperparameter tuning method for our model and find accuracy of 84.71% which is an improvement than previous results.