In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np 
from collections import Counter
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

from sqlalchemy import create_engine
import psycopg2
from config import db_password

In [2]:
#create PostgresSQL connection
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/CAD"
engine = create_engine(db_string)
CHD_df = pd.read_sql('SELECT * FROM final_ml', engine)
CHD_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,0,42,4.0,1,15.0,0.0,0,0,0,243.0,126.0,85.0,22.02,78.0,63.0,0
1,0,64,1.0,1,8.0,0.0,0,1,0,317.0,182.5,88.0,20.52,75.0,79.0,0
2,0,46,1.0,1,20.0,0.0,0,0,0,245.0,97.0,65.0,23.8,60.0,73.0,0
3,0,60,2.0,1,2.0,0.0,0,1,0,460.0,131.0,96.0,25.21,75.0,80.0,0
4,1,51,1.0,0,0.0,0.0,0,1,0,220.0,151.0,87.5,22.01,80.0,86.0,0


In [3]:
# DataFrame statistics 
CHD_df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
count,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0,11191.0
mean,0.43812,54.443035,1.989635,0.438299,8.502457,0.0848,0.012242,0.455991,0.045215,241.602538,136.397194,83.101019,25.882649,76.802877,84.104012,0.099723
std,0.496178,9.419929,1.02971,0.496201,12.31136,0.278597,0.109969,0.498082,0.207784,44.562488,22.982082,11.843891,4.107707,12.36393,24.751048,0.299644
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,83.5,30.0,14.43,42.0,39.0,0.0
25%,0.0,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,212.0,120.0,75.0,23.1,69.0,73.0,0.0
50%,0.0,54.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,132.0,82.0,25.48,75.0,81.0,0.0
75%,1.0,61.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,267.0,149.0,90.0,28.08,85.0,87.0,0.0
max,1.0,81.0,4.0,1.0,90.0,1.0,1.0,1.0,1.0,638.0,295.0,150.0,56.8,150.0,478.0,1.0


In [4]:
# Checking column data types
CHD_df.dtypes

male                 int64
age                  int64
education            int64
currentSmoker        int64
cigsPerDay           int64
BPMeds               int64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol              int64
sysBP              float64
diaBP              float64
BMI                float64
heartRate            int64
glucose              int64
CHD                  int64
dtype: object

In [5]:
# Checking for null values
CHD_df.isna().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
CHD                0
dtype: int64

In [6]:
model = LogisticRegression()

In [7]:
# Split data into features and outcome
y = CHD_df["CHD"]
X = CHD_df.drop(columns="CHD")

In [8]:
# Split data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=15, stratify=y)
Counter(y_train)

Counter({1: 837, 0: 7556})

In [9]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8393, 15)
(2798, 15)
(8393,)
(2798,)


In [10]:
# Creating a logistic regression model

classifier = LogisticRegression(solver='lbfgs',
   max_iter=100,
   random_state=1)
classifier

LogisticRegression(random_state=1)

In [11]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)



LogisticRegression(C=0.1, multi_class='warn', penalty='12', random_state=1)

In [12]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [13]:
# Train logistic regression model
classifier.fit(X_train_scaled, y_train)

#model = LogisticRegression(solver='lbfgs', random_state=1)
#model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Validate logistic regression model
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
5717,0,0
4671,0,0
1546,0,0
3036,0,0
488,0,0
...,...,...
1745,0,0
130,0,1
3756,0,0
10745,0,0


In [15]:
# Evaluate the model's performance 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))


0.899928520371694


In [16]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual No CHD", "Actual CHD"], columns=["Predicted No CHD", "Predicted CHD"])

cm_df

Unnamed: 0,Predicted No CHD,Predicted CHD
Actual No CHD,2518,1
Actual CHD,279,0


In [17]:
# Classification report as a DataFrame

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2519
           1       0.00      0.00      0.00       279

    accuracy                           0.90      2798
   macro avg       0.45      0.50      0.47      2798
weighted avg       0.81      0.90      0.85      2798



In [18]:
# Get feature importance
importance = classifier.coef_[0]

# Summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))


Feature: 0, Score: 0.32491
Feature: 1, Score: 0.41045
Feature: 2, Score: -0.05359
Feature: 3, Score: 0.08123
Feature: 4, Score: -0.00244
Feature: 5, Score: 0.02957
Feature: 6, Score: 0.02892
Feature: 7, Score: 0.07593
Feature: 8, Score: 0.09945
Feature: 9, Score: 0.13915
Feature: 10, Score: 0.23783
Feature: 11, Score: -0.03381
Feature: 12, Score: 0.06343
Feature: 13, Score: -0.06625
Feature: 14, Score: 0.05962


In [19]:
# Features with lowest coef are: Feature 8 (diabetes), Feature 13 (heartRate)