In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

from sqlalchemy import create_engine
import psycopg2
from config import db_password


In [2]:
#create PostgresSQL connection
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/CAD"
engine = create_engine(db_string)
CHD_df = pd.read_sql('SELECT * FROM final_ml', engine)
CHD_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,0,42,4.0,1,15.0,0.0,0,0,0,243.0,126.0,85.0,22.02,78.0,63.0,0
1,0,64,1.0,1,8.0,0.0,0,1,0,317.0,182.5,88.0,20.52,75.0,79.0,0
2,0,46,1.0,1,20.0,0.0,0,0,0,245.0,97.0,65.0,23.8,60.0,73.0,0
3,0,60,2.0,1,2.0,0.0,0,1,0,460.0,131.0,96.0,25.21,75.0,80.0,0
4,1,51,1.0,0,0.0,0.0,0,1,0,220.0,151.0,87.5,22.01,80.0,86.0,0


In [3]:
# Drop features with lower importance 

CHD_df.drop(['cigsPerDay', 'BPMeds', 'prevalentStroke'], axis= 1, inplace=True)
CHD_df

Unnamed: 0,male,age,education,currentSmoker,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,0,42,4,1,0,0,243,126.0,85.0,22.02,78,63,0
1,0,64,1,1,1,0,317,182.5,88.0,20.52,75,79,0
2,0,46,1,1,0,0,245,97.0,65.0,23.80,60,73,0
3,0,60,2,1,1,0,460,131.0,96.0,25.21,75,80,0
4,1,51,1,0,1,0,220,151.0,87.5,22.01,80,86,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11186,0,55,1,0,0,0,232,119.0,81.0,30.00,60,100,0
11187,0,78,1,0,1,1,241,151.0,80.0,28.17,60,84,0
11188,1,56,3,0,0,0,280,123.0,75.0,27.82,68,112,0
11189,0,45,3,0,1,0,226,132.0,83.0,23.19,65,79,0


In [4]:
# Split data into features and outcome
y = CHD_df["CHD"]
X = CHD_df.drop(columns="CHD")

In [5]:
y

0        0
1        0
2        0
3        0
4        0
        ..
11186    0
11187    0
11188    0
11189    0
11190    1
Name: CHD, Length: 11191, dtype: int64

In [6]:
X

Unnamed: 0,male,age,education,currentSmoker,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,0,42,4,1,0,0,243,126.0,85.0,22.02,78,63
1,0,64,1,1,1,0,317,182.5,88.0,20.52,75,79
2,0,46,1,1,0,0,245,97.0,65.0,23.80,60,73
3,0,60,2,1,1,0,460,131.0,96.0,25.21,75,80
4,1,51,1,0,1,0,220,151.0,87.5,22.01,80,86
...,...,...,...,...,...,...,...,...,...,...,...,...
11186,0,55,1,0,0,0,232,119.0,81.0,30.00,60,100
11187,0,78,1,0,1,1,241,151.0,80.0,28.17,60,84
11188,1,56,3,0,0,0,280,123.0,75.0,27.82,68,112
11189,0,45,3,0,1,0,226,132.0,83.0,23.19,65,79


In [7]:
# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [8]:
# Creating a logistic regression model

classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier

LogisticRegression(max_iter=200, random_state=1)

In [9]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

LogisticRegression(C=0.1, multi_class='warn', penalty='12', random_state=1)

In [10]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Train logistic regression model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [12]:
# Validate logistic regression model
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
11073,0,0
8606,0,0
6394,0,0
2153,0,0
6135,0,0
...,...,...
10357,0,0
5987,0,0
1338,0,0
6868,0,0


In [13]:
# Evaluate the model's performance 

print(accuracy_score(y_test, predictions))

0.9002859185132237


In [14]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Print confusion matrix 

#matrix = confusion_matrix(y_test, predictions)
#df_matrix = pd.DataFrame(matrix).transpose()

#df_matrix


cm_df = pd.DataFrame(
    cm, index=["Developed CHD", "Didn't Develop CHD"], columns=["Predicted CHD", "Predicted No CHD"])

cm_df

Unnamed: 0,Predicted CHD,Predicted No CHD
Developed CHD,2518,1
Didn't Develop CHD,278,1


In [15]:
# Classification report as a DataFrame
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2519
           1       0.50      0.00      0.01       279

    accuracy                           0.90      2798
   macro avg       0.70      0.50      0.48      2798
weighted avg       0.86      0.90      0.85      2798



In [16]:
# Get feature importance
importance = classifier.coef_[0]

# Summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.37374
Feature: 1, Score: 0.41506
Feature: 2, Score: -0.08655
Feature: 3, Score: 0.05782
Feature: 4, Score: 0.06550
Feature: 5, Score: 0.11245
Feature: 6, Score: 0.12514
Feature: 7, Score: 0.25785
Feature: 8, Score: -0.07195
Feature: 9, Score: 0.07128
Feature: 10, Score: -0.08541
Feature: 11, Score: 0.06975
