In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sqlalchemy import create_engine
import psycopg2
from config import db_password

In [2]:
#create PostgresSQL connection
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/CAD"
engine = create_engine(db_string)
CHD_df = pd.read_sql('SELECT * FROM final_ml', engine)
CHD_df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,0,42,4.0,1,15.0,0.0,0,0,0,243.0,126.0,85.0,22.02,78.0,63.0,0
1,0,64,1.0,1,8.0,0.0,0,1,0,317.0,182.5,88.0,20.52,75.0,79.0,0
2,0,46,1.0,1,20.0,0.0,0,0,0,245.0,97.0,65.0,23.80,60.0,73.0,0
3,0,60,2.0,1,2.0,0.0,0,1,0,460.0,131.0,96.0,25.21,75.0,80.0,0
4,1,51,1.0,0,0.0,0.0,0,1,0,220.0,151.0,87.5,22.01,80.0,86.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11186,0,55,1.0,0,0.0,0.0,0,0,0,232.0,119.0,81.0,30.00,60.0,100.0,0
11187,0,78,1.0,0,0.0,0.0,0,1,1,241.0,151.0,80.0,28.17,60.0,84.0,0
11188,1,56,3.0,0,0.0,0.0,0,0,0,280.0,123.0,75.0,27.82,68.0,112.0,0
11189,0,45,3.0,0,0.0,1.0,0,1,0,226.0,132.0,83.0,23.19,65.0,79.0,0


In [3]:
# Drop features with lower importance 

CHD_df.drop(['diabetes', 'heartRate', 'currentSmoker', 'BPMeds', 'male', 'age'], axis= 1, inplace=True)
CHD_df

Unnamed: 0,education,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,glucose,CHD
0,4.0,15.0,0,0,243.0,126.0,85.0,22.02,63.0,0
1,1.0,8.0,0,1,317.0,182.5,88.0,20.52,79.0,0
2,1.0,20.0,0,0,245.0,97.0,65.0,23.80,73.0,0
3,2.0,2.0,0,1,460.0,131.0,96.0,25.21,80.0,0
4,1.0,0.0,0,1,220.0,151.0,87.5,22.01,86.0,0
...,...,...,...,...,...,...,...,...,...,...
11186,1.0,0.0,0,0,232.0,119.0,81.0,30.00,100.0,0
11187,1.0,0.0,0,1,241.0,151.0,80.0,28.17,84.0,0
11188,3.0,0.0,0,0,280.0,123.0,75.0,27.82,112.0,0
11189,3.0,0.0,0,1,226.0,132.0,83.0,23.19,79.0,0


In [4]:
# Split data into features and outcome
y = CHD_df["CHD"]
X = CHD_df.drop(columns="CHD")

In [5]:
y

0        0
1        0
2        0
3        0
4        0
        ..
11186    0
11187    0
11188    0
11189    0
11190    1
Name: CHD, Length: 11191, dtype: int64

In [6]:
X

Unnamed: 0,education,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,glucose
0,4.0,15.0,0,0,243.0,126.0,85.0,22.02,63.0
1,1.0,8.0,0,1,317.0,182.5,88.0,20.52,79.0
2,1.0,20.0,0,0,245.0,97.0,65.0,23.80,73.0
3,2.0,2.0,0,1,460.0,131.0,96.0,25.21,80.0
4,1.0,0.0,0,1,220.0,151.0,87.5,22.01,86.0
...,...,...,...,...,...,...,...,...,...
11186,1.0,0.0,0,0,232.0,119.0,81.0,30.00,100.0
11187,1.0,0.0,0,1,241.0,151.0,80.0,28.17,84.0
11188,3.0,0.0,0,0,280.0,123.0,75.0,27.82,112.0
11189,3.0,0.0,0,1,226.0,132.0,83.0,23.19,79.0


In [7]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [8]:
# Creating a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier

LogisticRegression(max_iter=200, random_state=1)

In [9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [10]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Train logistic regression model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [12]:
# Validate logistic regression model
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
11073,0,0
8606,0,0
6394,0,0
2153,0,0
6135,0,0
...,...,...
10357,0,0
5987,0,0
1338,0,0
6868,0,0


In [13]:
# Evaluate the model's performance 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.8995711222301644


In [14]:
# Print confusion matrix - NEED TO FIX, TRY AS A DATAFRAME
#matrix = confusion_matrix(y_test, predictions)
#print(matrix)

matrix = confusion_matrix(y_test, predictions)
df_matrix = pd.DataFrame(matrix).transpose()

df_matrix

Unnamed: 0,0,1
0,2517,279
1,2,0


In [15]:
# Classification report as a DataFrame

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2519
           1       0.00      0.00      0.00       279

    accuracy                           0.90      2798
   macro avg       0.45      0.50      0.47      2798
weighted avg       0.81      0.90      0.85      2798

