In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sqlalchemy import create_engine
import psycopg2
from config import db_password

In [2]:
#create PostgresSQL connection
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/CAD"
engine = create_engine(db_string)
CHD_df = pd.read_sql('SELECT * FROM final_ml', engine)
CHD_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHD
0,0,42,4.0,1,15.0,0.0,0,0,0,243.0,126.0,85.0,22.02,78.0,63.0,0
1,0,64,1.0,1,8.0,0.0,0,1,0,317.0,182.5,88.0,20.52,75.0,79.0,0
2,0,46,1.0,1,20.0,0.0,0,0,0,245.0,97.0,65.0,23.8,60.0,73.0,0
3,0,60,2.0,1,2.0,0.0,0,1,0,460.0,131.0,96.0,25.21,75.0,80.0,0
4,1,51,1.0,0,0.0,0.0,0,1,0,220.0,151.0,87.5,22.01,80.0,86.0,0


In [3]:
# Checking column data types
CHD_df.dtypes

male                 int64
age                  int64
education            int64
currentSmoker        int64
cigsPerDay           int64
BPMeds               int64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol              int64
sysBP              float64
diaBP              float64
BMI                float64
heartRate            int64
glucose              int64
CHD                  int64
dtype: object

In [4]:
CHD_df.dtypes

male                 int64
age                  int64
education            int64
currentSmoker        int64
cigsPerDay           int64
BPMeds               int64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol              int64
sysBP              float64
diaBP              float64
BMI                float64
heartRate            int64
glucose              int64
CHD                  int64
dtype: object

In [5]:
# Split data into features and outcome
y = CHD_df["CHD"]
X = CHD_df.drop(columns="CHD")

In [6]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)


In [7]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [8]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=5) 

In [10]:
# Fitting the model Random Forrest
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [12]:
# Accuracy score 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.860614724803431


In [13]:
# Making predictions using the testing data (random forest).
predictions = rf_model.predict(X_test_scaled)

In [14]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.01838164, 0.11196585, 0.03661135, 0.01366465, 0.0434605 ,
       0.01133826, 0.00577341, 0.01496135, 0.0138665 , 0.12869539,
       0.13040559, 0.11570277, 0.13695955, 0.10245052, 0.11576268])

In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2408,120
Actual 1,270,0


In [16]:
# Classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.95      0.93      2528
           1       0.00      0.00      0.00       270

    accuracy                           0.86      2798
   macro avg       0.45      0.48      0.46      2798
weighted avg       0.81      0.86      0.84      2798

