In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read in the data from GitHub

url = 'https://raw.githubusercontent.com/ccorboy/project_one/database/cleaned_framingham_ml.csv'
CHD_df = pd.read_csv(url)
CHD_df.head()


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Drop features with lower importance 

CHD_df.drop(['diabetes', 'heartRate', 'currentSmoker', 'BPMeds', 'male', 'age'], axis= 1, inplace=True)
CHD_df

Unnamed: 0,education,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD
0,4.0,0.0,0,0,195.0,106.0,70.0,26.97,77.0,0
1,2.0,0.0,0,0,250.0,121.0,81.0,28.73,76.0,0
2,1.0,20.0,0,0,245.0,127.5,80.0,25.34,70.0,0
3,3.0,30.0,0,1,225.0,150.0,95.0,28.58,103.0,1
4,3.0,23.0,0,0,285.0,130.0,84.0,23.10,85.0,0
...,...,...,...,...,...,...,...,...,...,...
3653,1.0,1.0,0,1,313.0,179.0,92.0,25.97,86.0,1
3654,3.0,43.0,0,0,207.0,126.5,80.0,19.71,68.0,0
3655,2.0,0.0,0,0,269.0,133.5,83.0,21.47,107.0,0
3656,3.0,0.0,0,1,185.0,141.0,98.0,25.60,72.0,0


In [4]:
# Split data into features and outcome
y = CHD_df["TenYearCHD"]
X = CHD_df.drop(columns="TenYearCHD")

In [5]:
y

0       0
1       0
2       0
3       1
4       0
       ..
3653    1
3654    0
3655    0
3656    0
3657    0
Name: TenYearCHD, Length: 3658, dtype: int64

In [6]:
X

Unnamed: 0,education,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,glucose
0,4.0,0.0,0,0,195.0,106.0,70.0,26.97,77.0
1,2.0,0.0,0,0,250.0,121.0,81.0,28.73,76.0
2,1.0,20.0,0,0,245.0,127.5,80.0,25.34,70.0
3,3.0,30.0,0,1,225.0,150.0,95.0,28.58,103.0
4,3.0,23.0,0,0,285.0,130.0,84.0,23.10,85.0
...,...,...,...,...,...,...,...,...,...
3653,1.0,1.0,0,1,313.0,179.0,92.0,25.97,86.0
3654,3.0,43.0,0,0,207.0,126.5,80.0,19.71,68.0
3655,2.0,0.0,0,0,269.0,133.5,83.0,21.47,107.0
3656,3.0,0.0,0,1,185.0,141.0,98.0,25.60,72.0


In [7]:
# Split data into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [8]:
# Creating a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier

LogisticRegression(max_iter=200, random_state=1)

In [9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [10]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Train logistic regression model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [12]:
# Validate logistic regression model
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
1545,0,0
2063,0,0
2286,0,0
3061,0,1
1456,0,0
...,...,...
3341,0,0
477,0,1
429,0,0
3598,0,0


In [13]:
# Evaluate the model's performance 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.8415300546448088


In [14]:
# Print confusion matrix - NEED TO FIX, TRY AS A DATAFRAME
#matrix = confusion_matrix(y_test, predictions)
#print(matrix)

matrix = confusion_matrix(y_test, predictions)
df_matrix = pd.DataFrame(matrix).transpose()

df_matrix

Unnamed: 0,0,1
0,768,137
1,8,2


In [15]:
# Classification report as a DataFrame

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.99      0.91       776
           1       0.20      0.01      0.03       139

    accuracy                           0.84       915
   macro avg       0.52      0.50      0.47       915
weighted avg       0.75      0.84      0.78       915

