In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np 
from collections import Counter
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read in the data from GitHub

url = 'https://raw.githubusercontent.com/ccorboy/project_one/database/cleaned_framingham_ml.csv'
CHD_df = pd.read_csv(url)
CHD_df.head()


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# DataFrame statistics 
CHD_df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,0.489065,9.025424,0.030344,0.005741,0.311646,0.027064,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,0.499949,11.92159,0.171557,0.075561,0.463229,0.162292,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


In [4]:
# Checking column data types
CHD_df.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [5]:
# Checking for null values

CHD_df.isna().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [6]:
model = LogisticRegression()

In [7]:
# Split data into features and outcome
y = CHD_df["TenYearCHD"]
X = CHD_df.drop(columns="TenYearCHD")

In [8]:
# Split data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=15, stratify=y)
Counter(y_train)

Counter({0: 2325, 1: 418})

In [9]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2743, 15)
(915, 15)
(2743,)
(915,)


In [10]:
# Creating a logistic regression model

classifier = LogisticRegression(solver='lbfgs',
   max_iter=100,
   random_state=1)
classifier

LogisticRegression(random_state=1)

In [11]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

LogisticRegression(C=0.1, multi_class='warn', penalty='12', random_state=1)

In [12]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [13]:
# Train logistic regression model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [14]:
# Validate logistic regression model
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
2464,0,0
2421,0,0
3010,0,0
2987,0,0
2408,0,0
...,...,...
3148,0,0
3561,0,0
3155,0,0
2940,0,0


In [15]:
# Evaluate the model's performance 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.8491803278688524


In [16]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,766,10
Actual 1,128,11


In [17]:
# Classification report as a DataFrame

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       776
           1       0.52      0.08      0.14       139

    accuracy                           0.85       915
   macro avg       0.69      0.53      0.53       915
weighted avg       0.81      0.85      0.80       915



In [18]:
# Get feature importance
importance = classifier.coef_[0]

# Summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))


Feature: 0, Score: 0.37958
Feature: 1, Score: 0.55861
Feature: 2, Score: -0.03351
Feature: 3, Score: -0.05738
Feature: 4, Score: 0.19378
Feature: 5, Score: 0.02228
Feature: 6, Score: 0.04043
Feature: 7, Score: 0.15307
Feature: 8, Score: 0.02459
Feature: 9, Score: 0.18307
Feature: 10, Score: 0.38245
Feature: 11, Score: -0.14484
Feature: 12, Score: -0.02506
Feature: 13, Score: -0.06609
Feature: 14, Score: 0.16721


In [19]:
# Features with lowest coef are: Feature 8 (diabetes), Feature 13 (heartRate)