In [1]:
# import library.
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import dataset.
framingham = pd.read_csv('../data/framingham.csv')
framingham.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [3]:
# split features and target.
framingham_demo = framingham.dropna()
X = framingham_demo.iloc[:, :-1].copy()
y = framingham_demo['TenYearCHD'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.35, random_state=1)

In [4]:
# train model.
framingham_log = sm.Logit(y_train, sm.add_constant(X_train)).fit()
print(framingham_log.summary())

Optimization terminated successfully.
         Current function value: 0.387083
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 1280
Model:                          Logit   Df Residuals:                     1264
Method:                           MLE   Df Model:                           15
Date:                Thu, 19 Aug 2021   Pseudo R-squ.:                  0.1278
Time:                        20:32:59   Log-Likelihood:                -495.47
converged:                       True   LL-Null:                       -568.05
Covariance Type:            nonrobust   LLR p-value:                 2.183e-23
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -9.7122      1.209     -8.033      0.000     -12.082      -7.342
male          

In [5]:
# make predictions for test data.
y_pred = framingham_log.predict(sm.add_constant(X_test))
print(framingham_log.pred_table())

print("Accuracy:", (2001+34) / (2001+34+323+19))
print("Baseline:", (2001+19) / (2001+34+323+19))

# auc score.
fpr, tpr, ths = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(auc)

[[1060.   12.]
 [ 187.   21.]]
Accuracy: 0.8561211611274716
Baseline: 0.8498106857383256
0.7266823042954523
