In [28]:
# Import python libraries

from math import exp
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm

from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve

In [29]:
#Read data into dataframe and display info
#This dataset can be found at https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
df = pd.read_csv("heart_disease.csv") 
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [30]:
#Check for null values
for c in df.columns:
    print(c, df[c].isnull().sum()) 

HeartDisease 0
BMI 0
Smoking 0
AlcoholDrinking 0
Stroke 0
PhysicalHealth 0
MentalHealth 0
DiffWalking 0
Sex 0
AgeCategory 0
Race 0
Diabetic 0
PhysicalActivity 0
GenHealth 0
SleepTime 0
Asthma 0
KidneyDisease 0
SkinCancer 0


In [31]:
#Create dummy variables for categorical features
df1=df

for i in df1.columns:
    if df1[i].dtype=='object':
        df1=pd.get_dummies(df1,columns=[i],drop_first=bool)

df1.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,0,1,0,0,0,0,...,1,0,1,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,0,1,0,0,0,1,...,1,0,1,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [32]:
#Splits independent variables and target variable from training data
x=df1.drop(columns=['HeartDisease_Yes'])
y=df1['HeartDisease_Yes']

In [33]:
#Test size is .25
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.25,random_state=1)

In [34]:
#Create sklearn logistic regression model from train data
model=LogisticRegression().fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
#Print the coeficients and accuracy of logistic regression model
print('Intercept: ',model.intercept_)
print('Coeficients: ',model.coef_)
print('Score: ',model.score(x_test,y_test))

Intercept:  [-3.69331939]
Coeficients:  [[ 0.00974955  0.00646698  0.0040848  -0.03543926  0.31267608 -0.38117662
   1.00836707  0.21624568  0.65496768 -1.41007911 -1.36492128 -1.44752341
  -1.09739272 -0.73585758 -0.13392214  0.27967341  0.58528571  0.79782161
   1.08430229  1.27846503  1.56145677 -0.79507727 -0.91080154 -0.80517075
  -0.43906354 -0.51466835  0.27811701  0.52297891 -0.04273648 -0.05522674
   1.14380019  0.69270232  1.42232656  0.10212392  0.26087495  0.56621742
   0.04458075]]
Score:  0.9166468623747639


In [36]:
#Create confusion matrix that compares actual values (rows) against prediction values (columns)
y_predict=model.predict(x_test)
pd.DataFrame(confusion_matrix(y_test,y_predict,labels=[0,1]))

Unnamed: 0,0,1
0,72570,641
1,6023,715


In [37]:
#Display logistic regression model metrics
print('recall score: ',recall_score(y_test,y_predict))
print('accuracy score: ',accuracy_score(y_test,y_predict))
print('f1 score: ',f1_score(y_test,y_predict))
print('precision score: ',precision_score(y_test,y_predict))

recall score:  0.10611457405758386
accuracy score:  0.9166468623747639
f1 score:  0.17667407956510997
precision score:  0.5272861356932154


In [46]:
#Add column of ones to x data and create statsmodels logistic regression model
xi=sm.add_constant(x_train,has_constant='add')
logmodel=sm.Logit(y_train,xi).fit()
print(logmodel.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.
         Current function value: inf
         Iterations 9




                           Logit Regression Results                           
Dep. Variable:       HeartDisease_Yes   No. Observations:               239846
Model:                          Logit   Df Residuals:                   239808
Method:                           MLE   Df Model:                           37
Date:                Tue, 19 Apr 2022   Pseudo R-squ.:                     inf
Time:                        14:55:04   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const                               -6.3017      0.134    -47.079      0.000      -6.564      -6.039
BMI                                  0.0087      0.001      6.595



In [47]:
#Remove features with p values > .05
xtr=x_train.drop(columns=['AgeCategory_25-29','Race_Other','Race_White','Diabetic_Yes (during pregnancy)','PhysicalActivity_Yes'])
xte=x_test.drop(columns=['AgeCategory_25-29','Race_Other','Race_White','Diabetic_Yes (during pregnancy)','PhysicalActivity_Yes'])

In [49]:
#Create new sklearn logistic regression model and display confusion matrix
model2=LogisticRegression().fit(xtr,y_train)
ypre=model2.predict(xte)
pd.DataFrame(confusion_matrix(y_test,ypre,labels=[0,1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,0,1
0,72587,624
1,6007,731


In [50]:
#Print new logistic regression metrics
print('recall score: ',recall_score(yte,ypre))
print('accuracy score: ',accuracy_score(yte,ypre))
print('f1 score: ',f1_score(yte,ypre))
print('precision score: ',precision_score(yte,ypre))

recall score:  0.10848916592460671
accuracy score:  0.9170596255112634
f1 score:  0.18064994439639195
precision score:  0.5394833948339484


In [51]:
#Print new statmodels logistic regression summary 
xi=sm.add_constant(xtr,has_constant='add')
logmodel2=sm.Logit(y_train,xi).fit()
print(logmodel2.summary())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.
         Current function value: inf
         Iterations 9




                           Logit Regression Results                           
Dep. Variable:       HeartDisease_Yes   No. Observations:               239846
Model:                          Logit   Df Residuals:                   239813
Method:                           MLE   Df Model:                           32
Date:                Tue, 19 Apr 2022   Pseudo R-squ.:                     inf
Time:                        14:57:55   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const                               -6.2579      0.093    -67.524      0.000      -6.440      -6.076
BMI                                  0.0086      0.001      6.577



In [66]:
#The new model shows a slight improvement in metrics
print('Recall Score:   ',round(recall_score(y_test,y_predict),4),' New Recall Score:   ',round(recall_score(yte,ypre),4),)
print('Accuracy Score: ',round(accuracy_score(y_test,y_predict),4),' New Accuracy Score: ',round(accuracy_score(yte,ypre),4))
print('F1 Score:       ',round(f1_score(y_test,y_predict),4),' New F1 Score:       ',round(f1_score(yte,ypre),4))
print('Precision Score:',round(precision_score(y_test,y_predict),4),' New Precision Score:',round(precision_score(yte,ypre),4))

Recall Score:    0.1061  New Recall Score:    0.1085
Accuracy Score:  0.9166  New Accuracy Score:  0.9171
F1 Score:        0.1767  New F1 Score:        0.1806
Precision Score: 0.5273  New Precision Score: 0.5395
