## Heart Disease Prediction Model

This model can be used to predict whether a patient has heart disease.

## Features
* age
* sex
* chest pain type (4 values)
* resting blood pressure
* serum cholesterol in mg/dl
* fasting blood sugar > 120 mg/dl
* resting electrocardiographic results (values 0,1,2)
* maximum heart rate achieved
* exercise induced angina
* oldpeak = ST depression induced by exercise relative to rest
* the slope of the peak exercise ST segment
* number of major vessels (0-3) colored by fluoroscopy
* thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [114]:
import pandas as pd

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection

from matplotlib import pyplot as plt
import altair as alt

In [115]:
def getData(path="data/heart.csv"):
    """Load the heart-disease dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "data/heart.csv"
        Location of the CSV file. The default is the relative path that was
        previously hard-coded, so calling ``getData()`` with no argument
        behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [116]:
# Load the dataset once; the following cells all work on this frame.
df = getData()

# "target" is the label column; every other column is used as a model input.
className = "target"
featureNames = df.columns.tolist()
featureNames.remove(className)

In [117]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [118]:
# Columns that are standardized in place; all other columns are left untouched.
normFeatures = ["age","trestbps","chol","thalach","oldpeak"]
def norm(x):
    """Standardize ``x`` to zero mean and unit spread (z-score).

    Parameters
    ----------
    x : array-like exposing ``.mean()`` and ``.std()``
        The notebook passes a NumPy array (``df[col].values``), so the
        spread is whatever ``x.std()`` computes for that type.

    Returns
    -------
    Same type as ``x - x.mean()``
        ``(x - x.mean()) / x.std()``. If the spread is a scalar zero (a
        constant column), the centered values (all zeros) are returned
        instead of dividing by zero and producing NaN/inf.
    """
    centered = x - x.mean()
    spread = x.std()
    # Only guard the scalar case; a per-column spread (e.g. from a DataFrame)
    # has ndim >= 1 and keeps the original element-wise division.
    if getattr(spread, "ndim", 0) == 0 and spread == 0:
        return centered
    return centered / spread
# Overwrite each listed column of df with its standardized values.
for column in normFeatures:
    standardized = norm(df[column].values)
    df[column] = standardized


In [119]:
# df was standardized in place by the previous cell, so this table no longer
# shows raw values; the label says what is actually displayed.
print("Normalized data table")
df

Raw data table


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,1,3,0.763956,-0.256334,1,0,0.015443,0,1.087338,0,0,1,1
1,-1.915313,1,2,-0.092738,0.072199,0,1,1.633471,0,2.122573,0,0,2,1
2,-1.474158,0,1,-0.092738,-0.816773,0,0,0.977514,0,0.310912,2,0,2,1
3,0.180175,1,1,-0.663867,-0.198357,0,1,1.239897,0,-0.206705,2,0,2,1
4,0.290464,0,0,-0.663867,2.082050,0,1,0.583939,1,-0.379244,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.290464,0,0,0.478391,-0.101730,0,1,-1.165281,1,-0.724323,1,0,3,0
299,-1.033002,1,3,-1.234996,0.342756,0,1,-0.771706,0,0.138373,1,0,3,0
300,1.503641,1,0,0.706843,-1.029353,1,1,-0.378132,0,2.036303,1,2,3,0
301,0.290464,1,0,-0.092738,-2.227533,0,1,-1.515125,1,0.138373,1,1,3,0


In [120]:
# Per-class mean of every column, reshaped to long form
# (one row per target/feature pair) for plotting.
classMeans = df.groupby("target").mean()
groupMean = classMeans.stack().reset_index()
groupMean.columns = ["target", "feature", "mean"]

# Correlation of each feature column with the label, rounded to two decimals.
# The column name "corrleation" is referenced by the chart cell, so its
# spelling is kept as is.
featureCorr = df[featureNames].corrwith(df["target"])
table = featureCorr.reset_index()
table.columns = ["feature", "corrleation"]
table = table.reset_index().round(2)

In [121]:
# Bar chart of the per-class feature means; bars are stacked and normalized
# so each feature row shows the share contributed by each target class.
meanChart = (
    alt.Chart(groupMean)
    .mark_bar()
    .encode(
        y=alt.Y("feature:N", sort=None),
        x=alt.X("mean:Q", stack="normalize"),
        color=alt.Color("target:N"),
    )
    .properties(title="Mean Comparison between Targets")
)

In [122]:
# Bar chart of each feature's correlation with the label. The data column is
# named "corrleation" (sic) by the cell that builds `table`, so the field name
# is kept; only the text shown to the reader (axis, tooltip, chart title) is
# spelled correctly.
corrChart = alt.Chart(table).mark_bar().encode(
    alt.Y("feature:N", sort=None),
    alt.X("corrleation:Q", title="correlation"),
    tooltip=alt.Tooltip("corrleation:N", title="correlation"),
    # Positive correlations are highlighted; the rest are greyed out.
    color=alt.condition(alt.datum.corrleation > 0, alt.value('skyblue'), alt.value('lightgray'))
).properties(title="Correlation with Target")
# Show both charts side by side, each with its own colour legend.
alt.concat(meanChart, corrChart).resolve_legend(color="independent")

## Train model

In [123]:

# Feature matrix and label vector as plain NumPy arrays.
Xdata = df[featureNames]
ydata = df[className]
X = Xdata.values
y = ydata.values.flatten()

# Hold out 20% of the rows for testing. random_state is fixed so the split,
# and therefore every metric computed from it, is the same on each
# Restart & Run All instead of changing from run to run.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [124]:
# Classifier under evaluation. Alternatives that were tried are kept below
# for reference:
#   model = tree.DecisionTreeClassifier(max_leaf_nodes=10)
#   model = GaussianNB()
model = LogisticRegression(max_iter=3000)

# Fit on the training split; the fitted estimator is shown as the cell output.
model.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=3000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [125]:
# Predict the held-out rows and tabulate actual vs. predicted labels.
ypredict = model.predict(Xtest)
crossTabel = pd.crosstab(
    index=ytest,
    columns=ypredict,
    rownames=["Actual"],
    colnames=["predict"],
)
crossTabel

predict,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,6
1,3,31


In [126]:
# 5-fold cross-validation of the model on the full data set, scored by
# ROC AUC. `model_selection` is already imported in the notebook's import
# cell, so it is not re-imported here.
scores = model_selection.cross_validate(model, X, y, cv=5, scoring='roc_auc')
scoreMean, scoreStd = scores["test_score"].mean(), scores["test_score"].std()
# Mean and standard deviation of the per-fold test scores.
print(f"Score:{scoreMean:.2f} +-{scoreStd:.2f}")

Score:0.90 +-0.04


In [127]:
# plt.figure(figsize=(15,15))
# annotations = tree.plot_tree(model, feature_names=featureNames, class_names=["health", "weak"], filled=True, rounded=True)