In [28]:
import pandas as pd

In [29]:
# Load the heart-disease dataset from the project-relative CSV and preview
# the first rows to sanity-check the columns.
df = pd.read_csv('data/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [30]:
# Summarise column dtypes and non-null counts for the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [31]:
# Pairwise Pearson correlations between the numeric columns only.
# The frame also holds string columns (Sex, ChestPainType, ...); since
# pandas 2.0 `DataFrame.corr()` no longer drops them silently and raises
# instead, so restrict to numeric dtypes explicitly. This gives the same
# table on older pandas versions too.
df.select_dtypes(include='number').corr()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
Age,1.0,0.254399,-0.095282,0.198039,-0.382045,0.258612,0.282039
RestingBP,0.254399,1.0,0.100893,0.070193,-0.112135,0.164803,0.107589
Cholesterol,-0.095282,0.100893,1.0,-0.260974,0.235792,0.050148,-0.232741
FastingBS,0.198039,0.070193,-0.260974,1.0,-0.131438,0.052698,0.267291
MaxHR,-0.382045,-0.112135,0.235792,-0.131438,1.0,-0.160691,-0.400421
Oldpeak,0.258612,0.164803,0.050148,0.052698,-0.160691,1.0,0.403951
HeartDisease,0.282039,0.107589,-0.232741,0.267291,-0.400421,0.403951,1.0


In [32]:
# Separate the target column from the predictors.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [33]:
from sklearn.model_selection import train_test_split


In [34]:

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [35]:
# Check dtypes and non-null counts of the training predictors (the target
# column was removed when X was built).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 688 entries, 155 to 102
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             688 non-null    int64  
 1   Sex             688 non-null    object 
 2   ChestPainType   688 non-null    object 
 3   RestingBP       688 non-null    int64  
 4   Cholesterol     688 non-null    int64  
 5   FastingBS       688 non-null    int64  
 6   RestingECG      688 non-null    object 
 7   MaxHR           688 non-null    int64  
 8   ExerciseAngina  688 non-null    object 
 9   Oldpeak         688 non-null    float64
 10  ST_Slope        688 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 64.5+ KB


In [36]:
# Preview the first training rows; the non-sequential index reflects the
# shuffled split.
X_train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
155,56,M,ASY,155,342,1,Normal,150,Y,3.0,Flat
362,56,M,NAP,155,0,0,ST,99,N,0.0,Flat
869,59,M,NAP,150,212,1,Normal,157,N,1.6,Up
101,51,M,ASY,130,179,0,Normal,100,N,0.0,Up
199,57,F,TA,130,308,0,Normal,98,N,1.0,Flat


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [38]:
# Categorical (string) predictors that need one-hot encoding.
ohe_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# One-hot encode the categorical columns (dropping the first level of each to
# avoid redundant dummies) and pass every other column through unchanged.
# `sparse_threshold=0` makes the transformer always return a dense array. It
# replaces `OneHotEncoder(sparse=False)`, whose `sparse` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4, and works on old and new releases.
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(drop='first'), ohe_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit on the categorical columns only; the result holds just the dummy columns.
transformed_array = ct.fit_transform(X_train[ohe_columns])

In [39]:
# Flatten the fitted encoder's per-feature category labels into one list,
# in the same feature order as `ohe_columns`.
transformed_cols = [
    label
    for feature_categories in ct.transformers_[0][1].categories_
    for label in feature_categories
]

In [40]:
# Column labels for the encoded array: every category except the first of
# each feature, because the encoder was built with drop='first'.
# The labels come from the fitted encoder rather than a hand-maintained
# exclusion list, so they stay aligned with the array if the data gains or
# loses a category, and a kept label can never be removed just because it
# happens to match another feature's dropped label.
clean_cols = [
    label
    for feature_categories in ct.transformers_[0][1].categories_
    for label in feature_categories[1:]
]

In [41]:
# Wrap the encoded array in a frame that shares the training index so it can
# be joined back onto the original rows.
ohe_df = pd.DataFrame(
    data=transformed_array,
    columns=clean_cols,
    index=X_train.index,
)

In [42]:
# Place the dummy columns side by side with the original training columns
# (rows are aligned on the shared index).
final_X_train = pd.concat(objs=[X_train, ohe_df], axis='columns')

In [43]:
# Remove the raw categorical columns now that their dummy columns exist.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
final_X_train = final_X_train.drop(columns=ohe_columns, errors='ignore')

In [44]:
# Share of positive (HeartDisease == 1) cases in the training labels.
y_train.value_counts().loc[1] / len(y_train)

0.5465116279069767

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
# Logistic regression is computationally cheap and quick to fit, which makes
# it a sensible baseline. The correlation matrix above shows no strong
# multicollinearity between the numeric predictors (largest |r| is about
# 0.38), which also suits a linear model.
# NOTE(review): the features are fitted unscaled. An L1 penalty is sensitive
# to feature scale, so standardising the numeric columns is worth evaluating.
#   - solver='liblinear': suited to small datasets such as this one.
#   - penalty='l1': supported by the liblinear solver.
#   - verbose=1: makes liblinear print its "[LibLinear]" marker while fitting.
#   - random_state=42: liblinear shuffles the data, so seed it (same seed as
#     the train/test split) to make the fit reproducible.
log_reg = LogisticRegression(penalty='l1', solver='liblinear', verbose=1, random_state=42)
log_reg.fit(final_X_train, y_train)

[LibLinear]

LogisticRegression(penalty='l1', solver='liblinear', verbose=1)

In [47]:
from sklearn.model_selection import cross_val_score

In [48]:
# Scored with accuracy, which is also the classifier's default scorer:
# either a false positive or a false negative would have detrimental
# consequences here, so every misclassification is counted the same.
cv_scores = cross_val_score(log_reg, final_X_train, y_train, scoring='accuracy')
cv_scores.mean()

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

0.8633026552417222

In [49]:
# Chain the column transformer and the logistic regression so that the raw
# frames can be fed in directly, then report accuracy on the held-out split.
log_pipe = Pipeline([('ct', ct), ('lr', log_reg)])
log_pipe.fit(X_train, y_train).score(X_test, y_test)

[LibLinear]

0.8608695652173913

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
# Random forest behind the same preprocessing step.
# The forest is seeded (same seed as the train/test split) because bootstrap
# sampling and feature selection are random; without a seed the test score
# changes from run to run and cannot be compared with the other models.
rfc_pipe = Pipeline(steps=[
    ('ct', ct),
    ('rfc', RandomForestClassifier(n_estimators=100, random_state=42)),
])

rfc_pipe.fit(X_train, y_train)

# Accuracy on the held-out test split.
rfc_pipe.score(X_test, y_test)

0.8739130434782608

In [52]:
#RandomForestClassifier had lackluster accuracy. It was a slight improvement over the logistic regression, but nothing to write home about.
#Next, try a neural network, as they are among the strongest learners and, in the health industry,
#probably the most useful classifier to use in a situation where you must identify whether or not a patient has a life-threatening disease.