In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import joblib
from sklearn.inspection import permutation_importance
from sklearn import tree


In [5]:
#DATA ENGINEERING

#Import scv dataset
data = pd.read_csv("heart.csv")

#Separating HeartDisease and the rest of the dataset into two variables
X = data.drop(columns = ['HeartDisease'])
y = data['HeartDisease']

#Changine changing nonnumerical featuers into multiple columns
X = pd.get_dummies(X)

X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,False,True,False,True,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,True,False,False,False,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,False,True,False,True,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,True,False,True,False,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,False,True,False,False,True,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,False,True,False,False,False,True,False,True,False,True,False,False,True,False
914,68,144,193,1,141,3.4,False,True,True,False,False,False,False,True,False,True,False,False,True,False
915,57,130,131,0,115,1.2,False,True,True,False,False,False,False,True,False,False,True,False,True,False
916,57,130,236,0,174,0.0,True,False,False,True,False,False,True,False,False,True,False,False,True,False


In [6]:
# Splitting data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Creating the model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

#Define parameter for hyperparameter tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

#Get the best fit 
best_model = grid_search.best_estimator_

# Get feature importances
importances = model.feature_importances_

# Print feature importance scores
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance}")

# Prediction and accuracy
#prediction = model.predict(X_test) OLD WAY - NOT USING HYPERPARAMETER TUNING
prediction = best_model.predict(X_test) # Predicting using best fit

joblib.dump(best_model, 'model.joblib')
accuracy = accuracy_score(y_test, prediction)
print(f"Accuracy: {accuracy}")

Age: 0.05498359646167222
RestingBP: 0.06603892957942173
Cholesterol: 0.07403293938888017
FastingBS: 0.02855914676388872
MaxHR: 0.1186281675864131
Oldpeak: 0.08512522653556188
Sex_F: 0.00620500176118352
Sex_M: 0.03774615057828609
ChestPainType_ASY: 0.07241235747067945
ChestPainType_ATA: 0.01803054800835141
ChestPainType_NAP: 0.0
ChestPainType_TA: 0.0009271145209580848
RestingECG_LVH: 0.00923912978365
RestingECG_Normal: 0.0036626746506986025
RestingECG_ST: 0.008186101413646327
ExerciseAngina_N: 0.0036626746506986025
ExerciseAngina_Y: 0.02015637019678808
ST_Slope_Down: 0.007848588537211291
ST_Slope_Flat: 0.0
ST_Slope_Up: 0.38455528211201073
Accuracy: 0.8206521739130435
