In [19]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
file_path = r'Heart Disease Indicators modified.csv'
heart_data = pd.read_csv(file_path)

In [3]:
heart_data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3,30,0,0,57,White,1,1,Very good,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80,White,0,1,Very good,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,67,White,1,1,Fair,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,77,White,0,0,Good,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,42,White,0,1,Very good,8,0,0,0


In [4]:
heart_data.drop(['Race', 'GenHealth'], axis = 1)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3,30,0,0,57,1,1,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80,0,1,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,67,1,1,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,77,0,0,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,42,0,1,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7,0,1,1,62,1,0,6,1,0,0
319791,0,29.84,1,0,0,0,0,0,1,37,0,1,5,1,0,0
319792,0,24.24,0,0,0,0,0,0,0,47,0,1,6,0,0,0
319793,0,32.81,0,0,0,0,0,0,0,27,0,0,12,0,0,0


In [5]:
y = heart_data.HeartDisease

In [6]:
#setting prediction metrics
disease_metrics = ['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Diabetic',
        'PhysicalActivity', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer']
X = heart_data[disease_metrics]

In [8]:
#splitting the data into a training and validation set
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [9]:
#decision tree model
tree_model = DecisionTreeRegressor()
tree_model.fit(train_X, train_y)
preds = tree_model.predict(val_X)
mae = mean_absolute_error(val_y, preds)
print(mae)

0.1387922729549969


In [10]:
#function to find mae of different tree sized
def tree_depth_error(max_nodes, train_X, val_X, train_y, val_y):
    heart_model = DecisionTreeRegressor(max_leaf_nodes=max_nodes)
    heart_model.fit(train_X, train_y)
    preds = heart_model.predict(val_X)
    mae = mean_absolute_error(val_y, preds)
    return mae

In [11]:
#using tree_depth_error function to find best tree size
rel = {}
for nodes in (5, 50, 500, 5000, 50000):
    my_mae = tree_depth_error(nodes, train_X, val_X, train_y, val_y)
    rel[my_mae] = nodes
best_tree_size = rel[min(rel)]
tree_error = min(rel)
print("best tree size: %d \t mae: %18.17f" %(best_tree_size, min(rel)))

best tree size: 5000 	 mae: 0.13384565201489770


In [12]:
#random forest model
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
forest_preds = forest_model.predict(val_X)
forest_model_error = mean_absolute_error(val_y, forest_preds)
print("forest error: %18.17f" %(forest_model_error))

forest error: 0.14026695120343591


In [13]:
#linear regression model
lr_model = LinearRegression()
lr_model.fit(train_X, train_y)
lr_preds = lr_model.predict(val_X)
lr_error = mean_absolute_error(val_y, lr_preds)
print("error: %18.17f" %(lr_error))

error: 0.14691809939453856


In [16]:
#neural network regression model
nnr_model = MLPRegressor()
nnr_model.fit(train_X, train_y)
nnr_ans = nnr_model.predict(val_X)
nnr_error = mean_absolute_error(val_y, nnr_ans)
print("error: %18.17f" %(nnr_error))

error: 0.16421733599555130


In [17]:
errors = {'Decision Tree': [tree_error], 'Random Forest': [forest_model_error], 
            'Linear Regression': [lr_error], 'Neural Network Regression': [nnr_error]}
errors_df = pd.DataFrame(errors)
errors_df
# we take the decision tree as the final model as MLPRegression gives very varying results

Unnamed: 0,Decision Tree,Random Forest,Linear Regression,Neural Network Regression
0,0.133846,0.140267,0.146918,0.164217


In [18]:
#defining final model
f_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size)
f_model.fit(X, y)

In [20]:
filename = 'finalized_model.pkl'
pickle.dump(f_model, open(filename, 'wb'))

In [21]:
#loaded_model = pickle.load(open(filename, 'rb'))