In [1]:
import pandas as pd
import numpy as np
import prepdatamodel as pdm
from sklearn.linear_model import LinearRegression

In [2]:
heart = pd.read_csv('heart_2020_cleaned.csv')
col_bin = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
heart = pdm.binary(heart, col_bin)
heartyes = heart[heart.HeartDisease == 1]
heartno = heart[heart.HeartDisease == 0]
heartnodrop = heartno.sample(frac = 0.9, random_state = 0)
heart = heart.drop(heartnodrop.index)

In [3]:
col_num = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
col_cat = ['Sex', 'AgeCategory', 'Race', 'Diabetic', 'GenHealth']

In [4]:
heart_train = heart.sample(frac = 0.8, random_state = 0)
heart_test = heart.drop(heart_train.index)

In [5]:
heart_train_answer = heart_train.pop("HeartDisease")
heart_test_answer = heart_test.pop("HeartDisease")

In [6]:
Y = heart_train_answer.values
X = pdm.pipeline(heart_train, col_num, col_cat)
y = heart_test_answer.values
x = pdm.pipeline(heart_test, col_num, col_cat)

In [8]:
lin_reg = LinearRegression()
lin_reg.fit(X, Y)

LinearRegression()

In [10]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(lin_reg.predict(x), list(y))
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.4058031223994454

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
forest_reg = RandomForestRegressor()
forest_reg.fit(X, Y)
forest_mse = mean_squared_error(forest_reg.predict(x), list(y))
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.4260761882489967

In [29]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True)

In [30]:
grid_search.fit(X, Y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [31]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [32]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

In [33]:
grid_search_mse = mean_squared_error(grid_search.best_estimator_.predict(x), list(y))
grid_search_rmse = np.sqrt(grid_search_mse)
grid_search_rmse 

0.42603380031232113