In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Load the patient table from the CSV in the working directory and preview it.
df = pd.read_csv('patients.csv')
df.head()

In [None]:
# Keep the 'Id' column aside before removing it from the frame.
id = df['Id']

# Columns removed from the working frame in one go.
dropped_columns = [
    'Id', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
    'FIRST', 'LAST', 'SUFFIX', 'MAIDEN',
    'ADDRESS', 'ZIP', 'STATE', 'CITY', 'BIRTHPLACE',
    'LAT', 'LON',
]
df = df.drop(columns=dropped_columns)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Parse both date columns; a missing DEATHDATE becomes NaT.
df['BIRTHDATE'] = pd.to_datetime(df['BIRTHDATE'],format='%Y-%m-%d')
df['DEATHDATE'] = pd.to_datetime(df['DEATHDATE'],format='%Y-%m-%d')

# 1 when a death date is recorded, 0 otherwise.
df['isDead'] = np.where(df['DEATHDATE'].notnull(),1,0)

now = pd.Timestamp('now')
# Age in whole years, measured up to the death date when there is one and up to
# `now` otherwise. The earlier `.astype('<m8[Y]')` cast is rejected by
# pandas >= 2.0 (year is not a supported timedelta resolution), so the year
# count is derived from the elapsed days instead (365.2425 days per year, the
# same year length NumPy uses for its 'Y' unit).
age_end = df['DEATHDATE'].fillna(now)
df['age'] = np.floor((age_end - df['BIRTHDATE']).dt.days / 365.2425)

# Left-closed age bands: [0, 20), [20, 40), ... ; ages outside 0-120 become NaN.
df['binned']=pd.cut(x=df['age'], bins=[0,20,40,60,80,100,110,120], right=False)
df = df.drop(columns=['BIRTHDATE','DEATHDATE'])
df.head()

In [None]:
# Binary encodings: ETHNICITY -> 0 for 'nonhispanic', 1 for anything else;
# GENDER -> 1 for 'M', 0 for anything else.
is_not_nonhispanic = df['ETHNICITY'] != 'nonhispanic'
is_male = df['GENDER'] == 'M'
df['ETHNICITY'] = is_not_nonhispanic.astype(int)
df['GENDER'] = is_male.astype(int)
df.head()

In [None]:
# Everyone in the [0, 20) age band is marked 'S'.
youngest_band = df["binned"] == pd.Interval(0, 20, closed='left')
df.loc[youngest_band, 'MARITAL'] = 'S'

# Missing MARITAL values in the [20, 40) band are filled with 'M'.
second_band = df["binned"] == pd.Interval(20, 40, closed='left')
marital_missing = df['MARITAL'].isna()
df.loc[second_band & marital_missing, 'MARITAL'] = 'M'

# Collapse to a flag: 1 for 'M', 0 for every other value (including missing).
df.MARITAL = np.where(df.MARITAL == 'M', 1, 0)
df.head()

In [None]:
# correlation plot
# Restrict to numeric columns first: the frame still holds text/categorical
# columns at this point (e.g. RACE, COUNTY, binned), and DataFrame.corr() in
# pandas >= 2.0 raises on them instead of silently skipping them.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap = 'Wistia', annot= True)

In [None]:
# One-hot encode the remaining categorical columns.
one_hot_columns = ['RACE', 'COUNTY', 'binned']
df = pd.get_dummies(df, columns=one_hot_columns)
df.head()

In [None]:
# Move the two target columns to the end of the frame, keeping every other
# column in its current order.
target = ['HEALTHCARE_EXPENSES','HEALTHCARE_COVERAGE']
leading_columns = [col for col in df.columns if col not in target]
trailing_columns = [col for col in target if col in df.columns]
df = df[leading_columns + trailing_columns]
df.head()

In [None]:
# Y and Z are the two regression targets; X holds every remaining column.
Y = df['HEALTHCARE_COVERAGE']
Z = df['HEALTHCARE_EXPENSES']
X = df.drop(target, axis=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# 80/20 train/test split with Y (HEALTHCARE_COVERAGE) as the target.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
def display_scores(scores):
    """Print a collection of scores, then their mean and standard deviation.

    `scores` must expose `.mean()` and `.std()` (for example a NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   - 3 x 4 = 12 combinations that leave `bootstrap` at its default
#   - 2 x 3 = 6 combinations with `bootstrap` switched off
default_bootstrap_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}
no_bootstrap_grid = {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

forest_reg_cov = RandomForestRegressor(random_state=42)

# 5-fold cross-validation over 12 + 6 = 18 combinations -> 90 training rounds.
grid_search_cov = GridSearchCV(
    estimator=forest_reg_cov,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search_cov.fit(x_train, y_train)

In [None]:
# Best hyperparameter combination and the corresponding estimator.
(
    grid_search_cov.best_params_,
    grid_search_cov.best_estimator_,
)

In [None]:
# List the cross-validated RMSE of every parameter combination.
# Scores are negated MSE values, so flip the sign before taking the root.
cvres = grid_search_cov.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

In [None]:
# Evaluate the tuned coverage model on the held-out test split.
# The estimator comes from `grid_search_cov`, the search fitted for this target;
# the name `grid_search` used here before is never defined in the notebook and
# raised NameError on a fresh kernel.
final_model_cov = grid_search_cov.best_estimator_
cov_pred = final_model_cov.predict(x_test)

# Root-mean-squared error between the true and predicted test values.
final_mse = mean_squared_error(y_test, cov_pred)
final_rmse = np.sqrt(final_mse)
print("RMSE on Test ==> ",final_rmse)

In [None]:
# Collect true and predicted test values in one frame (indexed like y_test)
# and hand the whole frame to seaborn's scatterplot.
temp_df = pd.DataFrame({"true": y_test}).assign(pred=cov_pred)
sns.scatterplot(data=temp_df)

In [None]:
# 80/20 train/test split with Z (HEALTHCARE_EXPENSES) as the target.
# This rebinds x_train / x_test / y_train / y_test from the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Z,
    test_size=0.2,
    random_state=0,
)

In [None]:
def display_scores(scores):
    """Report the raw scores, then their mean, then their standard deviation.

    Note: this redefines the identical `display_scores` declared earlier in
    the notebook.
    """
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    # Each statistic is computed only when its line is printed.
    for label, compute in report:
        print(label, compute(scores))

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   - 3 x 4 = 12 combinations that leave `bootstrap` at its default
#   - 2 x 3 = 6 combinations with `bootstrap` switched off
default_bootstrap_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}
no_bootstrap_grid = {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

forest_reg_exp = RandomForestRegressor(random_state=42)

# 5-fold cross-validation over 12 + 6 = 18 combinations -> 90 training rounds.
grid_search_exp = GridSearchCV(
    estimator=forest_reg_exp,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search_exp.fit(x_train, y_train)

In [None]:
# Best hyperparameter combination and the corresponding estimator.
(
    grid_search_exp.best_params_,
    grid_search_exp.best_estimator_,
)

In [None]:
# List the cross-validated RMSE of every parameter combination.
# Scores are negated MSE values, so flip the sign before taking the root.
cvres = grid_search_exp.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

In [None]:
# Evaluate the tuned expenses model on the held-out test split.
# The estimator comes from `grid_search_exp`, and predictions are made with
# `final_model_exp`; the names `grid_search1` / `final_model1` used here before
# are never defined in the notebook and raised NameError on a fresh kernel.
final_model_exp = grid_search_exp.best_estimator_
exp_pred = final_model_exp.predict(x_test)

# Root-mean-squared error between the true and predicted test values.
final_mse = mean_squared_error(y_test, exp_pred)
final_rmse = np.sqrt(final_mse)
print("RMSE on Test ==> ",final_rmse)

In [None]:
# Collect true and predicted test values in one frame (indexed like y_test)
# and hand the whole frame to seaborn's scatterplot.
temp_df = pd.DataFrame({"true": y_test}).assign(pred=exp_pred)
sns.scatterplot(data=temp_df)

In [None]:
import pickle

# Persist both tuned models to disk. Each file is opened in a `with` block so
# its handle is flushed and closed as soon as the dump finishes, rather than
# being left open until garbage collection.
with open('cover.pkl', 'wb') as cover_file:
    pickle.dump(final_model_cov, cover_file)
with open('expense.pkl', 'wb') as expense_file:
    pickle.dump(final_model_exp, expense_file)

In [None]:
# Single hand-built sample with 31 feature values, shaped (1, 31) for predict().
# NOTE(review): the positions presumably follow the column order of X - confirm.
val = np.zeros((1, 31))
val[0, [1, 2, 5, 10, 25]] = 1.0   # entries equal to 1
val[0, 4] = 32.0                  # the only other non-zero entry

In [None]:
# Predict with the expenses model for the hand-built sample `val`.
# The receiver was missing here (a bare `.predict(val)` is a syntax error);
# `final_model_exp` is used as the counterpart of the coverage prediction
# made in the following cell.
final_model_exp.predict(val)

In [None]:
# Predict with the coverage model for the hand-built sample `val`.
final_model_cov.predict(val)