# Predicting Hospitalization Costs

Chris Defreitas

November 2018

Bryant University

# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
plt.style.use('ggplot')

In [2]:
import warnings
warnings.filterwarnings('ignore')

Import dataset

In [3]:
dtype = {'admtype': 'int8',
 'age': 'float16',
 'asource': 'category',
 'campus': 'category',
 'diag_adm': 'category',
 'diag_cat3': 'category',
 'diag_cat4': 'category',
 'er_mode': 'int8',
 'moa': 'int8',
 'pay_ub92': 'int8',
 'provider': 'int8',
 'pt_state': 'category',
 'race': 'int8',
 'sex': 'int8',
 'tot': 'float64',
 'yoa': 'int8'}

In [4]:
df0 = pd.read_csv('df_sample_cleansed2.csv', dtype=dtype, low_memory=False)


In [5]:
df0.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race,diag_cat3,diag_cat4
0,1,63.0,2,5,12,7,1,7,21071.0,RI,49121,3,0,1,491,4912
1,1,66.0,2,10,5,5,1,7,14791.0,RI,4359,0,0,1,435,4359
2,4,0.0,2,14,2,10,4,S,2501.0,RI,V3000,0,0,1,V30,V300
3,2,74.0,1,13,8,9,1,7,9444.0,RI,5990,0,1,1,599,5990
4,4,37.0,2,3,7,5,3,7,8771.0,RI,66704,0,9,5,667,6670


In [6]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
pay_ub92     20000 non-null int8
age          20000 non-null float16
sex          20000 non-null int8
provider     20000 non-null int8
moa          20000 non-null int8
yoa          20000 non-null int8
admtype      20000 non-null int8
asource      20000 non-null category
tot          20000 non-null float64
pt_state     20000 non-null category
diag_adm     20000 non-null category
campus       20000 non-null category
er_mode      20000 non-null int8
race         20000 non-null int8
diag_cat3    20000 non-null category
diag_cat4    20000 non-null category
dtypes: category(6), float16(1), float64(1), int8(8)
memory usage: 698.4 KB


In [7]:
df0.isna().sum()

pay_ub92     0
age          0
sex          0
provider     0
moa          0
yoa          0
admtype      0
asource      0
tot          0
pt_state     0
diag_adm     0
campus       0
er_mode      0
race         0
diag_cat3    0
diag_cat4    0
dtype: int64

## Transformation

In [8]:
def simplify_state(df):
    """Group states"""
    state_codes = ['RI', 'MA', 'NY', 'FL', '9']
    df.pt_state = df.pt_state.cat.set_categories(state_codes)
    df.pt_state = df.pt_state.fillna('9')
    return df

In [9]:
def simplify_age(df):
    """Reduce ages over 100 to 100"""
    df.age = df.age.where(df.age < 100, other=100)
    return df

In [10]:
def transform_target(df):
    
    return df

In [11]:
def transform_features(df):
    """Execute transformations"""
    df = simplify_state(df)
    df = simplify_age(df)
    #df = transform_target(df)
    return df

In [12]:
df1 = transform_features(df0)

## Feature Selection

In [13]:
drop_col = ['diag_cat3', 'diag_cat4']
df2 = df1.drop(drop_col, axis=1)

## One-Hot-Encoding

In [14]:
col = ['pay_ub92', 'provider', 'admtype', 'asource', 'pt_state', 'campus', 'er_mode', 'race', 'diag_adm']
df2 = pd.get_dummies(df2, columns=col, prefix=col)

## Split data set into Train and Test

In [15]:
df2.head()

Unnamed: 0,age,sex,moa,yoa,tot,pay_ub92_1,pay_ub92_2,pay_ub92_3,pay_ub92_4,pay_ub92_5,...,diag_adm_V594,diag_adm_V6284,diag_adm_V643,diag_adm_V667,diag_adm_V702,diag_adm_V714,diag_adm_V716,diag_adm_V7189,diag_adm_V8409,diag_adm_XXX
0,63.0,2,12,7,21071.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,66.0,2,5,5,14791.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,2,2,10,2501.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,74.0,1,8,9,9444.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37.0,2,7,5,8771.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
y = df2.tot.astype(float)
X = df2.drop(columns=['tot'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Normalize Transform

In [18]:
from sklearn.preprocessing import Normalizer

In [19]:
def normalize_attr(train, test, attrs):
    for feature_name in attrs:
        max_value = train[feature_name].max()
        min_value = train[feature_name].min()
        train[feature_name] = (train[feature_name] - min_value) / (max_value - min_value)
        test[feature_name] = (test[feature_name] - min_value) / (max_value - min_value)
    return train, test

In [20]:
X_train, X_test = normalize_attr(X_train.copy(), X_test.copy(), ['age'])

In [21]:
def normalize_target(train, test):
    max_value = train.max()
    min_value = train.min()
    train = (train - min_value) / (max_value - min_value)
    test = (test - min_value) / (max_value - min_value)
    return train, test

In [22]:
y_train, y_test = normalize_target(y_train.copy(), y_test.copy())

In [23]:
print(X_train.shape)
print(X_test.shape)

(16000, 1834)
(4000, 1834)


## Model Testing

In [24]:
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [25]:
def print_model_score(train_predicted, test_predicted):
    train_rmse = mean_squared_error(y_train, train_predicted)**.5
    train_r2 = r2_score(y_train, train_predicted)
    test_rmse = mean_squared_error(y_test, test_predicted)**.5
    test_r2 = r2_score(y_test, test_predicted)
    print("Train RMSE:\t{}".format(train_rmse))
    print("Train R2 Score:\t{}".format(train_r2))
    print("Test RMSE:\t{}".format(test_rmse))
    print("Test R2 Score:\t{}".format(test_r2))

## Decision Tree

In [26]:
from sklearn.tree import DecisionTreeRegressor

In [27]:
model = DecisionTreeRegressor(min_samples_split=.05)
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [28]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.2008872503345378
Train R2 Score:	0.3317660847871262
Test RMSE:	0.20736292859174946
Test R2 Score:	0.30963395793979875


In [29]:
fi = model.feature_importances_
col = X_train.columns
feat_imp = pd.DataFrame(fi, columns=['Importance'],index=col).sort_values(by='Importance', ascending=False)

## Lasso Regression

In [30]:
from sklearn.linear_model import Lasso

In [31]:
model = Lasso()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [32]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.24574693924419727
Train R2 Score:	0.0
Test RMSE:	0.24957606181000835
Test R2 Score:	-5.305388837251179e-05


## AdaBoost

In [33]:
from sklearn.ensemble import AdaBoostRegressor

  from numpy.core.umath_tests import inner1d


In [34]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [35]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.2171049157891304
Train R2 Score:	0.21951766264452144
Test RMSE:	0.22104780569145957
Test R2 Score:	0.2155060446591769


## Boosting

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

In [37]:
params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
model = GradientBoostingRegressor(**params)
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [38]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.19159151275785916
Train R2 Score:	0.3921781638593689
Test RMSE:	0.19965272937857834
Test R2 Score:	0.3600181039343088
