In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
plt.style.use('ggplot')

Import dataset

In [4]:
dtype = {'admtype': 'int8',
 'age': 'float16',
 'asource': 'category',
 'campus': 'category',
 'diag_adm': 'category',
 'diag_cat3': 'category',
 'diag_cat4': 'category',
 'er_mode': 'int8',
 'moa': 'int8',
 'pay_ub92': 'int8',
 'provider': 'int8',
 'pt_state': 'category',
 'race': 'int8',
 'sex': 'int8',
 'tot': 'float64',
 'yoa': 'int8'}

In [5]:
df0 = pd.read_csv('df_sample_cleansed.csv', dtype=dtype, low_memory=False)
#df0 = pd.read_csv('df_cleansed.csv', dtype=dtype, low_memory=False)
#df3 = pd.read_csv('df_sample_coded.csv', low_memory=False)
#df3 = pd.read_csv('df_binary.csv', low_memory=False)
#df3 = pd.read_csv('df3_arrival.csv', low_memory=False)

In [6]:
#df3 = df3.drop(['Unnamed: 0'], axis='columns')

In [7]:
df0.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race,diag_cat3,diag_cat4
0,10,0.0,1,9,4,8,4,S,2152.0,RI,V3000,0,0,1,V30,V300
1,9,58.0,1,9,5,6,1,7,5388.0,RI,0088,0,0,1,008,0088
2,9,36.0,2,13,3,5,3,1,6422.0,MA,65423,0,0,1,654,6542
3,8,22.0,1,16,10,11,1,A,10453.0,RI,29633,0,4,5,296,2963
4,99,74.0,2,1,12,13,1,1,6510.0,RI,29651,0,0,2,296,2965


In [8]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
pay_ub92     10000 non-null int8
age          10000 non-null float16
sex          10000 non-null int8
provider     10000 non-null int8
moa          10000 non-null int8
yoa          10000 non-null int8
admtype      10000 non-null int8
asource      10000 non-null category
tot          10000 non-null float64
pt_state     10000 non-null category
diag_adm     10000 non-null category
campus       10000 non-null category
er_mode      10000 non-null int8
race         10000 non-null int8
diag_cat3    10000 non-null category
diag_cat4    10000 non-null category
dtypes: category(6), float16(1), float64(1), int8(8)
memory usage: 388.4 KB


In [9]:
df0.isna().sum()

pay_ub92     0
age          0
sex          0
provider     0
moa          0
yoa          0
admtype      0
asource      0
tot          0
pt_state     0
diag_adm     0
campus       0
er_mode      0
race         0
diag_cat3    0
diag_cat4    0
dtype: int64

## Transformation

In [10]:
df1 = df0.copy()

In [11]:
def simplify_state(df):
    """Group states"""
    state_codes = ['RI', 'MA', 'NY', 'FL', '9']
    df.pt_state = df.pt_state.cat.set_categories(state_codes)
    df.pt_state = df.pt_state.fillna('9')
    return df

In [12]:
def simplify_age(df):
    """Reduce ages over 100 to 100"""
    df.age = df.age.where(df.age < 100, other=100)
    return df

In [13]:
def categorize_target(df):
    """Create a binary target variable"""
    tgt = df.tot > 25000
    df = df.assign(high_cost=tgt.values)
    return df

In [14]:
def transform_features(df):
    """Execute transformations"""
    df = simplify_state(df)
    df = simplify_age(df)
    df = categorize_target(df)
    return df

In [15]:
df1 = transform_features(df1)

## Feature Selection

In [16]:
drop_col = ['diag_adm', 'diag_cat4', 'high_cost']
df2 = df1.drop(drop_col, axis=1)

## One-Hot-Encoding

In [17]:
col = ['pay_ub92', 'provider', 'admtype', 'asource', 'pt_state', 'campus', 'er_mode', 'race', 'diag_cat3']
df2 = pd.get_dummies(df2, columns=col, prefix=col)

## Split data set into Train and Test

In [18]:
df2.head()

Unnamed: 0,age,sex,moa,yoa,tot,pay_ub92_1,pay_ub92_2,pay_ub92_3,pay_ub92_4,pay_ub92_6,...,diag_cat3_V55,diag_cat3_V57,diag_cat3_V58,diag_cat3_V59,diag_cat3_V62,diag_cat3_V63,diag_cat3_V66,diag_cat3_V70,diag_cat3_V71,diag_cat3_XXX
0,0.0,1,4,8,2152.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,58.0,1,5,6,5388.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,36.0,2,3,5,6422.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22.0,1,10,11,10453.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,74.0,2,12,13,6510.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
y = df2.tot.astype(float)
X = df2.drop(columns=['tot'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
#train, test = train_test_split(df2, test_size=0.2, random_state=0)

In [22]:
print(X_train.shape)
print(X_test.shape)
#print(train.shape)
#print(test.shape)

(8000, 575)
(2000, 575)


## Count Encoding

In [222]:
#diag_counts = train[['diag_cat3', 'tot']].groupby(by='diag_cat3').mean().to_dict()['tot']
#other = train.tot.mean

In [223]:
#def lookup_count(attr):
#    try:
#        return diag_counts[attr]
#    except KeyError:
#        return other   

In [224]:
#def count_encode(df, attr):
#    encoded = df[attr].apply(lookup_count)
#    df = df.assign(diag_encoded=encoded.values)
#    df.diag_encoded = df.diag_encoded/df.diag_encoded.max()
#    df = df.drop(attr, axis=1)
#    return df

In [225]:
#train = count_encode(train, 'diag_cat3')
#test = count_encode(test, 'diag_cat3')

In [226]:
#X_train = train.drop('tot', axis=1)
#X_test = test.drop('tot', axis=1)
#y_train = train.tot
#y_test = test.tot

In [227]:
#X_train.head()

In [228]:
#X_train.describe()

## Model Testing

In [23]:
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [24]:
def print_model_score(train_predicted, test_predicted):
    train_rmse = mean_squared_error(y_train, train_predicted)**.5
    train_r2 = r2_score(y_train, train_predicted)
    test_rmse = mean_squared_error(y_test, test_predicted)**.5
    test_r2 = r2_score(y_test, test_predicted)
    print("Train RMSE:\t{}".format(train_rmse))
    print("Train R2 Score:\t{}".format(train_r2))
    print("Test RMSE:\t{}".format(test_rmse))
    print("Test R2 Score:\t{}".format(test_r2))

## Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [27]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	11557.551377161566
Train R2 Score:	0.9373936174174887
Test RMSE:	63116.651937438866
Test R2 Score:	-0.38159786939150275


In [28]:
fi = model.feature_importances_
col = X_train.columns
feat_imp = pd.DataFrame(fi, columns=['Importance'],index=col).sort_values(by='Importance', ascending=False)
print(feat_imp)

               Importance
moa              0.146808
pt_state_MA      0.093594
age              0.083277
yoa              0.048587
asource_5        0.046266
diag_cat3_431    0.037181
asource_4        0.031973
campus_3         0.028605
race_9           0.022652
diag_cat3_803    0.022210
sex              0.018078
race_1           0.016974
pay_ub92_6       0.016901
diag_cat3_821    0.013135
pay_ub92_2       0.011060
diag_cat3_867    0.010998
diag_cat3_396    0.010889
diag_cat3_452    0.009430
provider_4       0.009102
pay_ub92_1       0.008988
asource_1        0.008488
diag_cat3_042    0.008479
diag_cat3_038    0.008334
asource_S        0.007757
diag_cat3_780    0.007587
pay_ub92_4       0.007550
diag_cat3_430    0.007162
pay_ub92_9       0.006909
er_mode_0        0.006602
diag_cat3_205    0.006502
...                   ...
diag_cat3_826    0.000000
diag_cat3_488    0.000000
diag_cat3_828    0.000000
diag_cat3_831    0.000000
diag_cat3_836    0.000000
diag_cat3_454    0.000000
diag_cat3_71

## Lasso Regression

In [235]:
from sklearn.linear_model import Lasso

In [236]:
model = Lasso()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

  return umr_sum(a, axis, dtype, out, keepdims, initial)


In [237]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	40829.33494339833
Train R2 Score:	0.21867529397774976
Test RMSE:	54734.417541741976
Test R2 Score:	-0.03899809149029676


## AdaBoost

In [42]:
from sklearn.ensemble import AdaBoostRegressor

In [43]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [44]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	42454.00729628877
Train R2 Score:	0.15525753646526463
Test RMSE:	55547.17772269557
Test R2 Score:	-0.07008368803758191


## Boosting

In [45]:
from sklearn.ensemble import GradientBoostingRegressor

In [46]:
params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
model = GradientBoostingRegressor(**params)
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [47]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	35974.332167098044
Train R2 Score:	0.39344187276731235
Test RMSE:	53114.78919158587
Test R2 Score:	0.02158146272999839


## XGBoost

In [238]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [239]:
# fit model to training data
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#          'learning_rate': 0.3, 'loss': 'ls'}
#model = XGBRegressor(**params)
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [240]:
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [241]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	39167.12455579084
Train R2 Score:	0.2809976271545833
Test RMSE:	54200.34633637991
Test R2 Score:	-0.018820960758463956


## Random Forest

In [242]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [255]:
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [256]:
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [257]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	19741.27068433493
Train R2 Score:	0.8173426501336625
Test RMSE:	55367.35477264504
Test R2 Score:	-0.06316653603708144
