In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
plt.style.use('ggplot')

Import dataset

In [2]:
dtype = {'admtype': 'int8',
 'age': 'float16',
 'asource': 'category',
 'campus': 'category',
 'diag_adm': 'category',
 'diag_cat3': 'category',
 'diag_cat4': 'category',
 'er_mode': 'int8',
 'moa': 'int8',
 'pay_ub92': 'int8',
 'provider': 'int8',
 'pt_state': 'category',
 'race': 'int8',
 'sex': 'int8',
 'tot': 'float64',
 'yoa': 'int8'}

In [3]:
df0 = pd.read_csv('df_sample_cleansed2.csv', dtype=dtype, low_memory=False)
#df0 = pd.read_csv('df_cleansed2.csv', dtype=dtype, low_memory=False)
#df3 = pd.read_csv('df_sample_coded.csv', low_memory=False)
#df3 = pd.read_csv('df_binary.csv', low_memory=False)
#df3 = pd.read_csv('df3_arrival.csv', low_memory=False)

In [4]:
#df3 = df3.drop(['Unnamed: 0'], axis='columns')

In [5]:
df0.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race,diag_cat3,diag_cat4
0,4,0.0,1,14,7,11,4,S,5926.0,RI,V3001,0,0,3,V30,V300
1,1,84.0,2,3,4,11,2,2,24048.0,MA,3310,0,9,1,331,3310
2,1,78.0,1,2,4,7,1,7,9680.0,RI,49121,1,0,1,491,4912
3,4,0.0,2,14,3,13,4,S,3013.0,RI,V3000,0,0,9,V30,V300
4,1,83.0,1,2,10,7,1,7,16246.0,RI,3488,1,0,1,348,3488


In [6]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
pay_ub92     20000 non-null int8
age          20000 non-null float16
sex          20000 non-null int8
provider     20000 non-null int8
moa          20000 non-null int8
yoa          20000 non-null int8
admtype      20000 non-null int8
asource      20000 non-null category
tot          20000 non-null float64
pt_state     20000 non-null category
diag_adm     20000 non-null category
campus       20000 non-null category
er_mode      20000 non-null int8
race         20000 non-null int8
diag_cat3    20000 non-null category
diag_cat4    20000 non-null category
dtypes: category(6), float16(1), float64(1), int8(8)
memory usage: 698.6 KB


In [7]:
df0.isna().sum()

pay_ub92     0
age          0
sex          0
provider     0
moa          0
yoa          0
admtype      0
asource      0
tot          0
pt_state     0
diag_adm     0
campus       0
er_mode      0
race         0
diag_cat3    0
diag_cat4    0
dtype: int64

## Transformation

In [8]:
def simplify_state(df):
    """Group states"""
    state_codes = ['RI', 'MA', 'NY', 'FL', '9']
    df.pt_state = df.pt_state.cat.set_categories(state_codes)
    df.pt_state = df.pt_state.fillna('9')
    return df

In [9]:
def simplify_age(df):
    """Reduce ages over 100 to 100"""
    df.age = df.age.where(df.age < 100, other=100)
    return df

In [10]:
def transform_target(df):
    
    return df

In [11]:
def transform_features(df):
    """Execute transformations"""
    df = simplify_state(df)
    df = simplify_age(df)
    #df = transform_target(df)
    return df

In [12]:
df1 = transform_features(df0)

## Feature Selection

In [13]:
drop_col = ['diag_cat3', 'diag_cat4']
df2 = df1.drop(drop_col, axis=1)

## One-Hot-Encoding

In [14]:
col = ['pay_ub92', 'provider', 'admtype', 'asource', 'pt_state', 'campus', 'er_mode', 'race', 'diag_adm']
df2 = pd.get_dummies(df2, columns=col, prefix=col)

## Split data set into Train and Test

In [15]:
df2.head()

Unnamed: 0,age,sex,moa,yoa,tot,pay_ub92_1,pay_ub92_2,pay_ub92_3,pay_ub92_4,pay_ub92_6,...,diag_adm_V6284,diag_adm_V6285,diag_adm_V650,diag_adm_V667,diag_adm_V702,diag_adm_V717,diag_adm_V7189,diag_adm_V7281,diag_adm_V7651,diag_adm_XXX
0,0.0,1,7,11,5926.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,84.0,2,4,11,24048.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,78.0,1,4,7,9680.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,2,3,13,3013.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,83.0,1,10,7,16246.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
y = df2.tot.astype(float)
X = df2.drop(columns=['tot'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Normalize Transform

In [18]:
from sklearn.preprocessing import Normalizer

In [19]:
def normalize_attr(train, test, attrs):
    for feature_name in attrs:
        max_value = train[feature_name].max()
        min_value = train[feature_name].min()
        train[feature_name] = (train[feature_name] - min_value) / (max_value - min_value)
        test[feature_name] = (test[feature_name] - min_value) / (max_value - min_value)
    return train, test

In [20]:
X_train, X_test = normalize_attr(X_train.copy(), X_test.copy(), ['age'])

In [21]:
def normalize_target(train, test):
    max_value = train.max()
    min_value = train.min()
    train = (train - min_value) / (max_value - min_value)
    test = (test - min_value) / (max_value - min_value)
    return train, test

In [22]:
y_train, y_test = normalize_target(y_train.copy(), y_test.copy())

In [23]:
y_train.describe()

count    16000.000000
mean         0.432148
std          0.246984
min          0.000000
25%          0.232000
50%          0.402350
75%          0.606933
max          1.000000
Name: tot, dtype: float64

In [24]:
#train, test = train_test_split(df2, test_size=0.2, random_state=0)

In [25]:
print(X_train.shape)
print(X_test.shape)
#print(train.shape)
#print(test.shape)

(16000, 1852)
(4000, 1852)


## Count Encoding

In [26]:
#diag_counts = train[['diag_cat3', 'tot']].groupby(by='diag_cat3').mean().to_dict()['tot']
#other = train.tot.mean

In [27]:
#def lookup_count(attr):
#    try:
#        return diag_counts[attr]
#    except KeyError:
#        return other   

In [28]:
#def count_encode(df, attr):
#    encoded = df[attr].apply(lookup_count)
#    df = df.assign(diag_encoded=encoded.values)
#    df.diag_encoded = df.diag_encoded/df.diag_encoded.max()
#    df = df.drop(attr, axis=1)
#    return df

In [29]:
#train = count_encode(train, 'diag_cat3')
#test = count_encode(test, 'diag_cat3')

In [30]:
#X_train = train.drop('tot', axis=1)
#X_test = test.drop('tot', axis=1)
#y_train = train.tot
#y_test = test.tot

In [31]:
#X_train.head()

In [32]:
#X_train.describe()

## Model Testing

In [33]:
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [34]:
def print_model_score(train_predicted, test_predicted):
    train_rmse = mean_squared_error(y_train, train_predicted)**.5
    train_r2 = r2_score(y_train, train_predicted)
    test_rmse = mean_squared_error(y_test, test_predicted)**.5
    test_r2 = r2_score(y_test, test_predicted)
    print("Train RMSE:\t{}".format(train_rmse))
    print("Train R2 Score:\t{}".format(train_r2))
    print("Test RMSE:\t{}".format(test_rmse))
    print("Test R2 Score:\t{}".format(test_r2))

## Decision Tree

In [35]:
from sklearn.tree import DecisionTreeRegressor

In [36]:
model = DecisionTreeRegressor(min_samples_split=.05)
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [37]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.20143123468769117
Train R2 Score:	0.3348129658752408
Test RMSE:	0.20170112758959713
Test R2 Score:	0.31372048980216327


In [38]:
fi = model.feature_importances_
col = X_train.columns
feat_imp = pd.DataFrame(fi, columns=['Importance'],index=col).sort_values(by='Importance', ascending=False)
#print(feat_imp)

## Lasso Regression

In [39]:
from sklearn.linear_model import Lasso

In [40]:
model = Lasso()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

  return umr_sum(a, axis, dtype, out, keepdims, initial)


In [41]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.24697609927006825
Train R2 Score:	0.0
Test RMSE:	0.24347853634014985
Test R2 Score:	-1.3228747556270903e-05


## AdaBoost

In [42]:
from sklearn.ensemble import AdaBoostRegressor

  from numpy.core.umath_tests import inner1d


In [43]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [44]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.21625033532335408
Train R2 Score:	0.23333837576272465
Test RMSE:	0.21594065562364878
Test R2 Score:	0.21340132292766356


## Boosting

In [39]:
from sklearn.ensemble import GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


In [40]:
params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
model = GradientBoostingRegressor(**params)
model.fit(X_train, y_train)
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [41]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	0.19165114385227985
Train R2 Score:	0.39783851082832333
Test RMSE:	0.1949202908156645
Test R2 Score:	0.3590878838963434


## XGBoost

In [80]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [81]:
# fit model to training data
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#          'learning_rate': 0.3, 'loss': 'ls'}
#model = XGBRegressor(**params)
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [82]:
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [83]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	5868.158168167421
Train R2 Score:	0.3727347066994531
Test RMSE:	5885.721982000509
Test R2 Score:	0.3507060461052517


## Random Forest

In [84]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
model = RandomForestRegressor(n_estimators=10, criterion='mae')
model.fit(X_train, y_train)

In [None]:
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

In [None]:
print_model_score(train_predicted, test_predicted)

## Neural Network

In [46]:
from sklearn.neural_network import MLPRegressor

In [47]:
model = MLPRegressor()
model.fit(X_train, y_train)

  return umr_sum(a, axis, dtype, out, keepdims, initial)


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [48]:
train_predicted = model.predict(X_train)
test_predicted = model.predict(X_test)

  return umr_sum(a, axis, dtype, out, keepdims, initial)


In [49]:
print_model_score(train_predicted, test_predicted)

Train RMSE:	5939.407643793184
Train R2 Score:	0.3574100877799382
Test RMSE:	5959.010555209887
Test R2 Score:	0.3344354525513916
