# Predicting Hospitalization Costs

Chris Defreitas

November 2018

Bryant University

# Data Cleaning and Transformation

### Imports

Import libraries

In [1]:
import numpy as np
import pandas as pd
import copy
from pprint import pprint

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell is hidden.
# NOTE(review): this also hides pandas deprecation / dtype warnings raised by the
# cells below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

Import dataset

In [3]:
# Select the fields to load and the dtype each one is parsed with.
# The specs are kept in three groups (arrival / other / target) so the column
# lists for each group can be reused later in the notebook.
# NOTE(review): the float16 columns show a blank mean and 0.0 std in the
# describe() outputs further down -- presumably float16 overflow in the
# reductions; confirm before relying on those summary rows.

type_arrival = dict(
    sex='float16',
    er_mode='float16',
    admtype='float16',
    yoa='float16',
    campus='category',
    pay_ub92='category',
    provider='category',
    asource='category',
    moa='float16',
    age='float16',
    race='float16',
    diag_adm='category',
    pt_state='category',
)

type_other = dict()

type_target = dict(tot='float64')

# Column names per group, in declaration order.
col_arrival = list(type_arrival)
col_other = list(type_other)
col_target = list(type_target)

# Everything read_csv needs: the flat column list and one merged dtype mapping.
usecols = [*col_arrival, *col_other, *col_target]
dtype = {**type_arrival, **type_other, **type_target}

In [4]:
# Read only the selected columns, parsing each with its declared dtype.
read_options = {'usecols': usecols, 'dtype': dtype}
df0 = pd.read_csv('hdd0313cy.csv', **read_options)

In [5]:
# (rows, columns) of the raw import, before any filtering.
print(df0.shape)

(1544747, 14)


## Missing Data

Filter on yoa of at least 2005

In [6]:
# Work on a copy so the raw import in df0 is not modified by the cleaning cells.
df1 = df0.copy()

In [7]:
def getFullYear(y):
    '''Convert a raw yoa value to the number of years since 2000.

    Despite the name, the result is an offset from the year 2000, not a
    four-digit year:

    * values below 25 are read as 20yy            -> yy
    * values from 25 up to 99 are read as 19yy    -> yy - 100 (negative)
    * values of 100 or more are read as full years -> y - 2000

    Values that are already negative are returned unchanged, so applying the
    function to its own output is safe. NaN propagates as NaN.
    '''
    # Plain arithmetic instead of building "20" + str(y): the string form
    # raised ValueError for negative inputs (e.g. "200-1.0").
    if y < 25:
        return float(y)
    if y < 100:
        return float(y) - 100
    return y - 2000

In [8]:
# Replace each raw yoa value with the result of getFullYear.
df1['yoa'] = df1['yoa'].apply(getFullYear)

In [9]:
# Keep rows whose converted yoa is 5 or greater.
df1 = df1.loc[df1['yoa'].ge(5)]

In [10]:
# (rows, columns) remaining after the yoa filter.
print(df1.shape)

(1260408, 14)


Drop rows with a non-positive total cost (`tot <= 0`) and rows with a missing age

In [11]:
# Keep only rows with a strictly positive total cost (zero and NaN are dropped too).
df1 = df1.loc[df1['tot'].gt(0)]

In [12]:
# (rows, columns) remaining after the tot > 0 filter.
print(df1.shape)

(1260013, 14)


In [13]:
# Keep only rows where age is present.
df1 = df1.loc[df1['age'].notna()]

In [14]:
# (rows, columns) remaining after dropping rows with a missing age.
print(df1.shape)

(1260006, 14)


In [15]:
# Count of missing values per selected column, before any recoding.
df1[col_arrival + col_target].isna().sum()

sex              0
er_mode     209983
admtype          8
yoa              0
campus           0
pay_ub92         0
provider         0
asource        114
moa              0
age              0
race            20
diag_adm       554
pt_state       257
tot              0
dtype: int64

In [16]:
# Summary statistics for every selected column (numeric and categorical).
# NOTE(review): the float16 columns report a blank mean and a 0.0 std in this
# summary -- presumably float16 overflow inside the reductions; cast to a wider
# float (e.g. .astype('float32')) before trusting those two rows.
df1[col_arrival + col_target].describe(include='all')

Unnamed: 0,sex,er_mode,admtype,yoa,campus,pay_ub92,provider,asource,moa,age,race,diag_adm,pt_state,tot
count,1260006.0,1050023.0,1259998.0,1260006.0,1260006.0,1260006.0,1260006.0,1259892.0,1260006.0,1260006.0,1259986.0,1259452,1259749,1260006.0
unique,,,,,6.0,15.0,14.0,20.0,,,,6632,84,
top,,,,,0.0,1.0,7205.0,1.0,,,,V3000,RI,
freq,,,,,871432.0,356231.0,308331.0,464072.0,,,,75222,1157711,
mean,,,,8.901591,,,,,,,,,,25760.76
std,0.0,0.0,0.0,2.556645,,,,,0.0,0.0,0.0,,,45335.06
min,1.0,0.0,1.0,5.0,,,,,1.0,0.0,0.0,,,1.0
25%,1.0,0.0,1.0,7.0,,,,,3.0,29.0,1.0,,,8692.0
50%,2.0,0.0,1.0,9.0,,,,,6.0,53.0,1.0,,,15473.0
75%,2.0,1.0,3.0,11.0,,,,,9.0,74.0,1.0,,,28091.0


In [17]:
# List every distinct pt_state value (including NaN) to spot invalid codes.
print(list(df1.pt_state.unique()))

['RI', 'MA', 'MN', 'CT', 'CA', 'ME', 'IL', 'KY', 'VA', 'FL', 'XX', 'PA', 'TX', 'GA', 'NH', 'SC', 'AL', 'NC', 'WI', 'NY', 'QC', 'ID', nan, 'LA', 'NE', 'MD', 'MI', 'TN', 'NJ', 'NM', 'VT', 'KS', 'OR', 'NV', 'OH', 'PR', 'AZ', 'WV', 'AK', 'DE', 'IN', 'YY', 'OK', 'CO', 'MO', 'HI', 'UT', 'IA', 'ON', 'YT', 'DC', 'AR', 'MS', '75', 'VI', 'NB', 'UK', 'EN', 'ND', 'WA', '`', 'SD', 'FI', 'FC', 'U', 'Q', 'MT', '-2', 'UN', 'WY', 'R', 'D', 'BC', 'RD', 'PQ', 'CD', 'RO', '40', 'RT', 'AB', '02', '__', 'AS', 'ri', 'GU']


In [18]:
# Whitelist of pt_state codes to keep: codes observed in the data that are
# accepted as-is, plus '9' as the placeholder for missing/invalid values.
state_codes = {'RI', 'MA', 'MN', 'CT', 'CA', 'ME', 'IL', 'KY', 'VA', 'FL', 'PA', 'TX', 'GA', 'NH', 'SC', 'AL', 'NC',
               'WI', 'NY', 'QC', 'ID', 'LA', 'NE', 'MD', 'MI', 'TN', 'NJ', 'NM', 'VT', 'KS', 'OR', 'NV', 'OH', 'PR',
               'AZ', 'WV', 'AK', 'DE', 'IN', 'OK', 'CO', 'MO', 'HI', 'UT', 'IA', 'ON', 'YT', 'DC', 'AR', 'MS', 'VI',
               'NB', 'UK', 'EN', 'ND', 'WA', 'SD', 'FI', 'FC', 'MT', 'UN', 'WY', 'BC', 'RD', 'PQ', 'CD', 'RO', 'RT',
               'AB', 'AS', 'GU', '9'}
print(len(state_codes))
# Two-letter codes to accept even when they do not occur in the data.
ansi = {"AK", "AL", "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "FM", "GA", "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MH", "MI", "MN", "MO", "MP", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "PR", "PW", "RI", "SC", "SD", "TN", "TX", "UM", "UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"}

state_codes = state_codes.union(ansi)
print(len(state_codes))

# Pass the categories in sorted order: a set has no defined order, so handing it
# to set_categories directly makes the category order (and therefore the
# category codes) vary from one kernel session to the next.
# Values not in the whitelist (e.g. 'XX', '75', lowercase 'ri') become NaN here
# and are then recoded to '9' together with the originally missing values.
df1.pt_state = df1.pt_state.cat.set_categories(sorted(state_codes))
df1.pt_state = df1.pt_state.fillna('9')
print(df1.groupby(['pt_state']).size().sort_values(ascending=False))

72
77
pt_state
RI    1157711
MA      71857
CT      20816
FL       1908
NY       1677
9         667
NJ        604
NH        502
PA        412
CA        405
ME        374
VA        289
TX        259
NC        237
MD        225
GA        205
IL        155
VT        142
SC        138
OH        137
AZ        124
MI        105
IN         67
WA         66
CO         62
MN         60
NV         58
TN         57
LA         51
WI         49
       ...   
PR         14
KS         12
IA         12
AK         11
ND          9
MT          9
SD          7
QC          6
BC          5
WY          5
VI          4
FI          2
RO          2
NB          2
PQ          2
AB          1
RT          1
GU          1
CD          1
RD          1
UK          1
EN          1
YT          1
UN          1
AS          1
MP          0
FM          0
MH          0
UM          0
PW          0
Length: 77, dtype: int64


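Code missing values as 9 (exceptions: missing `er_mode` is filled with 0, and missing `diag_adm` is filled with the placeholder code `XXX`)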

In [19]:
# er_mode: missing values are filled with 0 (not 9).
df1.er_mode = df1.er_mode.fillna(0)
# race: missing values and non-positive codes are both recoded to 9.
df1.race = df1.race.fillna(9)
df1.race = df1.race.where(df1.race > 0, 9)
#df1.sex = df1.sex.fillna(9)
# asource is categorical: fillna only accepts an existing category, so make
# sure '9' is registered before using it as the fill value.
if '9' not in df1.asource.cat.categories:
    df1.asource = df1.asource.cat.add_categories(['9'])
df1.asource = df1.asource.fillna('9')
df1.admtype = df1.admtype.fillna(9)
# Fill any missing age with the median age.
df1.age = df1.age.fillna(df1.age.median())
# diag_adm: add the 'XXX' placeholder only once -- add_categories raises if the
# category already exists, which made this cell fail when it was re-run.
if 'XXX' not in df1.diag_adm.cat.categories:
    df1.diag_adm = df1.diag_adm.cat.add_categories(['XXX'])
df1.diag_adm = df1.diag_adm.fillna('XXX')

In [20]:
# Snapshot of df1 after the missing-value recoding.
df2 = df1.copy()

In [21]:
# (rows, columns) of the recoded frame.
print(df2.shape)

(1260006, 14)


In [22]:
# Count of missing values per selected column after recoding.
df2[col_arrival + col_target].isna().sum()

sex         0
er_mode     0
admtype     0
yoa         0
campus      0
pay_ub92    0
provider    0
asource     0
moa         0
age         0
race        0
diag_adm    0
pt_state    0
tot         0
dtype: int64

In [23]:
# Summary statistics for every selected column after recoding.
df2[col_arrival + col_target].describe(include='all')

Unnamed: 0,sex,er_mode,admtype,yoa,campus,pay_ub92,provider,asource,moa,age,race,diag_adm,pt_state,tot
count,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006.0,1260006,1260006,1260006.0
unique,,,,,6.0,15.0,14.0,20.0,,,,6633,72,
top,,,,,0.0,1.0,7205.0,1.0,,,,V3000,RI,
freq,,,,,871432.0,356231.0,308331.0,464072.0,,,,75222,1157711,
mean,,,,8.901591,,,,,,,,,,25760.76
std,0.0,0.0,0.0,2.556645,,,,,0.0,0.0,0.0,,,45335.06
min,1.0,0.0,1.0,5.0,,,,,1.0,0.0,1.0,,,1.0
25%,1.0,0.0,1.0,7.0,,,,,3.0,29.0,1.0,,,8692.0
50%,2.0,0.0,1.0,9.0,,,,,6.0,53.0,1.0,,,15473.0
75%,2.0,1.0,3.0,11.0,,,,,9.0,74.0,1.0,,,28091.0


## Categorize Diagnosis Codes

In [24]:
# Separate copy for this section; the diagnosis-prefix columns are added to df3.
df3 = df2.copy()

In [25]:
# First rows of df3 before the diagnosis-prefix columns are added.
df3.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race
282340,6,87.0,2.0,7210,2.0,5.0,1.0,7,3412.0,RI,486,0,0.0,1.0
282341,6,38.0,1.0,7210,2.0,5.0,1.0,7,4132.0,RI,5128,0,0.0,1.0
282342,6,22.0,2.0,7210,2.0,5.0,1.0,9,12583.0,RI,29650,0,0.0,1.0
282343,6,45.0,2.0,7210,1.0,5.0,1.0,7,17462.0,RI,7850,0,0.0,1.0
282344,6,26.0,2.0,7210,1.0,5.0,2.0,1,10357.0,RI,650,0,0.0,9.0


In [26]:
# First three characters of the admitting diagnosis code.
# Derived from df3 itself (not from the earlier df1 frame) so the new column is
# index-aligned with the frame it is attached to and does not depend on df1
# still having the same rows in the same order.
diag_cat3 = df3.diag_adm.astype(str).str[:3]
df3 = df3.assign(diag_cat3=diag_cat3)

In [27]:
# First four characters of the admitting diagnosis code.
# Derived from df3 itself (not from the earlier df1 frame) so the new column is
# index-aligned with the frame it is attached to and does not depend on df1
# still having the same rows in the same order.
diag_cat4 = df3.diag_adm.astype(str).str[:4]
df3 = df3.assign(diag_cat4=diag_cat4)

In [28]:
# First rows of df3, now including diag_cat3 and diag_cat4.
df3.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race,diag_cat3,diag_cat4
282340,6,87.0,2.0,7210,2.0,5.0,1.0,7,3412.0,RI,486,0,0.0,1.0,486,486
282341,6,38.0,1.0,7210,2.0,5.0,1.0,7,4132.0,RI,5128,0,0.0,1.0,512,5128
282342,6,22.0,2.0,7210,2.0,5.0,1.0,9,12583.0,RI,29650,0,0.0,1.0,296,2965
282343,6,45.0,2.0,7210,1.0,5.0,1.0,7,17462.0,RI,7850,0,0.0,1.0,785,7850
282344,6,26.0,2.0,7210,1.0,5.0,2.0,1,10357.0,RI,650,0,0.0,9.0,650,650


In [29]:
# Separate copy for the dtype changes below.
df4 = df3.copy()

## Change Datatypes

In [30]:
# Column dtypes before the conversion to compact types.
print(df4.dtypes)

pay_ub92     category
age           float16
sex           float16
provider     category
moa           float16
yoa           float64
admtype       float16
asource      category
tot           float64
pt_state     category
diag_adm     category
campus       category
er_mode       float16
race          float16
diag_cat3      object
diag_cat4      object
dtype: object


In [31]:
# Rebase the provider codes by subtracting 7200 so they fit into int8 later.
# The right-hand side reads from df3 (which this notebook never rebases), so
# re-running the cell cannot subtract 7200 a second time.
df4.provider = df3.provider.astype('int16') - 7200

In [32]:
# Target dtype per column for the final frame: small integer codes go to int8,
# the derived diagnosis prefixes become categoricals.
int8_cols = ['pay_ub92', 'sex', 'provider', 'moa', 'yoa', 'admtype', 'er_mode', 'race']
category_cols = ['diag_cat3', 'diag_cat4']

types = {
    **dict.fromkeys(int8_cols, 'int8'),
    **dict.fromkeys(category_cols, 'category'),
}

In [33]:
# Print the distinct values of every column that is about to be cast.
for column in types:
    print(column, list(df4[column].unique()))

pay_ub92 ['6', '2', '10', '9', '1', '8', '3', '7', '4', '11', '13', '99', '5', '12', '98']
sex [2.0, 1.0, 9.0]
provider [10, 5, 1, 11, 4, 3, 13, 14, 6, 9, 2, 16, 15, 12]
moa [2.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
yoa [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
admtype [1.0, 2.0, 3.0, 4.0, 9.0]
er_mode [0.0, 9.0, 1.0, 5.0, 3.0, 4.0, 2.0]
race [1.0, 9.0, 3.0, 5.0, 2.0, 6.0, 7.0, 4.0]
diag_cat3 ['486', '512', '296', '785', '650', '311', '038', '995', '997', '428', '185', '654', '715', '291', '560', '530', '789', '682', '557', '541', 'V30', '814', '642', '586', '411', '786', 'V10', '410', '427', '540', '592', '625', '578', '491', 'V55', '434', '401', '453', '487', '787', '780', '782', '996', '157', '196', '340', '599', '584', '722', '788', '415', '285', '515', '600', 'V45', '444', '442', '562', '286', '496', '153', '309', '620', '726', '218', '810', '825', '574', '276', '784', '707', '820', '591', '305', '823', '300', '781', '202', '250', '724', '493', '555',

In [34]:
# Cast the columns listed in `types` to their target dtypes; other columns keep theirs.
df4 = df4.astype(types)

In [35]:
# First rows of df4 after the dtype conversion.
df4.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,tot,pt_state,diag_adm,campus,er_mode,race,diag_cat3,diag_cat4
282340,6,87.0,2,10,2,5,1,7,3412.0,RI,486,0,0,1,486,486
282341,6,38.0,1,10,2,5,1,7,4132.0,RI,5128,0,0,1,512,5128
282342,6,22.0,2,10,2,5,1,9,12583.0,RI,29650,0,0,1,296,2965
282343,6,45.0,2,10,1,5,1,7,17462.0,RI,7850,0,0,1,785,7850
282344,6,26.0,2,10,1,5,2,1,10357.0,RI,650,0,0,9,650,650


In [36]:
# Per-column non-null counts, dtypes and total memory usage after the conversion.
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260006 entries, 282340 to 1544746
Data columns (total 16 columns):
pay_ub92     1260006 non-null int8
age          1260006 non-null float16
sex          1260006 non-null int8
provider     1260006 non-null int8
moa          1260006 non-null int8
yoa          1260006 non-null int8
admtype      1260006 non-null int8
asource      1260006 non-null category
tot          1260006 non-null float64
pt_state     1260006 non-null category
diag_adm     1260006 non-null category
campus       1260006 non-null category
er_mode      1260006 non-null int8
race         1260006 non-null int8
diag_cat3    1260006 non-null category
diag_cat4    1260006 non-null category
dtypes: category(6), float16(1), float64(1), int8(8)
memory usage: 42.7 MB


Print out datatypes as a dictionary to make importing easier in the next notebook

In [37]:
# Build a {column name: dtype name} mapping and pretty-print it.
dtypes = df4.dtypes
dt = dict(zip(dtypes.index, map(str, dtypes)))
pprint(dt)

{'admtype': 'int8',
 'age': 'float16',
 'asource': 'category',
 'campus': 'category',
 'diag_adm': 'category',
 'diag_cat3': 'category',
 'diag_cat4': 'category',
 'er_mode': 'int8',
 'moa': 'int8',
 'pay_ub92': 'int8',
 'provider': 'int8',
 'pt_state': 'category',
 'race': 'int8',
 'sex': 'int8',
 'tot': 'float64',
 'yoa': 'int8'}


## Filter Data

In [38]:
#df4.to_csv("df_cleansed2_all.csv", index=False)

In [39]:
# Keep only rows whose total cost is at most 30000.
df4 = df4.loc[df4['tot'].le(30000)]

## Save to CSV

In [40]:
#df_sample = df4.sample(20000, random_state=0)

In [41]:
#df_sample.to_csv("df_sample_cleansed2.csv", index=False)

In [42]:
#df4.to_csv("df_cleansed2.csv", index=False)