# Predicting Hospitalization Costs

Chris Defreitas

November 2018

Bryant University

# Data Cleaning and Transformation

### Imports

Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
%matplotlib inline
plt.style.use('ggplot')

Import dataset

In [2]:
# Specify the selected field and their datatypes 

type_arrival = {'sex': 'float16',
                'er_mode': 'float16',
                'admtype': 'float16',
                'yoa': 'float16',
                'campus': 'category',
                'pay_ub92': 'category',
                'provider': 'category', 
                'asource': 'category',
                'moa': 'float16',
                'age': 'float16',
                'race': 'float16'}

type_diag = {'diag_adm': 'category', 'dx1': 'category', 'dx2': 'category', 'dx3': 'category', 'dx4': 'category', 
                'dx5': 'category', 'dx6': 'category', 'dx7': 'category', 'dx8': 'category', 'dx9': 'category', 
                'dx10': 'category', 'dx11': 'category', 'dx12': 'category', 'dx13': 'category', 'dx14': 'category', 
                'dx15': 'category', 'dx16': 'category', 'dx17': 'category', 'dx18': 'category', 'dx19': 'category', 
                'dx20': 'category', 'dx21': 'category', 'dx22': 'category', 'dx23': 'category', 'dx24': 'category',
                'dx25': 'category', 
             'poa1': 'category', 'poa2': 'category', 'poa3': 'category', 'poa4': 'category', 'poa5': 'category', 
                'poa6': 'category', 'poa7': 'category', 'poa8': 'category', 'poa9': 'category', 'poa10': 'category', 
                'poa11': 'category','poa12': 'category', 'poa13': 'category', 'poa14': 'category', 'poa15': 'category', 
                'poa16': 'category', 'poa17': 'category', 'poa18': 'category', 'poa19': 'category', 'poa20': 'category', 
                'poa21': 'category', 'poa22': 'category','poa23': 'category','poa24': 'category','poa25': 'category'}

type_target = {'tot': 'float64'}

col_arrival = [*type_arrival]
col_diag = [*type_diag]
col_target = [*type_target]

usecols = col_arrival + col_diag + col_target
dtype = {}
for d in [type_arrival, type_diag, type_target]:
    for k, v in d.items():
        dtype[k] = v

In [3]:
# import data using relevant columns and datatypes
df0 = pd.read_csv('hdd0313cy.csv', 
                  usecols=usecols, 
                  dtype=dtype,
                  #nrows=10000
                 )

In [4]:
print(df0.shape)

(1544747, 63)


## Missing Data

Filter on yoa of at least 2010

In [5]:
df1 = df0.copy()

In [6]:
def getFullYear(y):
    '''Converts yoa from yy format to yyyy format'''
    if y == 0:
        return 0
    elif y < 10:
        return float("200"+str(y)) - 2000
    elif y < 25:
        return float("20"+str(y)) - 2000
    elif y < 100:
        return float("19"+str(y)) - 2000
    else:
        return y - 2000

In [7]:
df1.yoa = df1.yoa.apply(getFullYear)

In [8]:
df1 = df1[df1.yoa >= 10]

In [9]:
print(df1.shape)

(537599, 63)


In [10]:
df1[col_arrival + col_target].isna().sum()

sex             0
er_mode     19126
admtype         1
yoa             0
campus          0
pay_ub92        0
provider        0
asource        14
moa             0
age             1
race           10
tot             0
dtype: int64

In [11]:
df1[col_arrival + col_target].describe(include='all')

Unnamed: 0,sex,er_mode,admtype,yoa,campus,pay_ub92,provider,asource,moa,age,race,tot
count,537599.0,518473.0,537598.0,537599.0,537599.0,537599.0,537599.0,537585.0,537599.0,537598.0,537589.0,537599.0
unique,,,,,5.0,15.0,14.0,17.0,,,,
top,,,,,0.0,1.0,7205.0,1.0,,,,
freq,,,,,371170.0,152265.0,137260.0,260502.0,,,,
mean,,,,11.459344,,,,,,,,29156.93
std,0.0,0.0,0.0,1.112772,,,,,0.0,0.0,0.0,49306.46
min,1.0,0.0,1.0,10.0,,,,,1.0,0.0,0.0,-1966.0
25%,1.0,0.0,1.0,10.0,,,,,3.0,29.0,1.0,10779.0
50%,2.0,0.0,1.0,11.0,,,,,6.0,54.0,1.0,18140.0
75%,2.0,1.0,2.0,12.0,,,,,9.0,73.0,1.0,31909.5


Code missing values as 9

In [12]:
df1.er_mode = df1.er_mode.fillna(0)
df1.race = df1.race.fillna(9)
df1.sex = df1.sex.fillna(9)
df1.asource = df1.asource.fillna('9')
df1.admtype = df1.admtype.fillna(9)

In [13]:
df2 = df1.copy()

Drop one record with missing age

In [14]:
df2 = df2[False == pd.isna(df2.age)]

In [15]:
print(df2.shape)

(537598, 63)


In [16]:
df2[col_arrival + col_target].isna().sum()

sex         0
er_mode     0
admtype     0
yoa         0
campus      0
pay_ub92    0
provider    0
asource     0
moa         0
age         0
race        0
tot         0
dtype: int64

In [17]:
df2[col_arrival + col_target].describe(include='all')

Unnamed: 0,sex,er_mode,admtype,yoa,campus,pay_ub92,provider,asource,moa,age,race,tot
count,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0,537598.0
unique,,,,,5.0,15.0,14.0,17.0,,,,
top,,,,,0.0,1.0,7205.0,1.0,,,,
freq,,,,,371169.0,152265.0,137260.0,260502.0,,,,
mean,,,,11.459343,,,,,,,,29156.97
std,0.0,0.0,0.0,1.112773,,,,,0.0,0.0,0.0,49306.49
min,1.0,0.0,1.0,10.0,,,,,1.0,0.0,0.0,-1966.0
25%,1.0,0.0,1.0,10.0,,,,,3.0,29.0,1.0,10779.0
50%,2.0,0.0,1.0,11.0,,,,,6.0,54.0,1.0,18140.0
75%,2.0,0.0,2.0,12.0,,,,,9.0,73.0,1.0,31909.75


In [18]:
df2[col_diag].isna().sum()

diag_adm       374
dx1            292
dx2          28554
dx3          64869
dx4         103605
dx5         143509
dx6         183099
dx7         221179
dx8         257031
dx9         290652
dx10        321812
dx11        350944
dx12        377281
dx13        401009
dx14        422275
dx15        441013
dx16        458260
dx17        473132
dx18        485277
dx19        495026
dx20        503045
dx21        509691
dx22        515237
dx23        519595
dx24        523077
dx25        526015
poa1           783
poa2         41417
poa3         76274
poa4        100121
poa5        149324
poa6        185468
poa7        208107
poa8        254104
poa9        285172
poa10       301869
poa11       341323
poa12       365777
poa13       377102
poa14       397400
poa15       415354
poa16       431256
poa17       445735
poa18       457483
poa19       466964
poa20       474781
poa21       481242
poa22       486653
poa23       490917
poa24       494362
poa25       497243
dtype: int64

Drop records with missing diagnosis on admission

In [19]:
df2 = df2[False == pd.isna(df2.diag_adm)]

In [20]:
df2.shape

(537224, 63)

In [21]:
df2[col_diag].isna().sum()

diag_adm         0
dx1              2
dx2          28261
dx3          64567
dx4         103295
dx5         143188
dx6         182769
dx7         220840
dx8         256689
dx9         290307
dx10        321462
dx11        350590
dx12        376921
dx13        400640
dx14        421905
dx15        440642
dx16        457889
dx17        472761
dx18        484906
dx19        494655
dx20        502673
dx21        509319
dx22        514864
dx23        519222
dx24        522703
dx25        525641
poa1           495
poa2         41125
poa3         75976
poa4         99816
poa5        149012
poa6        185148
poa7        207780
poa8        253774
poa9        284840
poa10       301532
poa11       340981
poa12       365429
poa13       376746
poa14       397043
poa15       414996
poa16       430899
poa17       445377
poa18       457125
poa19       466606
poa20       474423
poa21       480884
poa22       486294
poa23       490558
poa24       494002
poa25       496883
dtype: int64

Remove columns that are not known upon admission. Remove rows that are missing relevant data.

In [22]:
df2 = df2[df2.age <= 100]
print(df2.shape)

(536923, 63)


## Cleanse Diagnosis Codes

In [23]:
df3 = df2.copy()

In [24]:
#df3.to_csv("df3.csv")

In [25]:
df3.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,dx1,dx2,...,poa16,poa17,poa18,poa19,poa20,poa21,poa22,poa23,poa24,poa25
1005352,4,20.0,2.0,7214,3.0,10.0,2.0,1,66401,64821,...,,,,,,,,,,
1005353,4,0.0,2.0,7214,3.0,10.0,4.0,S,V3000,7746,...,,,,,,,,,,
1005354,4,28.0,2.0,7214,3.0,10.0,2.0,1,64893,514,...,,,,,,,,,,
1005355,4,28.0,2.0,7214,3.0,10.0,2.0,1,64863,4280,...,,,,,,,,,,
1005356,4,26.0,2.0,7214,3.0,10.0,2.0,1,65813,64843,...,,,,,,,,,,


Preview data

In [26]:
cxx = ['sex', 'age', 'provider', 'diag_adm', 'dx1', 'dx2', 'dx3', 'dx4', 'dx5', 'poa1', 'poa2', 'poa3', 'poa4', 'poa5']
df3_preview = df3.filter(items=cxx, axis='columns')

## Encoding All dx Present on Admission

Delete diagnostic codes that are not present on admission using poa(1-25)

In [27]:
dxs = ["dx{}".format(i) for i in range(1, 26)]
dxs.append('diag_adm')

In [28]:
print(len(dxs))

26


In [29]:
def dx_poa(df, a=1, b=26):
    for i in range(a, b):
        dx = "dx{}".format(i)
        poa = "poa{}".format(i)
        df[dx] = df[dx][df[poa]=="Y"]
    return df

In [30]:
df4 = dx_poa(df3)

In [31]:
drop_cols = ["poa{}".format(i) for i in range(1,26)]

In [32]:
df4 = df4.drop(drop_cols, axis=1)

In [33]:
df4.shape

(536923, 38)

In [34]:
df4.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,dx1,dx2,...,dx17,dx18,dx19,dx20,dx21,dx22,dx23,dx24,dx25,race
1005352,4,20.0,2.0,7214,3.0,10.0,2.0,1,,64821,...,,,,,,,,,,1.0
1005353,4,0.0,2.0,7214,3.0,10.0,4.0,S,,7746,...,,,,,,,,,,1.0
1005354,4,28.0,2.0,7214,3.0,10.0,2.0,1,64893.0,514,...,,,,,,,,,,1.0
1005355,4,28.0,2.0,7214,3.0,10.0,2.0,1,64863.0,4280,...,,,,,,,,,,1.0
1005356,4,26.0,2.0,7214,3.0,10.0,2.0,1,65813.0,64843,...,,,,,,,,,,1.0


## Change Datatypes

In [35]:
print(df4.dtypes)

pay_ub92    category
age          float16
sex          float16
provider    category
moa          float16
yoa          float64
admtype      float16
asource     category
dx1         category
dx2         category
dx3         category
dx4         category
dx5         category
dx6         category
dx7         category
tot          float64
dx8         category
dx9         category
dx10        category
dx11        category
diag_adm    category
campus      category
er_mode      float16
dx12        category
dx13        category
dx14        category
dx15        category
dx16        category
dx17        category
dx18        category
dx19        category
dx20        category
dx21        category
dx22        category
dx23        category
dx24        category
dx25        category
race         float16
dtype: object


In [36]:
df4.provider = df4.provider.astype('int16') - 7200

In [37]:
types = {'pay_ub92': 'int8', 
         'sex': 'int8', 
         'provider': 'int8',
         'moa': 'int8',
         'yoa': 'int8',
         'admtype': 'int8',
         'er_mode': 'int8',
         'race': 'int8'
        }

In [38]:
for t, _ in types.items():
    print(t, list(df4[t].unique()))

pay_ub92 ['4', '1', '6', '3', '10', '9', '7', '8', '2', '13', '12', '99', '11', '98', '5']
sex [2.0, 1.0, 9.0]
provider [14, 13, 10, 9, 1, 15, 2, 5, 6, 11, 4, 3, 16, 12]
moa [3.0, 2.0, 1.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
yoa [10.0, 11.0, 12.0, 13.0]
admtype [2.0, 4.0, 1.0, 3.0, 9.0]
er_mode [0.0, 1.0, 4.0, 3.0, 5.0, 9.0]
race [1.0, 3.0, 5.0, 9.0, 2.0, 6.0, 4.0, 0.0]


In [39]:
df4 = df4.astype(types)

In [40]:
df4.head()

Unnamed: 0,pay_ub92,age,sex,provider,moa,yoa,admtype,asource,dx1,dx2,...,dx17,dx18,dx19,dx20,dx21,dx22,dx23,dx24,dx25,race
1005352,4,20.0,2,14,3,10,2,1,,64821,...,,,,,,,,,,1
1005353,4,0.0,2,14,3,10,4,S,,7746,...,,,,,,,,,,1
1005354,4,28.0,2,14,3,10,2,1,64893.0,514,...,,,,,,,,,,1
1005355,4,28.0,2,14,3,10,2,1,64863.0,4280,...,,,,,,,,,,1
1005356,4,26.0,2,14,3,10,2,1,65813.0,64843,...,,,,,,,,,,1


## Save to CSV

In [41]:
df_sample = df4.sample(10000, random_state=0)

In [42]:
df_sample.to_csv("df_sample.csv")

In [43]:
df4.to_csv("df_cleansed.csv")