# Data Cleaning

This notebook cleans the data to prepare it for analysis. The cleaned data is then saved to df_clean.csv.

We will do the following:

1. Remove variables (columns) that have more than 70% NaN values.

2. Handle categorical variables with too many categories (pt_state, ecodub92 and diag_adm): we will find a reasonable way to group these categories into fewer categories (ecodub92 is simply dropped).

3. Save the data to df_clean.csv
 

In [1]:
import pandas as pd

In [2]:
# Load the raw extract; the cleaning cells that follow all operate on this frame.
# NOTE(review): the truncated output under this cell looks like the tail of a
# pandas warning (possibly a DtypeWarning about mixed-type columns) -- confirm,
# and consider passing an explicit dtype= mapping if so.
df = pd.read_csv(filepath_or_buffer='df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### 1. Remove variables 

We remove variables that have more than 70% missing values.

In [3]:
# Share of rows that must be missing before a column is discarded.
DROP_LIMIT = 0.7

# isna().mean() gives missing / total rows for every column. The previous
# version divided the missing count by count(), i.e. by the NON-missing rows;
# that ratio already exceeds 0.7 at roughly 41% missing, so it removed columns
# well below the stated 70% rule. Columns dropped on a fresh run may therefore
# differ from the outputs recorded below.
missing_fraction = df.isna().mean()
cols_to_drop = missing_fraction[missing_fraction > DROP_LIMIT].index

# Drop everything in one call instead of mutating df inside a loop.
df = df.drop(columns=cols_to_drop)

In [4]:
# Columns checked for presence after the missing-value filter.
orig_col = [
    'yod', 'age', 'sex', 'b_wt', 'ethnic', 'pt_state', 'race', 'raceethn',
    'campus', 'er_mode', 'admtype', 'payer', 'yoa', 'pay_ub92', 'provider',
    'asource', 'ecodepoa', 'moa', 'service', 'ecodub92', 'diag_adm', 'los',
]

# Print, one per line, every listed column that is no longer in df.
removed_cols = [col for col in orig_col if col not in df.columns]
for col in removed_cols:
    print(col)

race
er_mode
ecodepoa


#### Removed columns:
* race
* er_mode
* ecodepoa

### 2. Handle pt_state

In [5]:
# List the distinct raw pt_state codes before recoding them.
pt_state_values = df['pt_state'].unique()
pt_state_values

array([nan, "b'00'", "b'RI'", "b'CA'", "b'01'", "b'08'", "b'TX'", "b'CT'",
       "b'NH'", "b'02'", "b'NC'", "b'DE'", "b'YY'", "b'MA'", "b'NY'",
       "b'05'", "b'25'", "b'26'", "b'20'", "b'31'", "b'FL'", "b'11'",
       "b'06'", "b'18'", "b'PA'", "b'03'", "b'16'", "b'IL'", "b'10'",
       "b'13'", "b'21'", "b'07'", "b'09'", "b'LA'", "b'17'", "b'12'",
       "b'NE'", "b'ME'", "b'04'", "b'22'", "b'NJ'", "b'15'", "b'19'",
       "b'34'", "b'14'", "b'MI'", "b'MD'", "b'OR'", "b'NV'", "b'27'",
       "b'OH'", "b'GA'", "b'AK'", "b'53'", "b'E8'", "b'MO'", "b'IN'",
       "b'TN'", "b'VT'", "b'CO'", "b'AZ'", "b'VA'", "b'WI'", "b'KS'",
       "b'-2'", "b'SC'", "b'DC'", "b'30'", "b'V4'", "b'MN'", "b'UK'",
       "b'OK'", "b'V1'", "b'24'", "b'44'", "b'36'", "b'40'", "b'33'",
       "b'61'", "b'80'", "b'45'", "b'73'", "b'AL'", "b'HI'", "b'0-'",
       "b'XX'", "b'KY'", "b'E9'", "b'78'", "b'42'", "b'69'", "b'V0'",
       "b'74'", "b'23'", "b'-3'", "b'WV'", "b'VI'", "b'AR'", "b'UT'",
       "b'28'",

In [6]:
from collections import defaultdict

In [7]:
# Lookup used to recode pt_state: the three codes listed here keep their own
# label, and any other key falls back to 'Other' through the default factory.
map_dict = defaultdict(
    lambda: 'Other',
    {
        "b'RI'": 'RI',
        "b'CT'": 'CT',
        "b'MA'": 'MA',
    },
)

In [8]:
# Recode through map_dict; na_action='ignore' leaves NaN untouched so that the
# missing entries can be labelled 'Unknown' in the second step.
pt_state_recoded = df['pt_state'].map(map_dict, na_action='ignore')
df['pt_state'] = pt_state_recoded.fillna('Unknown')

In [9]:
# Check which labels remain in pt_state after the recode.
pt_state_labels = df['pt_state'].unique()
pt_state_labels

array(['Unknown', 'Other', 'RI', 'CT', 'MA'], dtype=object)

### 3. Handle ecodub92


In [10]:
# ecodub92 is removed outright rather than regrouped.
df = df.drop(columns=['ecodub92'])

### 4. Handle diag_adm

In [11]:
def _diag_category(code):
    """Return 'Unknown' unchanged, otherwise characters 2-4 of the code."""
    # The slice starts at index 2, not 0 -- presumably to skip a leading b'
    # wrapper like the one visible in the pt_state values; confirm.
    if code == 'Unknown':
        return code
    return code[2:5]


# Missing diagnoses get the explicit label 'Unknown'.
diag_filled = df['diag_adm'].fillna('Unknown')

# Reduce every diagnosis code to its three-character category.
diag_category = diag_filled.map(_diag_category)

# Categories that occur fewer than 5 times are pooled into 'Others'.
category_counts = diag_category.value_counts()
rare_categories = set(category_counts[category_counts < 5].index)
df['diag_adm'] = diag_category.map(
    lambda cat: 'Others' if cat in rare_categories else cat
)

In [70]:
def f(x):
    """Summarise one column: distinct values, missing count/share, and dtype.

    Parameters
    ----------
    x : pandas.Series
        Column to profile.

    Returns
    -------
    pandas.Series
        Object-dtype Series indexed by 'Unique Values', 'Missing',
        'Missing Percentage' and 'Type', in that order.
    """
    missing_values = x.isna().sum()
    n_rows = len(x)
    return pd.Series(
        {
            # unique() keeps NaN as one distinct value, so it is counted here.
            'Unique Values': len(x.unique()),
            'Missing': missing_values,
            # Stored as a fraction in [0, 1]; NaN for a zero-length column
            # instead of dividing by zero.
            'Missing Percentage': missing_values / n_rows if n_rows else float('nan'),
            'Type': x.dtype,
        },
        # A bare pd.Series() has a version-dependent default dtype; fixing it
        # to object keeps the stored value types stable across pandas versions.
        dtype=object,
    )

In [71]:
# Apply f to every column (apply's default axis), then flip the result so
# each row describes one column of df.
u = df.apply(f)
u.T

  


Unnamed: 0,Unique Values,Missing,Missing Percentage,Type
yod,16,0,0.0,float64
age,120,25,1.13046e-05,float64
sex,4,3,1.35655e-06,object
b_wt,4847,590017,0.266796,object
ethnic,6,285208,0.128966,float64
pt_state,5,0,0.0,object
raceethn,10,146005,0.0660211,object
campus,34,903208,0.408416,object
admtype,7,75,3.39138e-05,object
payer,18,0,0.0,object


### 5. Save the data

In [12]:
# Write the cleaned frame to disk without the row index.
df.to_csv(path_or_buf='df_clean.csv', index=False)