## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy import stats
import yaml, time, sys, os, glob

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

# --- Notebook-wide configuration ---------------------------------------
DATASET = "Texas_Inpatient_Discharge"
SPLIT_TRAINING = True
DEBUG = False
SEED = 42  # passed as random_state to the train/test split

# True when the google.colab module is loaded in this kernel.
COLAB = 'google.colab' in sys.modules

# Project root: a per-dataset folder on the mounted Drive under Colab,
# otherwise the current working directory.
ROOT = (
    f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
    if COLAB
    else "./"
)

In [None]:
if COLAB:
    # Imported here rather than at the top: this import is only attempted
    # when the notebook runs on Colab.
    from google.colab import drive

    # Mount Google Drive once; skip when the mount point is already present.
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

    # Create the dataset folder and any missing parents (this includes the
    # shared "datasets" directory). exist_ok=True makes the call idempotent
    # and avoids the check-then-create race of a separate isdir() test.
    os.makedirs(ROOT, exist_ok=True)

## Dataset

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
# os.path.join is used because ROOT already ends with "/", so the previous
# f"{ROOT}/data/..." form produced a doubled path separator.
df = pd.read_pickle(os.path.join(ROOT, "data", "df_train_sample_00_of_1_clean.pkl"))
print(df.shape)
df.head()

(99969, 195)


Unnamed: 0,RECORD_ID,DISCHARGE,THCIC_ID,PROVIDER_NAME,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,SPEC_UNIT_1,SPEC_UNIT_2,SPEC_UNIT_3,SPEC_UNIT_4,SPEC_UNIT_5,PAT_STATE,PAT_ZIP,PAT_COUNTRY,COUNTY,PUBLIC_HEALTH_REGION,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,LENGTH_OF_STAY,PAT_AGE,FIRST_PAYMENT_SRC,SECONDARY_PAYMENT_SRC,TYPE_OF_BILL,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL,POA_PROVIDER_INDICATOR,ADMITTING_DIAGNOSIS,PRINC_DIAG_CODE,POA_PRINC_DIAG_CODE,OTH_DIAG_CODE_1,POA_OTH_DIAG_CODE_1,OTH_DIAG_CODE_2,POA_OTH_DIAG_CODE_2,OTH_DIAG_CODE_3,POA_OTH_DIAG_CODE_3,OTH_DIAG_CODE_4,POA_OTH_DIAG_CODE_4,OTH_DIAG_CODE_5,POA_OTH_DIAG_CODE_5,OTH_DIAG_CODE_6,POA_OTH_DIAG_CODE_6,OTH_DIAG_CODE_7,POA_OTH_DIAG_CODE_7,OTH_DIAG_CODE_8,POA_OTH_DIAG_CODE_8,OTH_DIAG_CODE_9,POA_OTH_DIAG_CODE_9,OTH_DIAG_CODE_10,POA_OTH_DIAG_CODE_10,OTH_DIAG_CODE_11,POA_OTH_DIAG_CODE_11,OTH_DIAG_CODE_12,POA_OTH_DIAG_CODE_12,OTH_DIAG_CODE_13,POA_OTH_DIAG_CODE_13,OTH_DIAG_CODE_14,POA_OTH_DIAG_CODE_14,OTH_DIAG_CODE_15,POA_OTH_DIAG_CODE_15,OTH_DIAG_CODE_16,POA_OTH_DIAG_CODE_16,OTH_DIAG_CODE_17,POA_OTH_DIAG_CODE_17,OTH_DIAG_CODE_18,POA_OTH_DIAG_CODE_18,OTH_DIAG_CODE_19,POA_OTH_DIAG_CODE_19,OTH_DIAG_CODE_20,POA_OTH_DIAG_CODE_20,OTH_DIAG_CODE_21,POA_OTH_DIAG_CODE_21,OTH_DIAG_CODE_22,POA_OTH_DIAG_CODE_22,OTH_DIAG_CODE_23,POA_OTH_DIAG_CODE_23,OTH_DIAG_CODE_24,POA_OTH_DIAG_CODE_24,E_CODE_1,POA_E_CODE_1,E_CODE_2,POA_E_CODE_2,E_CODE_3,POA_E_CODE_3,E_CODE_4,POA_E_CODE_4,E_CODE_5,POA_E_CODE_5,E_CODE_6,POA_E_CODE_6,E_CODE_7,POA_E_CODE_7,E_CODE_8,POA_E_CODE_8,E_CODE_9,POA_E_CODE_9,E_CODE_10,POA_E_CODE_10,PRINC_SURG_PROC_CODE,PRINC_SURG_PROC_DAY,PRINC_ICD9_CODE,OTH_SURG_PROC_CODE_1,OTH_SURG_PROC_DAY_1,OTH_ICD9_CODE_1,OTH_SURG_PROC_CODE_2,OTH_SURG_PROC_DAY_2,OTH_ICD9_CODE_2,OTH_SURG_PROC_CODE_3,OTH_SURG_PROC_DAY_3,OTH_ICD9_CODE_3,OTH_SURG_PROC_CODE_4,OTH_SURG_PROC_DAY_4,OTH_ICD9_CODE_4,OTH_SURG_PROC_CODE_5,OTH_SURG_PROC_DAY_5,OTH_ICD9_C
ODE_5,OTH_SURG_PROC_CODE_6,OTH_SURG_PROC_DAY_6,OTH_ICD9_CODE_6,OTH_SURG_PROC_CODE_7,OTH_SURG_PROC_DAY_7,OTH_ICD9_CODE_7,OTH_SURG_PROC_CODE_8,OTH_SURG_PROC_DAY_8,OTH_ICD9_CODE_8,OTH_SURG_PROC_CODE_9,OTH_SURG_PROC_DAY_9,OTH_ICD9_CODE_9,OTH_SURG_PROC_CODE_10,OTH_SURG_PROC_DAY_10,OTH_ICD9_CODE_10,OTH_SURG_PROC_CODE_11,OTH_SURG_PROC_DAY_11,OTH_ICD9_CODE_11,OTH_SURG_PROC_CODE_12,OTH_SURG_PROC_DAY_12,OTH_ICD9_CODE_12,OTH_SURG_PROC_CODE_13,OTH_SURG_PROC_DAY_13,OTH_ICD9_CODE_13,OTH_SURG_PROC_CODE_14,OTH_SURG_PROC_DAY_14,OTH_ICD9_CODE_14,OTH_SURG_PROC_CODE_15,OTH_SURG_PROC_DAY_15,OTH_ICD9_CODE_15,OTH_SURG_PROC_CODE_16,OTH_SURG_PROC_DAY_16,OTH_ICD9_CODE_16,OTH_SURG_PROC_CODE_17,OTH_SURG_PROC_DAY_17,OTH_ICD9_CODE_17,OTH_SURG_PROC_CODE_18,OTH_SURG_PROC_DAY_18,OTH_ICD9_CODE_18,OTH_SURG_PROC_CODE_19,OTH_SURG_PROC_DAY_19,OTH_ICD9_CODE_19,OTH_SURG_PROC_CODE_20,OTH_SURG_PROC_DAY_20,OTH_ICD9_CODE_20,OTH_SURG_PROC_CODE_21,OTH_SURG_PROC_DAY_21,OTH_ICD9_CODE_21,OTH_SURG_PROC_CODE_22,OTH_SURG_PROC_DAY_22,OTH_ICD9_CODE_22,OTH_SURG_PROC_CODE_23,OTH_SURG_PROC_DAY_23,OTH_ICD9_CODE_23,OTH_SURG_PROC_CODE_24,OTH_SURG_PROC_DAY_24,OTH_ICD9_CODE_24,MS_MDC,MS_DRG,MS_GROUPER_VERSION_NBR,MS_GROUPER_ERROR_CODE,APR_MDC,APR_DRG,RISK_MORTALITY,ILLNESS_SEVERITY,APR_GROUPER_VERSION_NBR,APR_GROUPER_ERROR_CODE,ATTENDING_PHYSICIAN_UNIF_ID,OPERATING_PHYSICIAN_UNIF_ID,ENCOUNTER_INDICATOR,CERT_STATUS,FILLER_SPACE,TARGET
884115,320136748870,2013Q3,838400,Memorial Hermann Rehab Hospital Katy,Elective,Transfer from a hospital,R,,,,,Texas,77095,US,201,6,7,Female,White,Not of Hispanic,2,1,20,MA,MB,111,1671.0,0.0,1145.0,0.0,526.0,0.0,X,V5789,V5789,,1919.0,Y,5119.0,Y,V8543,,78459.0,Y,27801.0,Y,7812.0,Y,5180.0,Y,34590.0,Y,4019,Y,7993,Y,2724.0,Y,72887.0,Y,36250,Y,78093.0,Y,36901.0,Y,V5878,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,945,1300,0,23,860,2,3,7300,0,1229763162,,1,1,,short
33602,120130546450,2013Q1,409000,John Peter Smith Hospital,Emergency,Non-healthcare Facility,I,,,,,Texas,76008,US,367,3,1,Male,Other,Hispanic Origin,2,2,13,MA,,111,53064.01,0.0,4092.0,0.0,48972.01,0.0,M,78650,41401,Y,42822.0,Y,4280.0,Y,496,Y,4139.0,Y,4019.0,Y,2720.0,Y,25000.0,Y,4148.0,Y,V4502,,V4581,,311.0,Y,30002.0,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3722.0,1.0,3722.0,66.0,1.0,66.0,3607.0,1.0,3607.0,8856.0,1.0,8856.0,8853.0,1.0,8853.0,45.0,1.0,45.0,40.0,1.0,40.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,247,1300,0,5,175,2,2,7300,0,1578252829,1578252829.0,1,2,,short
31627,420135644393,2013Q4,286000,Mother Frances Hospital,Emergency,Non-healthcare Facility,,,,,,Texas,75701,US,423,4,1,Male,Other,Hispanic Origin,5,2,15,BL,,111,17545.0,0.0,1922.0,0.0,15623.0,0.0,R,486,486,Y,4019.0,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,195,1300,0,4,139,1,1,7300,0,2348600662,,1,1,,short
112206,120137773390,2013Q1,76000,Tomball Regional Medical Center,Emergency,Non-healthcare Facility,I,,,,,Texas,77429,US,201,6,1,Female,White,Not of Hispanic,1,4,17,13,ZZ,111,36668.56,0.0,8940.0,0.0,27728.56,0.0,M,7802,7802,Y,5849.0,Y,25080.0,Y,2761,Y,42731.0,Y,2768.0,Y,42789.0,Y,4019.0,Y,2809.0,Y,5589,Y,2724,Y,42611.0,Y,49390.0,Y,V1588,,7802.0,,,,,,,,,,,,,,,,,,,,,,E9426,Y,E9413,Y,E9420,Y,E9443,Y,E9444,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,312,1300,0,5,204,3,3,7300,0,9999999998,,1,6,,medium
71288,420131237653,2013Q4,725400,Mother Frances Hospital-Jacksonville,Emergency,Non-healthcare Facility,,,,,,Texas,75766,US,73,4,1,Female,Other,Hispanic Origin,6,1,4,ZZ,,111,38984.47,0.0,923.0,0.0,38061.47,0.0,X,5409,5409,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4701.0,0.0,4701.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,343,1300,0,6,225,1,1,7300,0,9999999998,9999999998.0,1,1,,short


## Model Building

In [None]:
# Label column to predict.
target = "TARGET"

# Predictor columns grouped by kind; the numeric group is currently empty.
cat_features = [
    "TYPE_OF_ADMISSION",
    "SOURCE_OF_ADMISSION",
    "PAT_STATE",
    "RACE",
]
num_features = []

# Full predictor list: categorical columns first, then numeric ones.
features = [*cat_features, *num_features]

# Echo the three lists, one per line.
for column_group in (cat_features, num_features, features):
    print(column_group)

['TYPE_OF_ADMISSION', 'SOURCE_OF_ADMISSION', 'PAT_STATE', 'RACE']
[]
['TYPE_OF_ADMISSION', 'SOURCE_OF_ADMISSION', 'PAT_STATE', 'RACE']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import chi2, SelectPercentile

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score, recall_score

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# one-hot encode (categories not seen during fit are ignored at transform
# time), then keep the top 80% of encoded columns ranked by chi-squared score.
cat_preprocessor = Pipeline(steps=[
    ('imput', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
    ('select', SelectPercentile(chi2, percentile=80)),
])

# Numeric branch: mean-impute missing values, then standardize.
num_preprocessor = Pipeline(steps=[
    ('imput', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch.
# NOTE(review): num_features is an empty list in this notebook, so the 'num'
# branch presumably receives no columns — confirm this is intended.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_preprocessor, cat_features),
    ('num', num_preprocessor, num_features),
])

# End-to-end estimator: preprocessing followed by a random forest.
model = Pipeline(steps=[
    ('pre', preprocessor),
    # Seed the forest with the notebook-wide SEED so repeated runs build the
    # same model; without it the reported metrics vary from run to run.
    ('clf', RandomForestClassifier(random_state=SEED)),
])

In [None]:
# Separate the predictor columns from the label column.
x, y = df[features], df[target]

In [None]:
# Hold out 40% of the rows for testing; the split is stratified on the label
# and seeded with SEED.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.4,
    random_state=SEED,
)
x_train.shape, x_test.shape

((59981, 4), (39988, 4))

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X=x_test)

In [None]:
# Per-class precision / recall / F1 on the held-out rows, to 4 decimals.
report = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

        long     0.6183    0.2321    0.3375      8017
      medium     0.4332    0.6508    0.5201     15319
       short     0.5615    0.4709    0.5122     16652

    accuracy                         0.4919     39988
   macro avg     0.5377    0.4513    0.4566     39988
weighted avg     0.5237    0.4919    0.4802     39988



In [43]:
# NOTE(review): the recorded output of this cell is an AttributeError, so
# my_lib presumably does not expose make_assignment in the environment where
# this was last run — confirm the helper exists before relying on this cell.
import my_lib as my
my.make_assignment()

AttributeError: ignored