In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the training CSV (comma-separated) from the relative datasets folder
# and preview the first rows.
healthData = pd.read_csv('./datasets/train_data.csv', sep=',')
healthData.head()

Unnamed: 0,ID,HealthServiceArea,Gender,Race,TypeOfAdmission,CCSProcedureCode,APRSeverityOfIllnessCode,PaymentTypology,BirthWeight,EmergencyDepartmentIndicator,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode,LengthOfStay
0,1,New York City,F,Other Race,Newborn,228,1,Medicaid,3700,N,2611,9227,1751,8951,45,1
1,2,New York City,M,Black/African American,Newborn,228,1,Medicaid,2900,N,3242,8966,3338,6409,34,1
2,3,New York City,M,Other Race,Newborn,220,1,Private Health Insurance,3200,N,3155,11381,4980,9323,45,1
3,4,New York City,F,Other Race,Newborn,0,1,Private Health Insurance,3300,N,3155,11381,5826,15680,59,1
4,5,New York City,F,Other Race,Newborn,228,1,Medicaid,2600,N,2611,9227,6000,14344,59,1


In [2]:
# Remove the ID column from the frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop() would raise KeyError.
# Trade-off: a missing 'ID' column in the input file is no longer reported.
healthData = healthData.drop(columns='ID', errors='ignore')
healthData.head()

Unnamed: 0,HealthServiceArea,Gender,Race,TypeOfAdmission,CCSProcedureCode,APRSeverityOfIllnessCode,PaymentTypology,BirthWeight,EmergencyDepartmentIndicator,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode,LengthOfStay
0,New York City,F,Other Race,Newborn,228,1,Medicaid,3700,N,2611,9227,1751,8951,45,1
1,New York City,M,Black/African American,Newborn,228,1,Medicaid,2900,N,3242,8966,3338,6409,34,1
2,New York City,M,Other Race,Newborn,220,1,Private Health Insurance,3200,N,3155,11381,4980,9323,45,1
3,New York City,F,Other Race,Newborn,0,1,Private Health Insurance,3300,N,3155,11381,5826,15680,59,1
4,New York City,F,Other Race,Newborn,228,1,Medicaid,2600,N,2611,9227,6000,14344,59,1


In [3]:

# Cast every object-dtype column to pandas' 'category' dtype in one astype
# call; columns of any other dtype are left untouched.
object_cols = [name for name, dtype in healthData.dtypes.items() if dtype == object]
healthData = healthData.astype(dict.fromkeys(object_cols, 'category'))

In [4]:
# Binarise LengthOfStay into two ordered classes using left-closed bins
# (right=False):
#   'class 0' -> [0, 4)
#   'class 1' -> [4, inf)
# The upper edge is open-ended so that values of 11 or more are labelled
# 'class 1'; with a finite upper edge of 11 they would silently become NaN.
# NOTE: this overwrites the numeric column with a categorical one, so the
# cell is meant to run once per data load.
healthData['LengthOfStay'] = pd.cut(
    healthData.LengthOfStay,
    bins=[0, 4, np.inf],
    right=False,
    labels=['class 0', 'class 1'],
)

In [5]:
# Print a per-column summary (non-null counts and dtypes) to check the
# result of the dtype conversions done in the earlier cells.
healthData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59966 entries, 0 to 59965
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   HealthServiceArea             59966 non-null  category
 1   Gender                        59966 non-null  category
 2   Race                          59966 non-null  category
 3   TypeOfAdmission               59966 non-null  category
 4   CCSProcedureCode              59966 non-null  int64   
 5   APRSeverityOfIllnessCode      59966 non-null  int64   
 6   PaymentTypology               59966 non-null  category
 7   BirthWeight                   59966 non-null  int64   
 8   EmergencyDepartmentIndicator  59966 non-null  category
 9   AverageCostInCounty           59966 non-null  int64   
 10  AverageChargesInCounty        59966 non-null  int64   
 11  AverageCostInFacility         59966 non-null  int64   
 12  AverageChargesInFacility      59966 non-null  

In [6]:
# Number of missing values in each column.
healthData.isna().sum()

HealthServiceArea               0
Gender                          0
Race                            0
TypeOfAdmission                 0
CCSProcedureCode                0
APRSeverityOfIllnessCode        0
PaymentTypology                 0
BirthWeight                     0
EmergencyDepartmentIndicator    0
AverageCostInCounty             0
AverageChargesInCounty          0
AverageCostInFacility           0
AverageChargesInFacility        0
AverageIncomeInZipCode          0
LengthOfStay                    0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
healthData.describe()

Unnamed: 0,CCSProcedureCode,APRSeverityOfIllnessCode,BirthWeight,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode
count,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0
mean,155.404229,1.254594,3336.298903,2372.80669,7979.126922,2396.414318,7958.472668,59.811143
std,89.541978,0.546207,446.244475,639.755096,3220.291347,1248.501189,3859.294711,21.47017
min,-1.0,1.0,2500.0,712.0,1243.0,457.0,1120.0,28.0
25%,115.0,1.0,3000.0,2041.0,4620.0,1551.0,4438.0,45.0
50%,220.0,1.0,3300.0,2533.0,9227.0,1967.0,7945.0,55.0
75%,228.0,1.0,3600.0,2785.0,10644.0,2895.0,11619.0,74.0
max,231.0,4.0,7500.0,3242.0,11381.0,8114.0,18466.0,115.0


In [11]:
# Rows per LengthOfStay class, listed in class order rather than by frequency.
class_counts = healthData['LengthOfStay'].value_counts()
class_counts.sort_index()

class 0    49895
class 1    10071
Name: LengthOfStay, dtype: int64

In [8]:
# Separate the predictors from the label column.
attributes = healthData.drop('LengthOfStay', axis=1)  # every column except the label
target = healthData.LengthOfStay                      # the label column itself

In [9]:
# One-hot encode the categorical predictors; numeric columns pass through
# unchanged. Preview the first five rows of the encoded frame.
attributesNum = pd.get_dummies(data=attributes)
attributesNum.head(5)

Unnamed: 0,CCSProcedureCode,APRSeverityOfIllnessCode,BirthWeight,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode,HealthServiceArea_Capital/Adirond,HealthServiceArea_Central NY,...,PaymentTypology_Federal/State/Local/VA,"PaymentTypology_Managed Care, Unspecified",PaymentTypology_Medicaid,PaymentTypology_Medicare,PaymentTypology_Miscellaneous/Other,PaymentTypology_Private Health Insurance,PaymentTypology_Self-Pay,PaymentTypology_Unknown,EmergencyDepartmentIndicator_N,EmergencyDepartmentIndicator_Y
0,228,1,3700,2611,9227,1751,8951,45,0,0,...,0,0,1,0,0,0,0,0,1,0
1,228,1,2900,3242,8966,3338,6409,34,0,0,...,0,0,1,0,0,0,0,0,1,0
2,220,1,3200,3155,11381,4980,9323,45,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,1,3300,3155,11381,5826,15680,59,0,0,...,0,0,0,0,0,1,0,0,1,0
4,228,1,2600,2611,9227,6000,14344,59,0,0,...,0,0,1,0,0,0,0,0,1,0


In [10]:
from sklearn import preprocessing

# Encode the class labels as integers: fit the encoder on the target and
# transform it in a single step.
le = preprocessing.LabelEncoder()
dataY = le.fit_transform(target)

# Recover the original label text for the integer codes 0 and 1.
class_labels = le.inverse_transform([0, 1])

print(target)
print(class_labels)

0        class 0
1        class 0
2        class 0
3        class 0
4        class 0
          ...   
59961    class 1
59962    class 1
59963    class 1
59964    class 1
59965    class 1
Name: LengthOfStay, Length: 59966, dtype: category
Categories (2, object): ['class 0' < 'class 1']
['class 0' 'class 1']
