In [21]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("healthcare/train_data.csv")

In [3]:
data.head(2)

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data[["Stay"]] = data[["Stay"]].apply(le.fit_transform)

In [5]:
X = data.drop(["Stay"], axis=1)
y = data[["Stay"]]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Train Set Processing

In [8]:
import numpy as np
trainCat = X_train.select_dtypes(object)
trainNum = X_train.select_dtypes(np.number)

In [9]:
trainCat.isna().sum()

Hospital_type_code      0
Hospital_region_code    0
Department              0
Ward_Type               0
Ward_Facility_Code      0
Type of Admission       0
Severity of Illness     0
Age                     0
dtype: int64

In [10]:
trainNum.isna().sum()

case_id                                 0
Hospital_code                           0
City_Code_Hospital                      0
Available Extra Rooms in Hospital       0
Bed Grade                              91
patientid                               0
City_Code_Patient                    3672
Visitors with Patient                   0
Admission_Deposit                       0
dtype: int64

In [11]:
from sklearn.impute import SimpleImputer
simImpCat = SimpleImputer(strategy = "most_frequent")
simImpNum = SimpleImputer(strategy = "mean")

In [12]:
simImpCat.fit(trainCat)
simImpNum.fit(trainNum)

SimpleImputer()

In [13]:
trainCatClean = pd.DataFrame(simImpCat.transform(trainCat), columns=trainCat.columns)
trainNumClean = pd.DataFrame(simImpNum.transform(trainNum), columns=trainNum.columns)

## Label Encoding Categorical Data

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [15]:
trainCatEncoded = trainCatClean.apply(le.fit_transform)

## Standard Scaling Numerical Data

In [16]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [17]:
ss.fit(trainNumClean)

StandardScaler()

In [18]:
trainNumScaled = pd.DataFrame(ss.transform(trainNumClean), columns=trainNumClean.columns)

In [19]:
trainFinal = pd.concat([trainCatEncoded, trainNumScaled], axis=1)

In [20]:
trainFinal.head()

Unnamed: 0,Hospital_type_code,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Severity of Illness,Age,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
0,3,1,2,2,1,1,2,3,-0.760313,-2.00592,1.684197,-0.17095,-0.716382,0.916987,-0.053735,1.536807,1.509457
1,1,1,2,1,3,1,2,6,-0.265828,-0.847794,-0.893314,0.68442,-0.716382,1.19458,-0.053735,-0.727939,-0.811368
2,1,1,3,3,3,1,2,6,1.627413,0.889395,-0.893314,-0.17095,0.42949,1.299764,0.795359,0.404434,-1.347438
3,4,0,2,3,4,2,1,2,-1.195992,-0.963606,-1.215502,0.68442,1.575362,-1.041204,-1.327376,0.404434,-0.061973
4,1,1,2,1,3,0,0,3,-0.204638,0.889395,-0.893314,0.68442,-0.716382,-1.305704,0.158538,-0.727939,-1.423757


# Training Model

In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [37]:
y_train.value_counts()

Stay
2       69993
1       62511
3       44127
5       28014
0       18883
4        9394
7        8203
10       5347
8        3871
9        2212
6        2195
dtype: int64

In [38]:
weightDict = {0:2,1:1,2:1,3:1,4:3,5:2,6:4,7:3,8:4,9:4,10:3}

In [39]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(class_weight=weightDict)

In [23]:
lr.fit(trainFinal, y_train)

LogisticRegression()

In [40]:
dt.fit(trainFinal, y_train)

DecisionTreeClassifier(class_weight={0: 2, 1: 1, 2: 1, 3: 1, 4: 3, 5: 2, 6: 4,
                                     7: 3, 8: 4, 9: 4, 10: 3})

# Processing Test Data

In [24]:
testCat = X_test.select_dtypes(object)
testNum = X_test.select_dtypes(np.number)

In [25]:
testCatClean = pd.DataFrame(simImpCat.transform(testCat), columns=testCat.columns)
testNumClean = pd.DataFrame(simImpNum.transform(testNum), columns=testNum.columns)

## Label Encoding Categorical Data

In [26]:
testCatEncoded = testCatClean.apply(le.fit_transform)

## Standard Scaling Numerical Data

In [27]:
testNumScaled = pd.DataFrame(ss.transform(testNumClean), columns=testNumClean.columns)

In [28]:
testFinal = pd.concat([testCatEncoded, testNumScaled], axis=1)

# Predictions

In [29]:
predictions = lr.predict(testFinal)

In [41]:
predictionsDT = dt.predict(testFinal)

# Metrics

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [31]:
print("Accuracy is: \t", accuracy_score(predictions, y_test)*100, "%")
print("Precision is: \t", precision_score(predictions, y_test, average='macro')*100, "%")
print("Recall is: \t", recall_score(predictions, y_test, average='macro')*100, "%")
print("F1 is: \t", f1_score(predictions, y_test, average='macro')*100, "%")

Accuracy is: 	 37.62875266926265 %
Precision is: 	 18.675891353405127 %
Recall is: 	 21.401798751350285 %
F1 is: 	 17.846026374268632 %


In [42]:
print("Accuracy is: \t", accuracy_score(predictionsDT, y_test)*100, "%")
print("Precision is: \t", precision_score(predictionsDT, y_test, average='macro')*100, "%")
print("Recall is: \t", recall_score(predictionsDT, y_test, average='macro')*100, "%")
print("F1 is: \t", f1_score(predictionsDT, y_test, average='macro')*100, "%")

Accuracy is: 	 29.84235648787841 %
Precision is: 	 22.802074680186195 %
Recall is: 	 22.237408427091214 %
F1 is: 	 22.4933593838094 %
