## Hospital Stay Length Prediction

Given *medical data about different countries from 1990-2018*, let's try to predict the **average length of a hospital stay** for a given country and year.

We will use various regression models to make our predictions.

Data source: https://www.kaggle.com/datasets/babyoda/healthcare-investments-and-length-of-hospital-stay

### Importing Libraries

In [9]:
import numpy as np
import pandas as pd
# Show every column when a wide DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [10]:
# Load the raw dataset; the path is relative, so the CSV must sit in the working directory.
# NOTE(review): the " (1)" suffix looks like a browser re-download artefact — confirm the file name.
data = pd.read_csv('Healthcare_Investments_and_Hospital_Stay (1).csv')
# Last expression of the cell: renders the frame.
data

Unnamed: 0,Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
0,AUS,1992,6.6,1.43,16.71,1.43
1,AUS,1994,6.4,2.36,18.48,2.36
2,AUS,1995,6.5,2.89,20.55,2.89
3,AUS,1996,6.4,2.96,21.95,2.96
4,AUS,1997,6.2,3.53,23.34,3.53
...,...,...,...,...,...,...
513,LTU,2014,6.8,10.57,22.17,10.57
514,LTU,2015,6.6,11.02,21.00,11.02
515,LTU,2016,6.6,12.20,23.01,12.20
516,LTU,2017,6.5,12.37,23.33,12.37


In [11]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       518 non-null    object 
 1   Time           518 non-null    int64  
 2   Hospital_Stay  518 non-null    float64
 3   MRI_Units      518 non-null    float64
 4   CT_Scanners    518 non-null    float64
 5   Hospital_Beds  518 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 24.4+ KB


### Preprocessing

In [18]:
# Work on a copy so the raw `data` frame stays untouched by preprocessing.
df = data.copy()

In [19]:
# Display the working copy before any transformation.
df

Unnamed: 0,Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
0,AUS,1992,6.6,1.43,16.71,1.43
1,AUS,1994,6.4,2.36,18.48,2.36
2,AUS,1995,6.5,2.89,20.55,2.89
3,AUS,1996,6.4,2.96,21.95,2.96
4,AUS,1997,6.2,3.53,23.34,3.53
...,...,...,...,...,...,...
513,LTU,2014,6.8,10.57,22.17,10.57
514,LTU,2015,6.6,11.02,21.00,11.02
515,LTU,2016,6.6,12.20,23.01,12.20
516,LTU,2017,6.5,12.37,23.33,12.37


In [20]:
# List the distinct values of the Location column (the only object-dtype column).
df['Location'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'GRC', 'HUN', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'NLD', 'NZL',
       'POL', 'PRT', 'SVK', 'ESP', 'TUR', 'GBR', 'USA', 'EST', 'ISR',
       'RUS', 'SVN', 'ISL', 'LVA', 'LTU'], dtype=object)

In [21]:
# Preview the 0/1 indicator columns get_dummies builds for Location; the result is not stored.
pd.get_dummies(df['Location'], dtype=int)

Unnamed: 0,AUS,AUT,BEL,CAN,CZE,DEU,DNK,ESP,EST,FIN,FRA,GBR,GRC,HUN,IRL,ISL,ISR,ITA,JPN,KOR,LTU,LUX,LVA,NLD,NZL,POL,PRT,RUS,SVK,SVN,TUR,USA
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
514,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
515,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
516,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [22]:
def onehot_encode(df, column, prefix=None):
    """Return a new frame in which ``column`` is replaced by one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the categorical column to expand.
    prefix : str, optional
        Forwarded to ``pd.get_dummies``. With the default ``None`` the indicator
        columns are named after the category values alone. Pass a prefix
        (e.g. ``'Location'``) to get ``Location_AUS``-style names and avoid
        clashes between category values and existing column names.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` in their original order, followed by
        one integer 0/1 column per distinct value of ``column``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    dummies = pd.get_dummies(df[column], dtype=int, prefix=prefix)
    # Drop the source column first so only that column is removed, then append
    # the indicators; both drop() and concat() return new objects, so the
    # caller's frame is left untouched without an extra full copy.
    return pd.concat([df.drop(columns=column), dummies], axis=1)

In [23]:
# One hot encode Location column
df = onehot_encode(df, 'Location')
df

Unnamed: 0,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds,AUS,AUT,BEL,CAN,CZE,DEU,DNK,ESP,EST,FIN,FRA,GBR,GRC,HUN,IRL,ISL,ISR,ITA,JPN,KOR,LTU,LUX,LVA,NLD,NZL,POL,PRT,RUS,SVK,SVN,TUR,USA
0,1992,6.6,1.43,16.71,1.43,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1994,6.4,2.36,18.48,2.36,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1995,6.5,2.89,20.55,2.89,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1996,6.4,2.96,21.95,2.96,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1997,6.2,3.53,23.34,3.53,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,2014,6.8,10.57,22.17,10.57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
514,2015,6.6,11.02,21.00,11.02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
515,2016,6.6,12.20,23.01,12.20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
516,2017,6.5,12.37,23.33,12.37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Split df into X and y
y = df['Hospital_Stay'].copy()
X = df.drop('Hospital_Stay', axis=1).copy()

In [26]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=123)

In [27]:
# Sanity-check the split sizes: (rows, feature columns) for train and test.
X_train.shape, X_test.shape

((362, 36), (156, 36))

In [28]:
# Scale with a Standard scaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [29]:
# Inspect the scaled training features.
X_train

Unnamed: 0,Time,MRI_Units,CT_Scanners,Hospital_Beds,AUS,AUT,BEL,CAN,CZE,DEU,DNK,ESP,EST,FIN,FRA,GBR,GRC,HUN,IRL,ISL,ISR,ITA,JPN,KOR,LTU,LUX,LVA,NLD,NZL,POL,PRT,RUS,SVK,SVN,TUR,USA
121,-0.303643,0.502340,-0.349986,0.502340,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,3.931227,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
378,-0.738679,-0.697320,-0.873325,-0.697320,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,6.652067,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
91,0.131392,-0.562147,-0.392384,-0.562147,-0.207913,-0.207913,-0.177028,-0.215041,4.809712,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
310,0.276404,-0.018076,-0.295665,-0.018076,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,6.262765,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
479,1.001462,1.174825,1.272365,1.174825,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,5.932959,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,1.146474,-0.248996,-0.302952,-0.248996,-0.207913,-0.207913,-0.177028,-0.215041,4.809712,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
322,0.276404,-0.678170,-0.378472,-0.678170,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,5.648813,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
382,-0.013620,-0.589181,-0.850140,-0.589181,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,6.652067,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,-0.193001,-0.140422
365,-0.013620,-0.317709,-0.623580,-0.317709,-0.207913,-0.207913,-0.177028,-0.215041,-0.207913,-0.16855,-0.074536,-0.150329,-0.159674,-0.254374,-0.207913,-0.150329,-0.118345,-0.215041,-0.159674,-0.168550,-0.193001,-0.235358,-0.140422,-0.193001,-0.221981,-0.177028,-0.193001,-0.177028,-0.159674,-0.177028,-0.052632,-0.193001,-0.177028,-0.140422,5.181327,-0.140422


In [30]:
# Column means of the scaled training set; expected to be zero up to floating-point error.
X_train.mean()

Time            -1.147517e-14
MRI_Units        3.434944e-17
CT_Scanners      2.453532e-17
Hospital_Beds    3.434944e-17
AUS              9.752788e-17
AUT              3.925650e-17
BEL              2.453532e-17
CAN             -3.189591e-17
CZE              2.698885e-17
DEU             -6.379182e-17
DNK              4.907063e-18
ESP             -2.944238e-17
EST              5.397769e-17
FIN              1.055019e-16
FRA              3.925650e-17
GBR             -1.472119e-17
GRC              4.171004e-17
HUN              6.992565e-17
IRL              3.434944e-17
ISL             -5.643123e-17
ISR              5.397769e-17
ITA              5.888476e-17
JPN              5.397769e-17
KOR              5.888476e-17
LTU             -1.226766e-17
LUX              9.814126e-18
LVA              6.011152e-17
NLD              3.925650e-17
NZL              8.587360e-18
POL              1.962825e-17
PRT             -1.962825e-17
RUS              4.907063e-17
SVK              2.944238e-17
SVN       

In [31]:
# pandas' .var() uses the sample estimator (ddof=1) while StandardScaler divides by the
# population standard deviation (ddof=0), so each column reads n/(n-1) = 362/361 ≈ 1.00277
# instead of exactly 1.
X_train.var()

Time             1.00277
MRI_Units        1.00277
CT_Scanners      1.00277
Hospital_Beds    1.00277
AUS              1.00277
AUT              1.00277
BEL              1.00277
CAN              1.00277
CZE              1.00277
DEU              1.00277
DNK              1.00277
ESP              1.00277
EST              1.00277
FIN              1.00277
FRA              1.00277
GBR              1.00277
GRC              1.00277
HUN              1.00277
IRL              1.00277
ISL              1.00277
ISR              1.00277
ITA              1.00277
JPN              1.00277
KOR              1.00277
LTU              1.00277
LUX              1.00277
LVA              1.00277
NLD              1.00277
NZL              1.00277
POL              1.00277
PRT              1.00277
RUS              1.00277
SVK              1.00277
SVN              1.00277
TUR              1.00277
USA              1.00277
dtype: float64

### Training

In [36]:
# Candidate regressors, all with library-default hyperparameters.
# Every estimator that draws random numbers is seeded so that Restart & Run All
# reproduces the same scores; 123 matches the seed used for the train/test split.
SEED = 123

# Keys are left-padded to a common width so the progress and score printouts line up.
models = {
    "                     Linear Regression": LinearRegression(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "                         Random Forest": RandomForestRegressor(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=SEED),
    "                               XGBoost": XGBRegressor(random_state=SEED),
    # NOTE(review): LightGBM scored far below the other tree ensembles in the recorded
    # run; presumably its defaults are too coarse for a training set this small —
    # TODO confirm and tune before drawing conclusions.
    "                              LightGBM": LGBMRegressor(verbose=0, random_state=SEED),
    "                              CatBoost": CatBoostRegressor(verbose=0, random_seed=SEED)
}

In [37]:
# Fit every candidate on the scaled training split, reporting progress as each one finishes.
for label in models:
    models[label].fit(X_train, y_train)
    print(f"{label} trained.")

                     Linear Regression trained.
                   K-Nearest Neighbors trained.




                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


### Results

In [38]:
# Report each fitted model's R^2 on the held-out test split.
for label, estimator in models.items():
    r2 = estimator.score(X_test, y_test)
    print(f"{label} R^2 Score: {r2:.5f}")

                     Linear Regression R^2 Score: 0.85204
                   K-Nearest Neighbors R^2 Score: 0.90746
                        Neural Network R^2 Score: 0.93006
Support Vector Machine (Linear Kernel) R^2 Score: 0.84239
   Support Vector Machine (RBF Kernel) R^2 Score: 0.87885
                         Decision Tree R^2 Score: 0.93313
                         Random Forest R^2 Score: 0.95626
                     Gradient Boosting R^2 Score: 0.93159
                               XGBoost R^2 Score: 0.97491
                              LightGBM R^2 Score: 0.30192
                              CatBoost R^2 Score: 0.96683
