In [22]:
import pandas as pd
import numpy as np 
import plotly_express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('Healthcare_Investments_and_Hospital_Stay (1).csv')
data

Unnamed: 0,Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
0,AUS,1992,6.6,1.43,16.71,1.43
1,AUS,1994,6.4,2.36,18.48,2.36
2,AUS,1995,6.5,2.89,20.55,2.89
3,AUS,1996,6.4,2.96,21.95,2.96
4,AUS,1997,6.2,3.53,23.34,3.53
...,...,...,...,...,...,...
513,LTU,2014,6.8,10.57,22.17,10.57
514,LTU,2015,6.6,11.02,21.00,11.02
515,LTU,2016,6.6,12.20,23.01,12.20
516,LTU,2017,6.5,12.37,23.33,12.37


In [24]:
data.corr()

Unnamed: 0,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
Time,1.0,-0.360443,0.477647,0.274668,0.477647
Hospital_Stay,-0.360443,1.0,0.058874,0.249306,0.058874
MRI_Units,0.477647,0.058874,1.0,0.759864,1.0
CT_Scanners,0.274668,0.249306,0.759864,1.0,0.759864
Hospital_Beds,0.477647,0.058874,1.0,0.759864,1.0


In [8]:
# Japan, Russia and Korea have the longest LOS
# Denmark, Turkey and Greece have the shortest LOS
location_LOS = data.groupby('Location')['Hospital_Stay'].mean().sort_values(ascending = True)
location_LOS

Location
DNK     3.625000
TUR     4.464706
GRC     5.460000
ISR     5.468421
AUS     5.595652
ISL     5.608333
USA     5.615385
NZL     5.709091
FRA     5.709524
EST     5.850000
IRL     5.938462
ESP     6.100000
SVN     6.316667
LVA     6.512500
FIN     6.585714
GBR     6.625000
SVK     6.831250
NLD     6.850000
ITA     6.854545
POL     7.071429
AUT     7.078261
HUN     7.082759
BEL     7.231250
LUX     7.394118
LTU     7.515789
CZE     7.710526
CAN     7.920000
PRT     8.433333
DEU     8.438889
KOR     9.744444
RUS    11.836000
JPN    21.700000
Name: Hospital_Stay, dtype: float64

In [10]:
# no null/missing vals 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       518 non-null    object 
 1   Time           518 non-null    int64  
 2   Hospital_Stay  518 non-null    float64
 3   MRI_Units      518 non-null    float64
 4   CT_Scanners    518 non-null    float64
 5   Hospital_Beds  518 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 24.4+ KB


# Preprocess inputs

In [13]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column])
    df = pd.concat([df, dummies], axis = 1)
    df = df.drop(column, axis = 1)
    
    return df

def preprocessing_inputs(df):
    df = df.copy()

    #One hot encoding Location column
    df = onehot_encode(df, column = 'Location')

    # splitting into X and Y
    Y = df['Hospital_Stay'].copy()
    X = df.drop('Hospital_Stay', axis = 1).copy()

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.7, random_state = 1)

    # Scale x-inputs using standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(X_train), columns = X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)

    return X_train, X_test, Y_train, Y_test

In [17]:
X_train, X_test, Y_train, Y_test = preprocessing_inputs(data)

In [18]:
X_train

Unnamed: 0,Time,MRI_Units,CT_Scanners,Hospital_Beds,AUS,AUT,BEL,CAN,CZE,DEU,...,LVA,NLD,NZL,POL,PRT,RUS,SVK,SVN,TUR,USA
0,-1.215654,-1.005117,-0.944338,-1.005117,-0.221981,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
1,-1.797176,-0.368898,0.272487,-0.368898,-0.221981,4.504899,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
2,-0.343371,-0.660590,2.400128,-0.660590,4.504899,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
3,1.401195,0.082399,0.248897,0.082399,-0.221981,-0.221981,5.181327,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
4,1.401195,0.694402,-0.063664,0.694402,-0.221981,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,0.965053,1.653134,0.094255,1.653134,-0.221981,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
358,0.092770,-0.491078,-0.590496,-0.491078,-0.221981,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
359,-0.488752,-0.567028,-0.558388,-0.567028,-0.221981,-0.221981,-0.193001,5.181327,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674
360,-0.052610,1.009210,0.712169,1.009210,-0.221981,-0.221981,-0.193001,-0.193001,-0.200574,-0.185164,...,-0.16855,-0.193001,-0.105703,-0.159674,-0.091414,-0.207913,-0.150329,-0.159674,-0.185164,-0.159674


In [19]:
Y_train

192    7.1
23     9.5
13     5.9
61     6.6
358    6.0
      ... 
129    6.6
144    5.7
72     7.2
235    6.8
37     6.6
Name: Hospital_Stay, Length: 362, dtype: float64

# Training the Regression Model

In [28]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)

decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train, Y_train)

In [30]:
Y_pred = lin_reg.predict(X_test)

df = pd.DataFrame({'Actual': Y_test, 
'Predicted': Y_pred})
df

Unnamed: 0,Actual,Predicted
485,7.4,7.381378
273,7.2,7.971222
420,5.7,5.980377
315,5.1,5.196930
256,11.0,11.104889
...,...,...
495,6.0,5.722199
329,6.6,6.411774
214,6.1,6.240387
95,6.0,7.012848


In [33]:
Y_pred2 = decision_tree.predict(X_test)

df1 = pd.DataFrame({'Actual': Y_test, 
'Predicted': Y_pred})
df1

Unnamed: 0,Actual,Predicted
485,7.4,7.381378
273,7.2,7.971222
420,5.7,5.980377
315,5.1,5.196930
256,11.0,11.104889
...,...,...
495,6.0,5.722199
329,6.6,6.411774
214,6.1,6.240387
95,6.0,7.012848


In [34]:
# evaluation
print('Classical Linear Regression: ')
print('Mean Squared Error:', mean_squared_error(Y_test,Y_pred))
print('R^2 Score: ', r2_score(Y_test, Y_pred))

print('Decision Tree Solution: ')
print('Mean Squared Error:', mean_squared_error(Y_test,Y_pred2))
print('R^2 Score: ', r2_score(Y_test, Y_pred2))

Classical Linear Regression: 
Mean Squared Error: 0.650863839983064
R^2 Score:  0.9014824299583708
Decision Tree Solution: 
Mean Squared Error: 0.6182051282051282
R^2 Score:  0.9064257940345436


Based on the model evaluation above, the decision tree perfroms more effectively. It has a slightly higher r^2 score meaning better fit on the model as well as a lower mean squared error 