In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide pandas rendering limits: show up to 120 columns and
# allow printed frames to be 120 characters wide before wrapping.
pd.options.display.max_columns = 120
pd.options.display.width = 120
# IPython magic: select the inline matplotlib backend so figures render
# in the notebook output beneath the cell that draws them.
%matplotlib inline

In [24]:
# Read the two raw CSV extracts into DataFrames
microsoft_los = pd.read_csv("raw-data/LengthOfStay.csv")
investments_los = pd.read_csv("raw-data/Healthcare_Investments_and_Hospital_Stay (1).csv")

# Preview the top rows of each frame under its own heading
# (sep="\n" puts the heading and the table on separate lines).
print("Microsoft Length of Stay Data:", microsoft_los.head(), sep="\n")
print("\nInvestments Length of Stay Data:", investments_los.head(), sep="\n")

Microsoft Length of Stay Data:
   eid       vdate rcount gender  dialysisrenalendstage  asthma  irondef  pneum  substancedependence  \
0    1   8/29/2012      0      F                      0       0        0      0                    0   
1    2   5/26/2012     5+      F                      0       0        0      0                    0   
2    3   9/22/2012      1      F                      0       0        0      0                    0   
3    4    8/9/2012      0      F                      0       0        0      0                    0   
4    5  12/20/2012      0      F                      0       0        0      1                    0   

   psychologicaldisordermajor  depress  psychother  fibrosisandother  malnutrition  hemo  hematocrit  neutrophils  \
0                           0        0           0                 0             0     0        11.5        14.20   
1                           0        0           0                 0             0     0         9.0         4

In [25]:
# Structural overview of the Microsoft frame: column dtypes and non-null counts
microsoft_los.info()

# Summary statistics for every column (numeric and non-numeric alike),
# transposed to one row per column; only the first 20 columns are shown.
microsoft_los.describe(include="all").transpose().iloc[:20]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   eid                         100000 non-null  int64  
 1   vdate                       100000 non-null  object 
 2   rcount                      100000 non-null  object 
 3   gender                      100000 non-null  object 
 4   dialysisrenalendstage       100000 non-null  int64  
 5   asthma                      100000 non-null  int64  
 6   irondef                     100000 non-null  int64  
 7   pneum                       100000 non-null  int64  
 8   substancedependence         100000 non-null  int64  
 9   psychologicaldisordermajor  100000 non-null  int64  
 10  depress                     100000 non-null  int64  
 11  psychother                  100000 non-null  int64  
 12  fibrosisandother            100000 non-null  int64  
 13  malnutrition   

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
eid,100000.0,,,,50000.5,28867.657797,1.0,25000.75,50000.5,75000.25,100000.0
vdate,100000.0,367.0,10/3/2012,333.0,,,,,,,
rcount,100000.0,6.0,0,55031.0,,,,,,,
gender,100000.0,2.0,F,57643.0,,,,,,,
dialysisrenalendstage,100000.0,,,,0.03642,0.187334,0.0,0.0,0.0,0.0,1.0
asthma,100000.0,,,,0.03527,0.184462,0.0,0.0,0.0,0.0,1.0
irondef,100000.0,,,,0.09494,0.293134,0.0,0.0,0.0,0.0,1.0
pneum,100000.0,,,,0.03945,0.194664,0.0,0.0,0.0,0.0,1.0
substancedependence,100000.0,,,,0.06306,0.243072,0.0,0.0,0.0,0.0,1.0
psychologicaldisordermajor,100000.0,,,,0.23904,0.426499,0.0,0.0,0.0,0.0,1.0


In [26]:
# Build a typed working frame from the raw load; `assign` returns a new
# DataFrame, so `microsoft_los` itself is left unmodified.
ms = microsoft_los.assign(
    # Visit and discharge dates: strings -> datetime64
    vdate=lambda df: pd.to_datetime(df["vdate"]),
    discharged=lambda df: pd.to_datetime(df["discharged"]),
    # Map the "5+" label to 5 so the whole column can be cast to int
    rcount=lambda df: df["rcount"].replace({"5+": 5}).astype(int),
    # Store these two columns as categoricals
    gender=lambda df: df["gender"].astype("category"),
    facid=lambda df: df["facid"].astype("category"),
    # Length of stay recomputed from the two dates parsed above, in whole days
    los_from_dates=lambda df: (df["discharged"] - df["vdate"]).dt.days,
)

# Fraction of rows where the recomputed stay equals the provided `lengthofstay`
agreement_rate = ms["los_from_dates"].eq(ms["lengthofstay"]).mean()
agreement_rate

np.float64(1.0)

In [27]:
# Count missing values in each column of the typed frame
ms.isna().sum()

eid                           0
vdate                         0
rcount                        0
gender                        0
dialysisrenalendstage         0
asthma                        0
irondef                       0
pneum                         0
substancedependence           0
psychologicaldisordermajor    0
depress                       0
psychother                    0
fibrosisandother              0
malnutrition                  0
hemo                          0
hematocrit                    0
neutrophils                   0
sodium                        0
glucose                       0
bloodureanitro                0
creatinine                    0
bmi                           0
pulse                         0
respiration                   0
secondarydiagnosisnonicd9     0
discharged                    0
facid                         0
lengthofstay                  0
los_from_dates                0
dtype: int64