In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import check_array
from tqdm import tqdm
import pdb

# Directory the notebook is executed from. A notebook has no __file__, so the
# working directory is used explicitly rather than derived from __name__.
current_directory = os.getcwd()

# Directory that holds the input CSV exports
data_directory = os.path.join(current_directory, 'data')

# os.listdir returns names in arbitrary order; sort them so the row order (and
# therefore the index) of the combined frame is the same on every run.
csv_files = sorted(f for f in os.listdir(data_directory) if f.endswith('.csv'))

# Read every CSV and stack the rows into a single frame, concatenating once
# instead of growing the frame inside the loop.
# NOTE: this is a row-wise concatenation (union of columns, NaN where a file
# lacks a column), not a join on subject_id.
csv_frames = [pd.read_csv(os.path.join(data_directory, file)) for file in csv_files]
combined_data = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

In [13]:
# Show the column labels available in combined_data.
combined_data.columns

Index(['subject_id', 'gender', 'anchor_age', 'anchor_year',
       'anchor_year_group', 'dod', 'hadm_id', 'seq_num', 'chartdate',
       'icd_code', 'icd_version', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag'],
      dtype='object')

In [20]:
# Parse the admission, discharge and death timestamps into datetimes.
for time_column in ('admittime', 'dischtime', 'deathtime'):
    combined_data[time_column] = pd.to_datetime(combined_data[time_column])

# Length of stay: the whole-day component of (discharge - admission).
combined_data['length_of_stay'] = (combined_data['dischtime'] - combined_data['admittime']).dt.days

# Re-express the three timestamps as fractional days elapsed since the earliest
# admission. The reference point is captured once, before admittime itself is
# overwritten.
first_admission = combined_data['admittime'].min()
one_day = np.timedelta64(1, 'D')
for time_column in ('deathtime', 'dischtime', 'admittime'):
    combined_data[time_column] = (combined_data[time_column] - first_admission) / one_day

In [21]:
# Inspect the admittime column of combined_data.
combined_data.admittime

5392      0.000000
5420    232.640972
5453    278.990972
5331    581.354861
5491    890.170139
           ...    
5323           NaN
5324           NaN
5325           NaN
5326           NaN
5327           NaN
Name: admittime, Length: 5603, dtype: float64

In [22]:
# Order rows by admission time; rows without an admittime go last (pandas'
# default na_position). A stable sort keeps rows with equal admittime in their
# existing relative order, so later positional slicing is reproducible.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
combined_data = combined_data.sort_values(by='admittime', kind='mergesort')

In [87]:
# Keep only the modelling columns, drop every row with a missing value in any
# of them, then restrict to the three race groups under study.
data = (
    combined_data[['admittime', 'race', 'marital_status', 'insurance', 'language', 'length_of_stay']]
    .dropna()
    .loc[lambda frame: frame['race'].isin(['WHITE', 'BLACK/AFRICAN AMERICAN', 'HISPANIC OR LATINO'])]
)

In [88]:
# List the distinct race values present in data.
data.race.unique()

array(['WHITE', 'BLACK/AFRICAN AMERICAN', 'HISPANIC OR LATINO'],
      dtype=object)

In [94]:
# Name of the estimator to use; it is also used further down as the stem of the
# cached results file.
classifier = "gradient_boosting"

# Lazy factories, so only the selected estimator is ever instantiated.
# NOTE(review): max_iter=10 is below scikit-learn's default of 100 for
# LogisticRegression and may stop before convergence - confirm it is intentional.
_classifier_factories = {
    "gradient_boosting": lambda: HistGradientBoostingClassifier(max_depth=5),
    "logistic_regression": lambda: LogisticRegression(max_iter=10),
}

if classifier not in _classifier_factories:
    raise ValueError(f"Invalid classifier: {classifier}")
classifier_function = _classifier_factories[classifier]()

columns = data.columns

# Separate features and target. Despite the *_test names these hold every row
# of `data`; the evaluation loop later in this cell trains on a growing prefix.
X_test = data.drop('length_of_stay', axis=1)
y_test = data['length_of_stay']

# Identify numeric and categorical columns
numeric_features = X_test.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_test.select_dtypes(include=['object']).columns
print(categorical_features)

# Pin the one-hot vocabulary to every category present in the full dataset.
# Pre-fitting the encoder does not achieve this: the pipeline refits the
# encoder on each training subset, discarding any earlier fit. Passing
# `categories` explicitly keeps the encoded feature layout identical across
# refits.
known_categories = [
    sorted(X_test[column].dropna().unique()) for column in categorical_features
]
encoder = OneHotEncoder(categories=known_categories, handle_unknown='ignore')

# Numeric columns: median-impute, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode
# (values outside known_categories encode as all zeros via handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', encoder)])

# Combine both transformers. sparse_threshold=0 forces a dense output:
# OneHotEncoder emits sparse matrices, and HistGradientBoostingClassifier
# rejects sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
    sparse_threshold=0)

# Full model: preprocessing followed by the selected classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier_function)])

# Walk-forward evaluation: the rows are cut into N_BATCHES equal-sized chunks
# (by position). The model is refit from scratch on everything before a cut
# point and then predicts the rows up to the next cut point. The last
# prediction window absorbs the final two chunks plus any remainder rows.
N_BATCHES = 12
batch_size = len(X_test) // N_BATCHES
if batch_size == 0:
    raise ValueError(
        f"Need at least {N_BATCHES} rows for the walk-forward evaluation, got {len(X_test)}")

# Cut points: end of chunk 1, 2, ..., N_BATCHES - 2, then the end of the data.
idxs = [batch_size * i for i in range(1, N_BATCHES - 1)] + [len(X_test)]

y_predict_proba = []
for train_end, predict_end in tqdm(list(zip(idxs[:-1], idxs[1:]))):
    # .iloc makes the positional slicing explicit regardless of index labels.
    model.fit(X_test.iloc[:train_end], y_test.iloc[:train_end])
    # NOTE(review): column 1 is the probability of the second entry of the
    # fitted classifier's classes_. y_test is length_of_stay, which can take
    # more than two distinct values - confirm that a binary target was intended.
    y_predict_proba.append(model.predict_proba(X_test.iloc[train_end:predict_end])[:, 1])

y_predict_proba = np.concatenate(y_predict_proba)
# The first chunk is never predicted (it is only trained on); pad it with zeros
# so the predictions line up row-for-row with X_test.
y_predict_proba = np.concatenate([np.zeros(len(X_test) - len(y_predict_proba)), y_predict_proba])

# Persist the features together with the true value ('target') and the model
# output ('prediction'), one row per sample, under ./.cache/<classifier>.pkl.
os.makedirs('./.cache', exist_ok=True)
print(X_test.shape, y_test.shape, y_predict_proba.shape)
# Build the frame column-wise so each column keeps its own dtype; stacking the
# mixed string/float columns through NumPy would coerce everything to object.
# The index is reset to 0..n-1; the new columns are attached by position.
df = X_test.reset_index(drop=True).assign(
    target=y_test.to_numpy(),
    prediction=y_predict_proba,
)
df.to_pickle(f"./.cache/{classifier}.pkl")

Index(['race', 'marital_status', 'insurance', 'language'], dtype='object')
Index(['race', 'marital_status', 'insurance', 'language'], dtype='object')


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:23<00:00,  2.33s/it]

(217, 5) (217,) (217,)



