In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
icuStays = pd.read_csv('/Users/ashademeij/Downloads/Thesis/Code/Data/augmented_df.csv')
diagnoses_num = pd.read_csv("/Users/ashademeij/Downloads/Thesis/Code/Data/diagnoses_enum.csv")
diag = pd.read_csv("/Users/ashademeij/Downloads/Thesis/Code/Data/diag.csv")

### Extra preproocessing dummies

**ALREADY BINARY ATTRIBUTES**
- 'raceGrouped' 
- 'gender' 
- age groups: 'age_18-24','age_25-44','age_45-64','age_65-88','age_89+'




In [53]:
def get_icu_shortcut(icu_name):
    icu_map = {
        'Surgical Intensive Care Unit (SICU)': 'SICU',
        'Medical/Surgical Intensive Care Unit (MICU/SICU)': 'MICU/SICU',
        'Medical Intensive Care Unit (MICU)': 'MICU',
        'Cardiac Vascular Intensive Care Unit (CVICU)': 'CVICU',
        'Coronary Care Unit (CCU)': 'CCU',
        'Neuro Intermediate': 'Neuro Intermediate',
        'Trauma SICU (TSICU)': 'TSICU',
        'Neuro Stepdown': 'Neuro Stepdown',
        'Neuro Surgical Intensive Care Unit (Neuro SICU)': 'Neuro SICU'
    }
    return icu_map.get(icu_name, 'first_careunit')

merged['first_careunit'] = merged['first_careunit'].apply(get_icu_shortcut)

In [6]:
merged = merged.dropna()
merged = merged[merged['admission_location'] != 'INFORMATION NOT AVAILABLE']

def group_admission_location(icu_name):
    if icu_name == 'EMERGENCY ROOM':
        return 'Emergency'
    elif icu_name in ['PHYSICIAN REFERRAL', 'CLINIC REFERRAL', 'AMBULATORY SURGERY TRANSFER']:
        return 'Referral'
    elif icu_name in ['TRANSFER FROM HOSPITAL', 'TRANSFER FROM SKILLED NURSING FACILITY', 'INTERNAL TRANSFER TO OR FROM PSYCH']:
        return 'Transfer'
    else:
        return 'Other'

merged['admission_location'] = merged['admission_location'].apply(group_admission_location)

# 'AMBULATORY SURGERY TRANSFER' = transfer of a patient from an ambulatory surgery center (ASC) to another healthcare facility, such as a hospital, for further care or observation.
# -->  patient is being referred from an ASC to a hospital for further specialized care or treatment

In [7]:
columns_to_drop = ['subject_id', 'hadm_id', 'stay_id', 'last_careunit', 'intime', 'outtime', 'discharge_location', 'race', 'language']
mergedC = merged.drop(columns=columns_to_drop)

In [8]:
# columns for which dummy variables need to be created
columns_to_encode = ['first_careunit', 'admission_location', 'insurance', 'marital_status', 'raceGroup_1', 'raceGroup_2', 'gender']

# acronyms for the prefixes 
prefixes = {
    'first_careunit': 'FCU',
    'admission_location': 'ADM',
    'insurance': 'INS',
    'marital_status': 'MS',
    'raceGroup_1': 'RG1',
    'raceGroup_2': 'RG2',
    'gender': 'GEN'
}

dummy_variables = pd.get_dummies(mergedC[columns_to_encode], prefix=prefixes)
mergedC = pd.concat([mergedC, dummy_variables], axis=1)
mergedC = mergedC.drop(columns=columns_to_encode)

In [9]:
age_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
age_labels = ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-100']
mergedC['age_group'] = pd.cut(mergedC['anchor_age'], bins=age_bins, labels=age_labels, right=False)

age_group_dummies = pd.get_dummies(mergedC['age_group'], prefix='age')
mergedC = pd.concat([mergedC, age_group_dummies], axis=1)

In [10]:
age_bins = [18, 25, 45, 65, 89, float('inf')]
age_labels = ['18-24', '25-44', '45-64', '65-88', '89+']
mergedC['age_group2'] = pd.cut(mergedC['anchor_age'], bins=age_bins, labels=age_labels, right=False)

# Generate dummy variables for age groups
age_group_dummies = pd.get_dummies(mergedC['age_group2'], prefix='ageG')
mergedC = pd.concat([mergedC, age_group_dummies], axis=1)

In [34]:
mergedC = mergedC[mergedC['los'] < 22]

bin_edges = [1, 2, 3, 5, 8, 14, 21]
bin_labels = ['1-2', '2-3', '3-5', '5-8', '8-14', '14-21']

mergedC['los_bins'] = pd.cut(mergedC['los'], bins=bin_edges, labels=bin_labels, right=False)

# If you want to handle NaN values, you can fill them with a default value, for example:
mergedC.dropna(subset=['los_bins'], inplace=True)

# Define mapping for numerical to string labels
numerical_to_string = {
    '1-2': 'one',
    '2-3': 'two',
    '3-5': 'three',
    '5-8': 'four',
    '8-14': 'five',
    '14-21': 'six'
}

# Map numerical labels to string labels
mergedC['los_bins'] = mergedC['los_bins'].map(numerical_to_string)
mergedC.head()

Unnamed: 0,los,anchor_age,diagnoses_num,Chapter I,Chapter II,Chapter III,Chapter IV,Chapter V,Chapter VI,Chapter VII,...,age_70-79,age_80-89,age_90-100,age_group2,ageG_18-24,ageG_25-44,ageG_45-64,ageG_65-88,ageG_89+,los_bins
0,1.118032,55,10.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0,...,0,0,0,45-64,0,0,1,0,0,one
1,1.338588,46,18.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,...,0,0,0,45-64,0,0,1,0,0,one
2,9.171817,68,36.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,...,0,0,0,65-88,0,0,0,1,0,five
3,1.314352,53,22.0,0.0,0.0,0.0,4.0,2.0,1.0,0.0,...,0,0,0,45-64,0,0,1,0,0,one
4,6.178912,80,20.0,0.0,1.0,1.0,3.0,1.0,0.0,0.0,...,0,1,0,65-88,0,0,0,1,0,four


In [32]:
numerical_to_string = {
    '1': 'one',
    '2': 'two',
    '3': 'three',
    '4': 'four',
    '5': 'five',
    '6': 'six'
}

# Map numerical labels to string labels
mergedC['los_bins'] = mergedC['los_bins'].map(numerical_to_string)

In [33]:
mergedC['los_bins'].unique()

array([nan])

# Sample

In [35]:
random_seed = 42
np.random.seed(random_seed)

# sample of 2500 entries
sample = mergedC.sample(n=2500, random_state=random_seed)

print(sample.shape)
sample.head()

(2500, 70)


Unnamed: 0,los,anchor_age,diagnoses_num,Chapter I,Chapter II,Chapter III,Chapter IV,Chapter V,Chapter VI,Chapter VII,...,age_70-79,age_80-89,age_90-100,age_group2,ageG_18-24,ageG_25-44,ageG_45-64,ageG_65-88,ageG_89+,los_bins
8889,6.87375,72,19.0,0.0,1.0,0.0,3.0,2.0,0.0,0.0,...,1,0,0,65-88,0,0,0,1,0,four
8740,1.321377,67,26.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,...,0,0,0,65-88,0,0,0,1,0,one
17591,15.61228,62,30.0,1.0,0.0,0.0,7.0,2.0,2.0,0.0,...,0,0,0,45-64,0,0,1,0,0,six
47307,1.003565,83,12.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,1,0,65-88,0,0,0,1,0,one
11988,2.76125,48,9.0,0.0,2.0,0.0,0.0,2.0,3.0,0.0,...,0,0,0,45-64,0,0,1,0,0,two


In [36]:
sample.columns

Index(['los', 'anchor_age', 'diagnoses_num', 'Chapter I', 'Chapter II',
       'Chapter III', 'Chapter IV', 'Chapter V', 'Chapter VI', 'Chapter VII',
       'Chapter VIII', 'Chapter IX', 'Chapter X', 'Chapter XI', 'Chapter XII',
       'Chapter XIII', 'Chapter XIV', 'Chapter XIX', 'Chapter XV',
       'Chapter XVI', 'Chapter XVII', 'Chapter XVIII', 'Chapter XX',
       'Chapter XXI', 'FCU_CCU', 'FCU_CVICU', 'FCU_MICU', 'FCU_MICU/SICU',
       'FCU_Neuro Intermediate', 'FCU_Neuro SICU', 'FCU_Neuro Stepdown',
       'FCU_SICU', 'FCU_TSICU', 'ADM_Emergency', 'ADM_Other', 'ADM_Referral',
       'ADM_Transfer', 'INS_Medicaid', 'INS_Medicare', 'INS_Other',
       'MS_DIVORCED', 'MS_MARRIED', 'MS_SINGLE', 'MS_WIDOWED', 'RG1_Asian',
       'RG1_Black/African American', 'RG1_Hispanic/Latino', 'RG1_Other',
       'RG1_White', 'RG2_Minority', 'RG2_NonMinority', 'GEN_F', 'GEN_M',
       'age_group', 'age_10-19', 'age_20-29', 'age_30-39', 'age_40-49',
       'age_50-59', 'age_60-69', 'age_70-79', '

In [37]:
sample.head()

Unnamed: 0,los,anchor_age,diagnoses_num,Chapter I,Chapter II,Chapter III,Chapter IV,Chapter V,Chapter VI,Chapter VII,...,age_70-79,age_80-89,age_90-100,age_group2,ageG_18-24,ageG_25-44,ageG_45-64,ageG_65-88,ageG_89+,los_bins
8889,6.87375,72,19.0,0.0,1.0,0.0,3.0,2.0,0.0,0.0,...,1,0,0,65-88,0,0,0,1,0,four
8740,1.321377,67,26.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,...,0,0,0,65-88,0,0,0,1,0,one
17591,15.61228,62,30.0,1.0,0.0,0.0,7.0,2.0,2.0,0.0,...,0,0,0,45-64,0,0,1,0,0,six
47307,1.003565,83,12.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,1,0,65-88,0,0,0,1,0,one
11988,2.76125,48,9.0,0.0,2.0,0.0,0.0,2.0,3.0,0.0,...,0,0,0,45-64,0,0,1,0,0,two


# Multinomial reg

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Assume mergedC is your DataFrame with the specified features and target

# Step 1: Prepare the data
features = ['FCU_CCU', 'FCU_CVICU', 'FCU_MICU', 'FCU_MICU/SICU', 'FCU_Neuro Intermediate', 'FCU_Neuro SICU',
            'FCU_Neuro Stepdown', 'FCU_SICU', 'FCU_TSICU',
            'ADM_Emergency', 'ADM_Other', 'ADM_Referral', 'ADM_Transfer',
            'ageG_18-24', 'ageG_25-44', 'ageG_45-64', 'ageG_65-88', 'ageG_89+',
            'Chapter I', 'Chapter II', 'Chapter III', 'Chapter IV', 'Chapter IX', 'Chapter V', 'Chapter VI',
            'Chapter VII', 'Chapter VIII', 'Chapter X', 'Chapter XI', 'Chapter XII', 'Chapter XIII', 
            'Chapter XIV', 'Chapter XIX', 'Chapter XV', 'Chapter XVI', 'Chapter XVII', 'Chapter XVIII',
            'Chapter XX', 'Chapter XXI', 'INS_Medicaid', 'INS_Medicare', 'INS_Other',
            'RG2_Minority', 'RG2_NonMinority', 'GEN_F', 'GEN_M']

target = 'los_bins'

X = mergedC[features]
y = mergedC[target]

# Encode the string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the Logistic Regression model
logistic_regression = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
logistic_regression.fit(X_train, y_train)

# Make predictions
y_pred = logistic_regression.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print(classification_report(y_test, y_pred))


Accuracy: 0.4393097742583619
              precision    recall  f1-score   support

           0       0.28      0.22      0.25       525
           1       0.26      0.05      0.09       820
           2       0.48      0.93      0.63      3599
           3       0.23      0.03      0.06       203
           4       0.24      0.12      0.16      1544
           5       0.31      0.00      0.01      1770

    accuracy                           0.44      8461
   macro avg       0.30      0.23      0.20      8461
weighted avg       0.36      0.44      0.33      8461



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
import numpy as np

# Get the feature names
feature_names = X.columns

# Get the coefficients of the logistic regression model
coefficients = logistic_regression.coef_

# Calculate the absolute coefficients to determine importance
absolute_coefficients = np.abs(coefficients)

# Create a DataFrame to display feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': absolute_coefficients.flatten()})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df)


ValueError: All arrays must be of the same length

In [48]:
# Ensure that the lengths of feature names and coefficients are the same
assert len(feature_names) == len(absolute_coefficients.flatten()), "Lengths of feature names and coefficients must match"

# Create a DataFrame to display feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': absolute_coefficients.flatten()})

# Sort by the absolute coefficients
feature_importance_df = feature_importance_df.reindex(feature_importance_df['Importance'].abs().sort_values(ascending=False).index)

# Display the feature importance
print(feature_importance_df)


AssertionError: Lengths of feature names and coefficients must match

# XGBoost

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


# Assume mergedC is your DataFrame with the specified features and target

# Step 1: Prepare the data
features = ['FCU_CCU', 'FCU_CVICU', 'FCU_MICU', 'FCU_MICU/SICU', 'FCU_Neuro Intermediate', 'FCU_Neuro SICU',
            'FCU_Neuro Stepdown', 'FCU_SICU', 'FCU_TSICU',
            'ADM_Emergency', 'ADM_Other', 'ADM_Referral', 'ADM_Transfer',
            'ageG_18-24', 'ageG_25-44', 'ageG_45-64', 'ageG_65-88', 'ageG_89+',
            'Chapter I', 'Chapter II', 'Chapter III', 'Chapter IV', 'Chapter IX', 'Chapter V', 'Chapter VI',
            'Chapter VII', 'Chapter VIII', 'Chapter X', 'Chapter XI', 'Chapter XII', 'Chapter XIII', 
            'Chapter XIV', 'Chapter XIX', 'Chapter XV', 'Chapter XVI', 'Chapter XVII', 'Chapter XVIII',
            'Chapter XX', 'Chapter XXI', 'INS_Medicaid', 'INS_Medicare', 'INS_Other',
            'RG2_Minority', 'RG2_NonMinority', 'GEN_F', 'GEN_M']

target = 'los_bins'


X = mergedC[features]
y = mergedC[target]

# Encode the string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Inverse transform the predicted labels to get back the original string labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print(classification_report(y_test, y_pred))




Accuracy: 0.4303273844699208
              precision    recall  f1-score   support

           0       0.27      0.20      0.23       525
           1       0.20      0.08      0.11       820
           2       0.49      0.87      0.63      3599
           3       0.26      0.11      0.15       203
           4       0.27      0.14      0.18      1544
           5       0.22      0.06      0.10      1770

    accuracy                           0.43      8461
   macro avg       0.28      0.24      0.24      8461
weighted avg       0.35      0.43      0.35      8461



In [42]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Initialize Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Inverse transform the predicted labels to get back the original string labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 729 candidates, totalling 3645 fits




[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=  17.3s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=  17.7s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6; total time=  17.8s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=  17.8s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6; total time=  18.0s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6; total time=  18.2s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6; total time=  18.4s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.6; tot

KeyboardInterrupt: 