In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Location of the input CSV that the next cell loads with pd.read_csv.
# Replace the placeholder with a real path before running the notebook.
# NOTE(review): judging by the variable name, this is presumably the dataset that
# already contains the latent feature column(s) - confirm against the data source.
data_with_latent_feature_path = 'your_path_to_data_here'

In [None]:
# Read the CSV and discard the "id" column in one chained expression, so
# re-running this cell always starts again from the file on disk.
data = pd.read_csv(data_with_latent_feature_path).drop(columns=["id"])

# Quick sanity check of what was loaded.
print(f"Data shape: {data.shape}")
print(f"Data columns: {data.columns}")

# Feature Engineering

### process features

In [None]:
# Merge race into 4 categories: white, black, hispanic, other.
def _merge_race(value):
    """Collapse a raw race label into one of: white, black, hispanic, other.

    Matching is by substring, tested in the order white -> black -> hispanic,
    so the first keyword found wins. Anything that is not a string (for
    example the NaN pandas produces for an empty CSV cell) is mapped to
    "other" instead of raising ``TypeError`` on the ``in`` check.
    """
    if not isinstance(value, str):
        return "other"
    for label in ("white", "black", "hispanic"):
        if label in value:
            return label
    return "other"


# Series.map calls the helper on every element, including missing values.
data["race"] = data["race"].map(_merge_race)
print(data["race"].value_counts())
    

In [None]:
# fill unknown values with mean

def get_mean(data, column):
    """Return the mean of ``data[column]`` over rows whose value is not 'unknown'.

    The remaining values are cast to float before averaging, so the column
    may hold numbers stored as strings.
    """
    known_mask = data[column] != 'unknown'
    known_values = data.loc[known_mask, column]
    return known_values.astype(float).mean()

# Replace the 'unknown' placeholder in each of these columns with the mean of
# that column's known values (see get_mean).
for col in ("bmi", "height", "weight", "age", "number_of_records"):
    data[col] = data[col].replace('unknown', get_mean(data, col))

# Split "systolic/diastolic" blood pressure strings into two columns.
# NOTE(review): an unknown blood pressure becomes '0/0', i.e. 0.0 in both new
# columns, unlike the mean imputation used for the columns above - confirm
# that this sentinel is intended.
bp_parts = data['blood_pressure'].replace('unknown', '0/0').str.split('/')
data['systolic'] = bp_parts.str[0]
data['diastolic'] = bp_parts.str[1]

# The timestamps and the raw blood pressure string are not kept as features.
data = data.drop(columns=['admit_time', 'discharge_time', 'blood_pressure'])

# Cast every numeric feature to float in a single assignment.
number_columns = ['height', 'weight', 'systolic', 'diastolic', 'age', "number_of_records", "bmi"]
data[number_columns] = data[number_columns].astype(float)

data.head()

### split data

In [None]:
# Separate the predictors from the target column.
target_column = 'discharge_location'
X = data.drop(columns=[target_column])
y = data[target_column]

# One-hot encode the predictors (first level of each categorical is dropped)
# and turn the three target labels into integer codes.
X_encoded = pd.get_dummies(X, drop_first=True)
label_codes = {'home': 0, 'other': 1, 'died': 2}
y_encoded = y.map(label_codes)

# One stratified 70/30 train/test split per seed; the i-th entry of every
# list below belongs to seeds[i].
seeds = [42, 126, 88, 999, 255]
X_trains, y_trains, X_tests, y_tests = [], [], [], []

for seed in seeds:
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded,
        y_encoded,
        test_size=0.3,
        random_state=seed,
        stratify=y_encoded,
    )
    for bucket, part in (
        (X_trains, X_train),
        (y_trains, y_train),
        (X_tests, X_test),
        (y_tests, y_test),
    ):
        bucket.append(part)

    print(f"Seed: {seed}")
    print()
    print(f"Target distribution in train: {y_train.value_counts(normalize=True)}")
    print(f"Target distribution in test: {y_test.value_counts(normalize=True)}")
    print("-------------------------------------------------")

### deal with data imbalance

In [None]:
# Oversample each training split with SMOTE, seeded with the same seed that
# produced the split. Only the training splits are resampled; the test splits
# are left as they are.
# NOTE(review): the model cells below run GridSearchCV on these resampled
# sets, so the cross-validation folds contain synthetic rows. Consider doing
# the resampling inside an imblearn Pipeline if unbiased CV scores matter.
X_processed_trains = []
y_processed_trains = []
for seed, X_split, y_split in zip(seeds, X_trains, y_trains):
    X_resampled, y_resampled = SMOTE(random_state=seed).fit_resample(X_split, y_split)
    print(f"Seed: {seed}")
    print(f"SMOTE train shape: {X_resampled.shape}")
    print(f"SMOTE train discharge_location distribution: {y_resampled.value_counts()}")
    print("-------------------------------------------------")
    X_processed_trains.append(X_resampled)
    y_processed_trains.append(y_resampled)

### create baseline

In [None]:
# Build the baseline feature sets by removing the one-hot column derived from
# the latent feature from every train/test split.
# NOTE(review): the "_wolf" suffix presumably stands for "without latent
# feature". Only "social_support_Weak" is dropped - if the encoded data has
# other social_support_* dummy columns they stay in the baseline; confirm.
X_processed_trains_wolf = []
X_tests_wolf = []

latent_columns = ["social_support_Weak"]
for X_encoded_train, X_encoded_test in zip(X_processed_trains, X_tests):
    X_train_wolf = X_encoded_train.drop(columns=latent_columns)
    X_test_wolf = X_encoded_test.drop(columns=latent_columns)
    X_processed_trains_wolf.append(X_train_wolf)
    X_tests_wolf.append(X_test_wolf)

    # Show the columns before and after the drop for both splits. The test
    # columns come from this iteration's frame, not from a variable left over
    # from an earlier cell.
    print(X_encoded_train.columns)
    print()
    print(X_train_wolf.columns)
    print()
    print(X_encoded_test.columns)
    print()
    print(X_test_wolf.columns)
    print()

# Logistic Regression

### with latent feature

In [None]:
# Logistic regression on the feature set that includes the latent feature:
# tune with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
accuracies = []
F1_scores = []

# Search only solver/penalty pairs that scikit-learn accepts: liblinear
# supports l1 and l2, lbfgs supports l2 only. 'elasticnet' requires the saga
# solver (not searched here), so every candidate using it could only fail.
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100], 'max_iter': [1000, 10000]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1, 10, 100], 'max_iter': [1000, 10000]},
]

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains, y_processed_trains, X_tests, y_tests, seeds):

    # random_state makes the solvers that shuffle the data repeatable per seed.
    grid = GridSearchCV(LogisticRegression(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the winning
    # configuration on the whole training split. Using it directly keeps ALL
    # tuned hyperparameters, including solver and penalty.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of model with latent feature")
print(f"Average accuracy: {sum(accuracies)/len(accuracies)}")
print(f"Standard deviation of accuracy: {np.std(accuracies)}")
print(f"Average F1 score: {sum(F1_scores)/len(F1_scores)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores)}")

### without latent features(baseline)

In [None]:
# Logistic regression baseline on the feature set without the latent feature:
# tune with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
# NOTE(review): this reuses the names `accuracies` / `F1_scores`, so running
# this cell overwrites the results of the "with latent feature" cell.
accuracies = []
F1_scores = []

# Search only solver/penalty pairs that scikit-learn accepts: liblinear
# supports l1 and l2, lbfgs supports l2 only. 'elasticnet' requires the saga
# solver (not searched here), so every candidate using it could only fail.
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100], 'max_iter': [1000, 10000]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1, 10, 100], 'max_iter': [1000, 10000]},
]

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains_wolf, y_processed_trains, X_tests_wolf, y_tests, seeds):

    # random_state makes the solvers that shuffle the data repeatable per seed.
    grid = GridSearchCV(LogisticRegression(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the winning
    # configuration on the whole training split. Using it directly keeps ALL
    # tuned hyperparameters, including solver and penalty.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of model without latent feature (baseline)")
print(f"Average accuracy: {sum(accuracies)/len(accuracies)}")
print(f"Standard deviation of accuracy: {np.std(accuracies)}")
print(f"Average F1 score: {sum(F1_scores)/len(F1_scores)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores)}")

# MLP

### with latent feature

In [None]:
# MLP on the feature set that includes the latent feature: tune with a grid
# search on each seed's training split, then score the tuned model on the
# matching test split.
accuracies_mlp = []
F1_scores_mlp = []

# The grid does not depend on the seed, so it is built once.
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'max_iter': [1000, 10000],
}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains, y_processed_trains, X_tests, y_tests, seeds):

    # Seeding the network makes weight initialisation and shuffling
    # repeatable, so a re-run reproduces the same scores.
    grid = GridSearchCV(MLPClassifier(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # training a second network with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_mlp.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_mlp.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of model with latent feature")
print(f"Average accuracy: {sum(accuracies_mlp)/len(accuracies_mlp)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_mlp)}")
print(f"Average F1 score: {sum(F1_scores_mlp)/len(F1_scores_mlp)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_mlp)}")

### without latent features(baseline)

In [None]:
# MLP baseline on the feature set without the latent feature: tune with a
# grid search on each seed's training split, then score the tuned model on
# the matching test split.
accuracies_mlp_wolf = []
F1_scores_mlp_wolf = []

# The grid does not depend on the seed, so it is built once.
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'max_iter': [1000, 10000],
}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains_wolf, y_processed_trains, X_tests_wolf, y_tests, seeds):

    # Seeding the network makes weight initialisation and shuffling
    # repeatable, so a re-run reproduces the same scores.
    grid = GridSearchCV(MLPClassifier(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # training a second network with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_mlp_wolf.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_mlp_wolf.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of model without latent feature (baseline)")
print(f"Average accuracy: {sum(accuracies_mlp_wolf)/len(accuracies_mlp_wolf)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_mlp_wolf)}")
print(f"Average F1 score: {sum(F1_scores_mlp_wolf)/len(F1_scores_mlp_wolf)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_mlp_wolf)}")

# Random Forest

### with latent feature

In [None]:
# Random forest on the feature set that includes the latent feature: tune
# with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
accuracies_rf = []
F1_scores_rf = []

# The grid does not depend on the seed, so it is built once.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'verbose': [0]}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains, y_processed_trains, X_tests, y_tests, seeds):

    # Seeding the forest makes bootstrap sampling and feature selection
    # repeatable, so a re-run reproduces the same scores.
    grid = GridSearchCV(RandomForestClassifier(random_state=seed), param_grid, verbose=0, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # growing a second forest with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_rf.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_rf.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of Random Forest model with latent feature")
print(f"Average accuracy: {sum(accuracies_rf)/len(accuracies_rf)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_rf)}")
print(f"Average F1 score: {sum(F1_scores_rf)/len(F1_scores_rf)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_rf)}")

### without latent features(baseline)

In [None]:
# Random forest baseline on the feature set without the latent feature: tune
# with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
accuracies_rf_wolf = []
F1_scores_rf_wolf = []

# The grid does not depend on the seed, so it is built once.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'verbose': [0]}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains_wolf, y_processed_trains, X_tests_wolf, y_tests, seeds):

    # Seeding the forest makes bootstrap sampling and feature selection
    # repeatable, so a re-run reproduces the same scores.
    grid = GridSearchCV(RandomForestClassifier(random_state=seed), param_grid, verbose=0, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # growing a second forest with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_rf_wolf.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_rf_wolf.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of Random Forest model without latent feature (baseline)")
print(f"Average accuracy: {sum(accuracies_rf_wolf)/len(accuracies_rf_wolf)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_rf_wolf)}")
print(f"Average F1 score: {sum(F1_scores_rf_wolf)/len(F1_scores_rf_wolf)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_rf_wolf)}")

# Gradient Boosting Tree

### with latent feature

In [None]:
# Gradient boosting on the feature set that includes the latent feature: tune
# with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
accuracies_gbt = []
F1_scores_gbt = []

# The grid does not depend on the seed, so it is built once.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'verbose': [0]}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains, y_processed_trains, X_tests, y_tests, seeds):

    # Seeding the booster makes its internal randomness repeatable, so a
    # re-run reproduces the same scores. verbose=0 keeps the search silent,
    # matching the other model cells.
    grid = GridSearchCV(GradientBoostingClassifier(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # fitting a second ensemble with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_gbt.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_gbt.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of Gradient Boosting model with latent feature")
print(f"Average accuracy: {sum(accuracies_gbt)/len(accuracies_gbt)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_gbt)}")
print(f"Average F1 score: {sum(F1_scores_gbt)/len(F1_scores_gbt)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_gbt)}")

### without latent features(baseline)

In [None]:
# Gradient boosting baseline on the feature set without the latent feature:
# tune with a grid search on each seed's training split, then score the tuned
# model on the matching test split.
accuracies_gbt_wolf = []
F1_scores_gbt_wolf = []

# The grid does not depend on the seed, so it is built once.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'verbose': [0]}

for X_train, y_train, X_test, y_test, seed in zip(X_processed_trains_wolf, y_processed_trains, X_tests_wolf, y_tests, seeds):

    # Seeding the booster makes its internal randomness repeatable, so a
    # re-run reproduces the same scores.
    grid = GridSearchCV(GradientBoostingClassifier(random_state=seed), param_grid, verbose=0)
    grid.fit(X_train, y_train)
    print(f"Seed: {seed}")
    print(f"Best hyperparameters: {grid.best_params_}")
    print("-------------------------------------------------")

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training split; reuse it rather than
    # fitting a second ensemble with the same settings.
    model = grid.best_estimator_
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies_gbt_wolf.append(accuracy)
    f1 = f1_score(y_test, y_pred, average='weighted')
    F1_scores_gbt_wolf.append(f1)
    print(classification_report(y_test, y_pred))
    print("\n")

# Aggregate the per-seed test metrics.
print("#################################################")
print("Summary of Gradient Boosting model without latent feature (baseline)")
print(f"Average accuracy: {sum(accuracies_gbt_wolf)/len(accuracies_gbt_wolf)}")
print(f"Standard deviation of accuracy: {np.std(accuracies_gbt_wolf)}")
print(f"Average F1 score: {sum(F1_scores_gbt_wolf)/len(F1_scores_gbt_wolf)}")
print(f"Standard deviation of F1 score: {np.std(F1_scores_gbt_wolf)}")