In [12]:
# Random forest model you provided that did not run

import torch, pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import label_binarize

def load_dict_from_pickle(filename):
    with open(filename, 'rb') as file:
        loaded_dict = pickle.load(file)
    return loaded_dict


def load_data(file_path):
    # Load data
    X = torch.load(f'{file_path}/X.pt')
    Y = torch.load(f'{file_path}/Y.pt')
    
    # # Check if X is a sparse matrix and convert to dense if necessary
    # if isinstance(X, csr_matrix):
    #     X = X.toarray()  # Convert sparse to dense

    # Y = transform_y_for_multi_class(Y)
    Nodes = load_dict_from_pickle(f'{file_path}/Nodes.pkl')

    # # Subset to patient nodes (assuming node name starts with 'V')
    # indeces = {p: i for i, p in enumerate(Nodes)}
    # visit_indeces = [indeces[v] for v in Nodes if v[0] == 'V']

    # X = X[visit_indeces]
    # Y = Y[visit_indeces]

    return X, Y

# Transform Y for Multi-Class Classification
def transform_y_for_multi_class(Y):
    # Convert LoS into 4 classes:
    # 0 if LoS <= 1 (one day or less),
    # 1 if LoS > 1 and LoS <= 3 (three days),
    # 2 if LoS > 3 and LoS <= 7 (up to a week),
    # 3 if LoS > 7 (more than a week)
    return np.where(Y <= 1, 0,
                    np.where(Y <= 3, 1,
                             np.where(Y <= 7, 2, 3)))

# 1. Apply SMOTE
def apply_smote(X, Y):
    smote = SMOTE(random_state=42)
    X_resampled, Y_resampled = smote.fit_resample(X, Y)
    return X_resampled, Y_resampled

# 2. Train Random Forest
def train_random_forest(X, Y, num_classes=4, num_estimators=100):
    # Split data into training, validation, and test sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=0.8, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, train_size=0.5, random_state=42)

    # Apply SMOTE to the training set
    X_train_resampled, Y_train_resampled = apply_smote(X_train, Y_train)

    # Initialize the Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=num_estimators, random_state=42)

    # Train the model
    clf.fit(X_train_resampled, Y_train_resampled)

    # Evaluate on validation set
    Y_val_pred = clf.predict(X_val)
    val_metrics = evaluate_multi_class(Y_val, Y_val_pred, num_classes)

    print("Validation Metrics:")
    print(val_metrics)

    # Evaluate on test set
    Y_test_pred = clf.predict(X_test)
    test_metrics = evaluate_multi_class(Y_test, Y_test_pred, num_classes)

    print("Test Metrics:")
    print(test_metrics)

    return clf, test_metrics

# 3. Evaluation for Multi-Class Classification
def evaluate_multi_class(true_labels, preds, num_classes):
    # Binarize the true labels for multi-class AUC calculation
    true_labels_binarized = label_binarize(true_labels, classes=np.arange(num_classes))

    try:
        # Calculate AUC for each class and then macro-average
        auc = roc_auc_score(true_labels_binarized, label_binarize(preds, classes=np.arange(num_classes)), average='macro', multi_class='ovr')
    except ValueError:
        auc = None

    accuracy = accuracy_score(true_labels, preds)
    precision = precision_score(true_labels, preds, average='weighted')
    recall = recall_score(true_labels, preds, average='weighted')
    f1 = f1_score(true_labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    }

# Main Function
def main_random_forest(file_path, num_estimators=100):
    # Load data (replace with actual data loading)
    X, Y = load_data(file_path)

    # Train Random Forest
    clf, metrics = train_random_forest(X, Y, num_classes=4, num_estimators=num_estimators)
    
    return metrics

# Set parameters and run the main training process
saving_path = '../../Data/infectious'
results = main_random_forest(saving_path, num_estimators=100)
print(results)


Validation Metrics:
{'accuracy': 0.7211934156378601, 'precision': 0.7311460255509488, 'recall': 0.7211934156378601, 'f1_score': 0.7182227828757093, 'auc': None}
Test Metrics:
{'accuracy': 0.7429305912596401, 'precision': 0.7461524389172034, 'recall': 0.7429305912596401, 'f1_score': 0.7385397523789776, 'auc': None}
{'accuracy': 0.7429305912596401, 'precision': 0.7461524389172034, 'recall': 0.7429305912596401, 'f1_score': 0.7385397523789776, 'auc': None}


In [13]:
# Put above into chat gpt and changed some of the code as node length did not match up with X.
# Look at output 

# With def transform_y_for_multi_class(Y):

import torch
import pickle
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

def load_dict_from_pickle(filename):
    with open(filename, 'rb') as file:
        loaded_dict = pickle.load(file)
    return loaded_dict

def load_data(file_path):
    import scipy.sparse

    # Load data
    X = torch.load(f'{file_path}/X.pt')
    Y = torch.load(f'{file_path}/Y.pt')
    Nodes = load_dict_from_pickle(f'{file_path}/Nodes.pkl')

    # Debugging: Print types and shapes
    print(f"Type of X after loading: {type(X)}")
    print(f"Type of Y after loading: {type(Y)}")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of Y: {Y.shape}")
    print(f"Length of Nodes: {len(Nodes)}")

    # Convert to NumPy arrays if necessary
    if isinstance(X, torch.Tensor):
        X = X.numpy()
    if isinstance(Y, torch.Tensor):
        Y = Y.numpy()

    # Ensure X is a NumPy array
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Transform Y for classification
    print(set(Y))
    Y = transform_y_for_multi_class(Y)
    print(set(Y))

    # # Adjust Nodes to match X if necessary
    # if len(Nodes) > X.shape[0]:
    #     print("Warning: Nodes has more entries than X. Adjusting Nodes to match X.")
    #     Nodes = Nodes[:X.shape[0]]
    #     print(len(Nodes))

    # # Create indices mapping
    # indices = {p: i for i, p in enumerate(Nodes)}
    # visit_indices = [indices[v] for v in Nodes if v[0] == 'V']

    # # Check for invalid indices
    # invalid_indices = [idx for idx in visit_indices if idx >= X.shape[0]]
    # if invalid_indices:
    #     print(f"Invalid indices found: {invalid_indices}")
    #     visit_indices = [idx for idx in visit_indices if idx < X.shape[0]]

    # # Subset X and Y
    # X = X[visit_indices]
    # Y = Y[visit_indices]

    return X, Y

# Rest of your code remains the same...

def transform_y_for_multi_class(Y):
    # Convert LoS into 4 classes:
    # 0 if LoS <= 1 (one day or less),
    # 1 if LoS > 1 and LoS <= 3 (three days),
    # 2 if LoS > 3 and LoS <= 7 (up to a week),
    # 3 if LoS > 7 (more than a week)
    return np.where(Y <= 1, 0,
                    np.where(Y <= 3, 1,
                             np.where(Y <= 7, 2, 3)))

def apply_smote(X, Y):
    smote = SMOTE(random_state=42)
    X_resampled, Y_resampled = smote.fit_resample(X, Y)
    
    # Ensure data is in the correct format
    if scipy.sparse.issparse(X_resampled):
        X_resampled.indptr = X_resampled.indptr.astype(np.int32)
        X_resampled.indices = X_resampled.indices.astype(np.int32)
        X_resampled = X_resampled.toarray()
    else:
        # Ensure X_resampled is a NumPy array
        X_resampled = np.asarray(X_resampled)
    
    return X_resampled, Y_resampled

def train_random_forest(X, Y, num_classes=4, num_estimators=100):
    # Split data into training, validation, and test sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=0.8, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, train_size=0.5, random_state=42)

    # Apply SMOTE to the training set
    X_train_resampled, Y_train_resampled = apply_smote(X_train, Y_train)

    # Initialize the Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=num_estimators, random_state=42)

    # Train the model
    clf.fit(X_train_resampled, Y_train_resampled)

    # Evaluate on validation set
    Y_val_pred = clf.predict(X_val)
    val_metrics = evaluate_multi_class(Y_val, Y_val_pred, num_classes)

    print("Validation Metrics:")
    print(val_metrics)

    # Evaluate on test set
    Y_test_pred = clf.predict(X_test)
    test_metrics = evaluate_multi_class(Y_test, Y_test_pred, num_classes)

    print("Test Metrics:")
    print(test_metrics)

    return clf, test_metrics

def evaluate_multi_class(true_labels, preds, num_classes):
    # Binarize the labels for multi-class AUC calculation
    true_labels_binarized = label_binarize(true_labels, classes=np.arange(num_classes))
    preds_binarized = label_binarize(preds, classes=np.arange(num_classes))

    try:
        # Calculate AUC for each class and then macro-average
        auc = roc_auc_score(true_labels_binarized, preds_binarized, average='macro', multi_class='ovr')
    except ValueError:
        auc = None

    accuracy = accuracy_score(true_labels, preds)
    precision = precision_score(true_labels, preds, average='weighted')
    recall = recall_score(true_labels, preds, average='weighted')
    f1 = f1_score(true_labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    }

def main_random_forest(file_path, num_estimators=100):
    # Load data
    X, Y = load_data(file_path)

    # Train Random Forest
    clf, metrics = train_random_forest(X, Y, num_classes=4, num_estimators=num_estimators)
    
    return metrics

# Set parameters and run the main training process
if __name__ == "__main__":
    saving_path = '../../Data/infectious'  # Adjust the path as needed
    results = main_random_forest(saving_path, num_estimators=100)
    print("Final Test Metrics:")
    print(results)


Type of X after loading: <class 'numpy.ndarray'>
Type of Y after loading: <class 'numpy.ndarray'>
Shape of X: (19442, 1456)
Shape of Y: (19442,)
Length of Nodes: 106822
{0, 1, 2}
{0, 1}
Validation Metrics:
{'accuracy': 0.808641975308642, 'precision': 0.8097579990601905, 'recall': 0.808641975308642, 'f1_score': 0.8090946457127548, 'auc': None}
Test Metrics:
{'accuracy': 0.8133676092544987, 'precision': 0.814702485330709, 'recall': 0.8133676092544987, 'f1_score': 0.8138759925547685, 'auc': None}
Final Test Metrics:
{'accuracy': 0.8133676092544987, 'precision': 0.814702485330709, 'recall': 0.8133676092544987, 'f1_score': 0.8138759925547685, 'auc': None}


In [14]:
# Without def transform_y_for_multi_class(Y):

import torch
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

def load_dict_from_pickle(filename):
    with open(filename, 'rb') as file:
        loaded_dict = pickle.load(file)
    return loaded_dict

def load_data(file_path):
    # Load data
    X = torch.load(f'{file_path}/X.pt')
    Y = torch.load(f'{file_path}/Y.pt')
    Nodes = load_dict_from_pickle(f'{file_path}/Nodes.pkl')

    # Debugging: Print types and shapes
    print(f"Type of X after loading: {type(X)}")
    print(f"Type of Y after loading: {type(Y)}")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of Y: {Y.shape}")
    print(f"Length of Nodes: {len(Nodes)}")

    # Convert to NumPy arrays if necessary
    if isinstance(X, torch.Tensor):
        X = X.numpy()
    if isinstance(Y, torch.Tensor):
        Y = Y.numpy()

    # Ensure X is a NumPy array
    X = np.asarray(X)
    Y = np.asarray(Y)

    # No need to transform Y, it already has 3 classes: [0, 1, 2]

    # # Adjust Nodes to match X if necessary
    # if len(Nodes) > X.shape[0]:
    #     print("Warning: Nodes has more entries than X. Adjusting Nodes to match X.")
    #     Nodes = Nodes[:X.shape[0]]
    #     print(len(Nodes))

    # # Create indices mapping
    # indices = {p: i for i, p in enumerate(Nodes)}
    # visit_indices = [indices[v] for v in Nodes if v[0] == 'V']

    # # Check for invalid indices
    # invalid_indices = [idx for idx in visit_indices if idx >= X.shape[0]]
    # if invalid_indices:
    #     print(f"Invalid indices found: {invalid_indices}")
    #     visit_indices = [idx for idx in visit_indices if idx < X.shape[0]]

    # # Subset X and Y
    # X = X[visit_indices]
    # Y = Y[visit_indices]

    return X, Y

def apply_smote(X, Y):
    smote = SMOTE(random_state=42)
    X_resampled, Y_resampled = smote.fit_resample(X, Y)

    # Ensure data is in the correct format
    X_resampled = np.asarray(X_resampled)
    
    return X_resampled, Y_resampled

def train_random_forest(X, Y, num_classes=3, num_estimators=100):
    # Split data into training, validation, and test sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=0.8, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, train_size=0.5, random_state=42)

    # Debugging: Check class distributions after splitting
    print(f"Class distribution in Y_train: {np.bincount(Y_train)}")
    print(f"Class distribution in Y_val: {np.bincount(Y_val)}")
    print(f"Class distribution in Y_test: {np.bincount(Y_test)}")

    # Apply SMOTE to the training set only if there are multiple classes
    if len(np.unique(Y_train)) > 1:
        X_train_resampled, Y_train_resampled = apply_smote(X_train, Y_train)
    else:
        print("Skipping SMOTE as there's only one class in Y_train.")
        X_train_resampled, Y_train_resampled = X_train, Y_train

    # Initialize the Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=num_estimators, random_state=42)

    # Train the model
    clf.fit(X_train_resampled, Y_train_resampled)

    # Evaluate on validation set
    Y_val_pred = clf.predict(X_val)
    val_metrics = evaluate_multi_class(Y_val, Y_val_pred, num_classes)

    print("Validation Metrics:")
    print(val_metrics)

    # Evaluate on test set
    Y_test_pred = clf.predict(X_test)
    test_metrics = evaluate_multi_class(Y_test, Y_test_pred, num_classes)

    print("Test Metrics:")
    print(test_metrics)

    return clf, test_metrics

def evaluate_multi_class(true_labels, preds, num_classes=3):
    # Binarize the labels for multi-class AUC calculation
    true_labels_binarized = label_binarize(true_labels, classes=np.arange(num_classes))
    preds_binarized = label_binarize(preds, classes=np.arange(num_classes))

    try:
        # Calculate AUC for each class and then macro-average
        auc = roc_auc_score(true_labels_binarized, preds_binarized, average='macro', multi_class='ovr')
    except ValueError:
        auc = None

    accuracy = accuracy_score(true_labels, preds)
    precision = precision_score(true_labels, preds, average='weighted')
    recall = recall_score(true_labels, preds, average='weighted')
    f1 = f1_score(true_labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    }

def main_random_forest(file_path, num_estimators=100):
    # Load data
    X, Y = load_data(file_path)

    # Train Random Forest
    clf, metrics = train_random_forest(X, Y, num_classes=3, num_estimators=num_estimators)
    
    return metrics

# Set parameters and run the main training process
if __name__ == "__main__":
    saving_path = '../../Data/infectious'  # Adjust the path as needed
    results = main_random_forest(saving_path, num_estimators=100)
    print("Final Test Metrics:")
    print(results)





Type of X after loading: <class 'numpy.ndarray'>
Type of Y after loading: <class 'numpy.ndarray'>
Shape of X: (19442, 1456)
Shape of Y: (19442,)
Length of Nodes: 106822
Class distribution in Y_train: [1962 4267 9324]
Class distribution in Y_val: [ 237  529 1178]
Class distribution in Y_test: [ 193  582 1170]
Validation Metrics:
{'accuracy': 0.7211934156378601, 'precision': 0.7311460255509488, 'recall': 0.7211934156378601, 'f1_score': 0.7182227828757093, 'auc': 0.7215652478846941}
Test Metrics:
{'accuracy': 0.7429305912596401, 'precision': 0.7461524389172034, 'recall': 0.7429305912596401, 'f1_score': 0.7385397523789776, 'auc': 0.733720940579991}
Final Test Metrics:
{'accuracy': 0.7429305912596401, 'precision': 0.7461524389172034, 'recall': 0.7429305912596401, 'f1_score': 0.7385397523789776, 'auc': 0.733720940579991}
