In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from scipy.stats import randint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
%cd ../
from src import create_fake_patients

import warnings

# Filter out specific warning types
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

#### Prepare data

In [None]:
# Three independent synthetic cohorts, generated in this order:
# cross-validation (9000 patients), hold-out test (2000) and
# post-recalibration (2000). All share the same max_events / max_nodes.
cv_data, test_data, post_recal_data = (
    create_fake_patients.create_fake_patient_df(
        num_patients=n_patients, max_events=100, max_nodes=512
    )
    for n_patients in (9000, 2000, 2000)
)

In [None]:
def map_values(value):
    """Encode a replacement type as a one-element binary label list.

    Returns ``[1]`` when ``'hip'`` is contained in ``value`` and ``[0]``
    otherwise.
    """
    return [int('hip' in value)]
    
def prep_data_for_rnn(data):
    """Add the padded event-sequence and binary label columns used by the RNN.

    For every row, the second element of each entry in ``data['indices']`` is
    taken as the event code. Each sequence is padded with ``-1`` up to the
    length of the longest sequence in ``data`` and then reversed, so the
    padding ends up at the front of the list and the events appear in
    reversed order after it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide an ``'indices'`` column (each cell a sized iterable of
        indexable entries) and a ``'replace_type'`` column whose values are
        accepted by ``map_values``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object (modified in place) with two extra columns:
        ``'event_seq'`` (list of event codes per row) and
        ``'ohe_hip_binary'`` (``[1]`` / ``[0]`` per row).
    """
    pad_value = -1

    # default=0 keeps an empty frame from raising inside max().
    max_length = max((len(item) for item in data['indices']), default=0)
    print(f"Maximum list length in 'indices': {max_length}")

    def _padded_reversed(events):
        # Keep only the event code (position 1) of every entry, pad on the
        # right, then reverse so the padding leads the sequence.
        codes = [entry[1] for entry in events]
        padded = codes + [pad_value] * (max_length - len(codes))
        return padded[::-1]

    # Build the whole column in one assignment. Writing row by row through
    # ``data['event_seq'][i] = ...`` is chained indexing: pandas warns about
    # it and, with copy-on-write enabled, the write never reaches ``data``.
    data['event_seq'] = data['indices'].apply(_padded_reversed)

    # Binary target derived from the replacement type.
    data['ohe_hip_binary'] = data['replace_type'].apply(map_values)
    return data
    
# Add the 'event_seq' and 'ohe_hip_binary' columns to each cohort, in the
# order cross-validation, test, post-recalibration.
cv_data, test_data, post_recal_data = (
    prep_data_for_rnn(cohort)
    for cohort in (cv_data, test_data, post_recal_data)
)

In [None]:
# Every padded event sequence of the cross-validation cohort.
list_of_lists = list(cv_data['event_seq'])

# Flatten the per-patient sequences into one long list of event codes.
merged_list = []
for sublist in list_of_lists:
    merged_list.extend(sublist)

# Number of distinct values over all sequences. The -1 padding value is
# counted as one of them whenever any sequence was padded.
unique_elements_length = len(set(merged_list))

print("Number of Unique Events:", unique_elements_length)

In [None]:
# Prepare inputs and labels for model

def input_labels_rnn(data, top_100=False, event_cutoff=100):
    """Turn a prepared patient frame into RNN inputs and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``'event_seq'`` (equal-length lists of event codes per
        row) and ``'ohe_hip_binary'`` (a list of label values per row).
    top_100 : bool, default False
        When True, every event code ``>= event_cutoff`` is removed from each
        sequence and one ``-1`` is appended per removed event, so the
        sequence length is unchanged.
    event_cutoff : int, default 100
        Threshold used by ``top_100``; ignored when ``top_100`` is False.

    Returns
    -------
    X : numpy.ndarray, float32, shape (n_rows, sequence_length, 1)
    y : numpy.ndarray, float32, the per-row label lists flattened into 1-D

    Notes
    -----
    ``data`` is never modified. Earlier versions filtered the lists in place
    and wrote them back through chained indexing using the inner loop index,
    which targeted the wrong row; deleting while iterating also skipped the
    element following each removed event.
    """
    pad_value = -1
    sequences = data['event_seq'].tolist()

    if top_100:
        filtered_sequences = []
        for sequence in sequences:
            # ``not (event >= cutoff)`` mirrors the original comparison.
            kept = [event for event in sequence if not event >= event_cutoff]
            n_removed = len(sequence) - len(kept)
            filtered_sequences.append(kept + [pad_value] * n_removed)
        sequences = filtered_sequences

    X = np.array(sequences, dtype=np.float32)
    if X.ndim < 2 and X.size == 0:
        # No rows at all: return an empty but correctly ranked input tensor.
        X = X.reshape((0, 0, 1))
    else:
        # samples (num of patients), timesteps, number of variables per timestep
        X = X.reshape((X.shape[0], X.shape[1], 1))

    labels = data['ohe_hip_binary'].tolist()
    flat_labels = [value for row_labels in labels for value in row_labels]
    y = np.array(flat_labels, dtype=np.float32)

    return X, y

# Model inputs / labels for the cross-validation, test and
# post-recalibration cohorts (built in that order).
(X_cv, y_cv), (X_test, y_test), (X_test2, y_test2) = (
    input_labels_rnn(cohort)
    for cohort in (cv_data, test_data, post_recal_data)
)

##### RNN Model

In [None]:
# RNN Model

def create_rnn_model(num_hidden_layers=1, activation_function='relu', learning_rate=0.001):
    """Build and compile the SimpleRNN binary classifier.

    The network is a 64-unit ``SimpleRNN`` followed by ``num_hidden_layers``
    dense layers of 128 units (all using ``activation_function``) and a
    single sigmoid output unit. It is compiled with binary cross-entropy,
    an Adam optimizer at ``learning_rate`` and the accuracy metric.
    """
    layers = [SimpleRNN(units=64, activation=activation_function)]
    layers.extend(
        Dense(128, activation=activation_function)
        for _ in range(num_hidden_layers)
    )
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential()
    for layer in layers:
        model.add(layer)

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


# Scorer names handed to GridSearchCV for every cross-validation fold.
metric_scoring = [
    'accuracy',
    'roc_auc',
    'f1',
    'precision',
    'recall',
]

#### RNN Read Codes only

This model uses only the sequence of Read codes as input; no elapsed-time information is included.

In [None]:
# Local import: only needed to create the output directory used below.
import os

# Stop a fit once the training loss has not improved for 10 epochs and
# restore the best weights seen during that fit.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

keras_classifier = KerasClassifier(build_fn=create_rnn_model, verbose=0)

activation_type = 'relu'  # alternative: 'gelu'

param_grid = {
    'num_hidden_layers': [2],  # alternatives: [2, 3, 4]
    'activation_function': [activation_type],
    'learning_rate': [0.01],  # alternatives: [0.001, 0.01]
}

# Create the output directory BEFORE the slow grid search so that a missing
# folder cannot make the cell fail after all the training has been done.
proba_dir = 'pred_proba_and_true'
os.makedirs(proba_dir, exist_ok=True)

grid_search = GridSearchCV(estimator=keras_classifier,
                           param_grid=param_grid,
                           scoring=metric_scoring,
                           cv=5,
                           verbose=2,
                           return_train_score=True,
                           refit='accuracy')

# Perform grid search and cross-validation
grid_search.fit(X_cv, y_cv, epochs=20, callbacks=[callback])

best_params = grid_search.best_params_  # best mean CV accuracy (refit='accuracy')
cv_results = grid_search.cv_results_
best_index = grid_search.best_index_

print(f'Best Parameters: {best_params}')
mean_auc = cv_results['mean_test_roc_auc'][best_index]
std_auc = cv_results['std_test_roc_auc'][best_index]
mean_accuracy = cv_results['mean_test_accuracy'][best_index]
std_accuracy = cv_results['std_test_accuracy'][best_index]
print(f'Mean test (cv) AUC: {mean_auc:.4f} +/- {std_auc:.4f}')
# Accuracy scores are fractions in [0, 1]; the '%' format spec scales them
# so the printed percent sign is actually correct.
print(f'Mean test (cv) Accuracy: {mean_accuracy:.2%} +/- {std_accuracy:.2%}')
df = pd.DataFrame(cv_results)

# Predicted probabilities of the refitted best model on both hold-out sets.
best_estimator = grid_search.best_estimator_
y_pred_proba = best_estimator.predict_proba(X_test)
y_pred_proba2 = best_estimator.predict_proba(X_test2)

# Name the output files after the winning hyper-parameters, so the labels
# stay correct when the grid contains more than one value per parameter.
file_name_layers = best_params['num_hidden_layers']
file_name_lr = best_params['learning_rate']
file_name_activation = best_params['activation_function']
run_tag = f'{file_name_layers}_layers_{file_name_lr}_lr_{file_name_activation}'

df.to_csv(f'temp_RNN_{run_tag}_results.csv')

# SAVE THE PROBABILITIES AND TRUE VALUES FROM THE BEST RNN MODEL ON BOTH TEST SETS
file_full_name_proba = f'{proba_dir}/RNN_{run_tag}_holdout1_proba.npy'
file_full_name_true = f'{proba_dir}/RNN_{run_tag}_holdout1_true.npy'
file_full_name_proba2 = f'{proba_dir}/RNN_{run_tag}_holdout2_proba.npy'
file_full_name_true2 = f'{proba_dir}/RNN_{run_tag}_holdout2_true.npy'

for out_path, array in (
    (file_full_name_proba, y_pred_proba),
    (file_full_name_true, y_test),
    (file_full_name_proba2, y_pred_proba2),
    (file_full_name_true2, y_test2),
):
    with open(out_path, 'wb') as f:
        np.save(f, array)

df