In [1]:
import os
import numpy as np
import pandas as pd
import hickle as hkl

from collections import defaultdict

from sklearn.metrics import roc_auc_score, r2_score, accuracy_score, explained_variance_score, mean_absolute_error, mean_squared_error

import keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, MaxPooling1D, Conv1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from IPython.core.display import display, HTML



# Hide GPU from visible devices

#'''
tf.config.set_visible_devices([], 'GPU')
print(f'CUDA GPU AVAILABLE: {tf.test.is_gpu_available(cuda_only=True)}')
'''
THREADS = 8
os.environ['OMP_NUM_THREADS'] = str(THREADS)
os.environ['TF_NUM_INTEROP_THREADS'] = str(THREADS)
os.environ['TF_NUM_INTRAOP_THREADS'] = str(THREADS)
tf.config.threading.set_inter_op_parallelism_threads(THREADS)
tf.config.threading.set_intra_op_parallelism_threads(THREADS)
tf.config.set_soft_device_placement(True)
'''

#'''
display(HTML("<style>.container { width:100% !important; }</style>"))
#'''

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
CUDA GPU AVAILABLE: True


In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean((y_pred - y_true) / y_true)

In [3]:
N_SPLITS = 5
N_STEPS = 20
N_DAYS = 60
DATA_DIR = f'data/processed_data/cv_data__{N_SPLITS}_class_id_folds__{N_STEPS}_steps__{N_DAYS}_days'

In [4]:
results = []

for i in range(N_SPLITS):

    # Load one partition of folds
    data = hkl.load(f'{DATA_DIR}/fold_{i}.hkl')
    action_training_input = data['action_training_input']
    recurrent_training_input = data['recurrent_training_input']
    prior_training_input = data['prior_training_input']
    completion_training_target = data['completion_training_target']
    problems_training_target = data['problems_training_target']
    training_target_sequence = data['training_target_sequence']
    action_testing_input = data['action_testing_input']
    recurrent_testing_input = data['recurrent_testing_input']
    prior_testing_input = data['prior_testing_input']
    completion_testing_target = data['completion_testing_target']
    problems_testing_target = data['problems_testing_target']
    testing_target_sequence = data['testing_target_sequence']
    
    # Clear session so models don't pile up
    keras.backend.clear_session()
    
    # Recurrent Input
    recurrent_input_layer = Input(shape=recurrent_training_input[0].shape, name='recurrent')
    recurrent_model = Conv1D(filters=128, kernel_size=3, activation='relu')(recurrent_input_layer)
    recurrent_model = MaxPooling1D(pool_size=2)(recurrent_model)
    recurrent_model = Dropout(0.5)(recurrent_model)
    recurrent_model = Conv1D(filters=64, kernel_size=3, activation='relu')(recurrent_model)
    recurrent_model = MaxPooling1D(pool_size=2)(recurrent_model)
    recurrent_model = Flatten()(recurrent_model)
    recurrent_model = Dropout(0.5)(recurrent_model)
    recurrent_model = Dense(units=32, activation='relu')(recurrent_model)
    recurrent_model = Dropout(0.5)(recurrent_model)
    completion_output_layer = Dense(units=1, activation='sigmoid', name='completion')(recurrent_model)
    problems_output_layer = Dense(units=1, activation='linear', name='problems')(recurrent_model)
    
    combined_model = Model(recurrent_input_layer, [completion_output_layer, problems_output_layer])
    combined_model.compile(optimizer=Adam(learning_rate=1e-4), loss={'completion': 'binary_crossentropy', 'problems': 'mse'})

    # Train model
    es = [EarlyStopping(monitor='val_loss', patience=10, min_delta=0, restore_best_weights=True)]
    weights = {'completion': np.ones_like(completion_training_target) * 16, 'problems': completion_training_target}
    combined_model.fit(x={'recurrent': recurrent_training_input},
                       y={'completion': completion_training_target, 'problems': problems_training_target},
                       epochs=1000,
                       validation_split=0.25,
                       callbacks=es,
                       sample_weight=weights,
                       verbose=1)
    
    # Store model predictions
    completion_testing_output, problems_testing_output = combined_model.predict({'recurrent': recurrent_testing_input})
    df = pd.DataFrame(zip(np.ones_like(testing_target_sequence).flatten() * i, 
                          testing_target_sequence.flatten(), 
                          completion_testing_target.flatten(), 
                          problems_testing_target.flatten(), 
                          completion_testing_output.flatten(), 
                          problems_testing_output.flatten()), 
                      columns = ['fold', 
                                 'target_sequence', 
                                 'completion_target', 
                                 'problems_target', 
                                 'completion_prediction', 
                                 'problems_prediction'])
    results.append(df)
    

pd.concat(results).to_csv('cnn_cross_validation_results.csv', index=False)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000


Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000


Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000


In [5]:
# Load the data

exp_norm_map = pd.read_csv('data/experiment_information/exp_norm_map.csv')
results = pd.read_csv('cnn_cross_validation_results.csv')

In [6]:
# Evaluate the results

norm_exp_dict = defaultdict(lambda: 'None')
for n, e in zip(exp_norm_map['normal_id'], exp_norm_map['experiment_id']):
    norm_exp_dict[n] = e
results['experiment_sequence'] = results['target_sequence'].map(norm_exp_dict)

metrics = []

# Calculate the metrics for each sequence
for sequence, df in results.groupby('target_sequence'):
    
    completion_target = df['completion_target']
    
    completion_prediction = df['completion_prediction']
    completion_auc = roc_auc_score(completion_target, completion_prediction) if len(completion_target.unique()) > 1 else None
    completion_acc = accuracy_score(completion_target, completion_prediction > 0.5)
    completion_r2 = r2_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
    completion_ev = explained_variance_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
    completion_mse = mean_squared_error(completion_target, completion_prediction)
    
    problems_target = df[df['completion_target'] == 1]['problems_target']
    problems_prediction = df[df['completion_target'] == 1]['problems_prediction']
    problems_mae = mean_absolute_error(problems_target, problems_prediction)
    problems_mape = mean_absolute_percentage_error(problems_target, problems_prediction)
    problems_r2 = r2_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
    problems_ev = explained_variance_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
    problems_mse = mean_squared_error(problems_target, problems_prediction)

    metrics.append([sequence, 
                    norm_exp_dict[sequence], 
                    len(df), 
                    completion_auc, 
                    completion_acc, 
                    completion_r2, 
                    completion_ev, 
                    completion_mse, 
                    problems_mae, 
                    problems_mape, 
                    problems_r2, 
                    problems_ev, 
                    problems_mse])

# Calculate the metrics for each fold
for fold, df in results.groupby('fold'):
    
    completion_target = df['completion_target']
    completion_prediction = df['completion_prediction']
    completion_auc = roc_auc_score(completion_target, completion_prediction) if len(completion_target.unique()) > 1 else None
    completion_acc = accuracy_score(completion_target, completion_prediction > 0.5)
    completion_r2 = r2_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
    completion_ev = explained_variance_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
    completion_mse = mean_squared_error(completion_target, completion_prediction)
    
    problems_target = df[df['completion_target'] == 1]['problems_target']
    problems_prediction = df[df['completion_target'] == 1]['problems_prediction']
    problems_mae = mean_absolute_error(problems_target, problems_prediction)
    problems_mape = mean_absolute_percentage_error(problems_target, problems_prediction)
    problems_r2 = r2_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
    problems_ev = explained_variance_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
    problems_mse = mean_squared_error(problems_target, problems_prediction)

    metrics.append([f'fold_{fold}', 
                    'None', 
                    len(df), 
                    completion_auc, 
                    completion_acc, 
                    completion_r2, 
                    completion_ev, 
                    completion_mse, 
                    problems_mae, 
                    problems_mape, 
                    problems_r2, 
                    problems_ev, 
                    problems_mse])

# Calculate the metrics for everything combined
completion_target = results['completion_target']
completion_prediction = results['completion_prediction']
completion_auc = roc_auc_score(completion_target, completion_prediction) if len(completion_target.unique()) > 1 else None
completion_acc = accuracy_score(completion_target, completion_prediction > 0.5)
completion_r2 = r2_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
completion_ev = explained_variance_score(completion_target, completion_prediction) if len(completion_target) > 1 else None
completion_mse = mean_squared_error(completion_target, completion_prediction)

problems_target = results[results['completion_target'] == 1]['problems_target']
problems_prediction = results[results['completion_target'] == 1]['problems_prediction']
problems_mae = mean_absolute_error(problems_target, problems_prediction)
problems_mape = mean_absolute_percentage_error(problems_target, problems_prediction)
problems_r2 = r2_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
problems_ev = explained_variance_score(problems_target, problems_prediction) if len(problems_target) > 1 else None
problems_mse = mean_squared_error(problems_target, problems_prediction)

metrics.append(['all_target_data', 
                'None', 
                len(results), 
                completion_auc, 
                completion_acc, 
                completion_r2, 
                completion_ev, 
                completion_mse, 
                problems_mae, 
                problems_mape, 
                problems_r2, 
                problems_ev, 
                problems_mse])

metrics = pd.DataFrame(metrics, 
                       columns=['sequence_id', 
                                'experiment_id', 
                                'sample_size', 
                                'completion_auc', 
                                'completion_acc', 
                                'completion_r2', 
                                'completion_ev', 
                                'completion_mse', 
                                'problems_mae', 
                                'problems_mape', 
                                'problems_r2', 
                                'problems_ev', 
                                'problems_mse'])

metrics.to_csv('cnn_cross_validation_metrics.csv', index=False)

In [7]:
metrics.iloc[-6:]

Unnamed: 0,sequence_id,experiment_id,sample_size,completion_auc,completion_acc,completion_r2,completion_ev,completion_mse,problems_mae,problems_mape,problems_r2,problems_ev,problems_mse
288,fold_0,,42114,0.713366,0.76065,0.107669,0.109761,0.167063,1.81415,0.118643,0.010503,0.026048,7.490417
289,fold_1,,42114,0.716603,0.756494,0.114959,0.115912,0.168484,1.863959,0.109904,-0.000234,0.019474,9.544704
290,fold_2,,42114,0.716577,0.758869,0.11549,0.116139,0.167756,1.897481,0.118699,0.00099,0.018566,10.232706
291,fold_3,,42113,0.717868,0.752238,0.113531,0.114868,0.17027,1.857417,0.136229,0.006839,0.017695,7.831766
292,fold_4,,42113,0.712242,0.756299,0.109536,0.110199,0.169385,1.86697,0.120647,0.003255,0.020051,8.111379
293,all_target_data,,210568,0.71396,0.75691,0.1123,0.112319,0.168592,1.859935,0.120805,0.004024,0.020033,8.641575
