In [1]:
import os
import time
import numpy as np
import pandas as pd
import hickle as hkl

from collections import defaultdict

from sklearn.metrics import roc_auc_score, r2_score, accuracy_score, explained_variance_score, mean_absolute_error, mean_squared_error

import keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Masking, Dropout, LSTM, Dense, Concatenate, LayerNormalization
from tensorflow.keras.callbacks import EarlyStopping

from IPython.core.display import display, HTML



# Hide every GPU from TensorFlow so nothing in this notebook is placed on one.
tf.config.set_visible_devices([], 'GPU')

# `tf.test.is_gpu_available` is deprecated; TensorFlow's deprecation notice
# names `tf.config.list_physical_devices('GPU')` as its replacement.
# NOTE: this reports whether a GPU is physically present on the machine (any
# vendor, not only CUDA); it does not reflect the visibility change made above.
print(f"CUDA GPU AVAILABLE: {bool(tf.config.list_physical_devices('GPU'))}")

# Optional CPU thread configuration -- currently disabled. Uncomment the lines
# below to cap the thread pools at THREADS.
# THREADS = 8
# os.environ['OMP_NUM_THREADS'] = str(THREADS)
# os.environ['TF_NUM_INTEROP_THREADS'] = str(THREADS)
# os.environ['TF_NUM_INTRAOP_THREADS'] = str(THREADS)
# tf.config.threading.set_inter_op_parallelism_threads(THREADS)
# tf.config.threading.set_intra_op_parallelism_threads(THREADS)
# tf.config.set_soft_device_placement(True)

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
CUDA GPU AVAILABLE: True


In [2]:
# Wait for process_data to finish: poll the processed-data directory every
# 10 minutes and continue as soon as the expected file is listed there.
while True:
    if 'experiment_data__20_steps__60_days.hkl' in os.listdir('data/processed_data'):
        break
    time.sleep(600)

In [3]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10%).

    Computes ``mean(|y_pred - y_true| / |y_true|)``. The absolute value is
    essential: without it positive and negative errors cancel and the result
    is a signed bias, not an error magnitude.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Entries equal to zero yield ``inf``/``nan``
        terms, exactly as a plain division would.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        The mean absolute percentage error (``nan`` for empty input).
    """
    # Convert up front so lists work and pairing is strictly positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [4]:
# The two values embedded in the processed-data file name.
N_STEPS, N_DAYS = 20, 60

# Path of the processed experiment data file loaded by the training cell.
DATA = f'data/processed_data/experiment_data__{N_STEPS}_steps__{N_DAYS}_days.hkl'

In [None]:
# Collects one predictions DataFrame per trained model.
results = []

# Load the processed data file and bind each stored entry to a notebook-level
# name. The names on the left line up one-to-one with the keys on the right.
data = hkl.load(DATA)
(
    action_remnant_input,
    recurrent_remnant_input,
    prior_remnant_input,
    completion_remnant_target,
    problems_remnant_target,
    action_experiment_input,
    recurrent_experiment_input,
    prior_experiment_input,
    completion_experiment_target,
    problems_experiment_target,
    experiment_sequence,
    experiment_assignment_log_id,
) = (
    data[key]
    for key in (
        'action_remnant_input',
        'recurrent_remnant_input',
        'prior_remnant_input',
        'completion_remnant_target',
        'problems_remnant_target',
        'action_experiment_input',
        'recurrent_experiment_input',
        'prior_experiment_input',
        'completion_experiment_target',
        'problems_experiment_target',
        'experiment_sequence',
        'experiment_assignment_log_id',
    )
)

# Clear session so models don't pile up when this cell is re-run.
# This goes through tf.keras (not the standalone `keras` package) because every
# layer and model below is created from tensorflow.keras.
tf.keras.backend.clear_session()

# Create models
#
# Every model below has two outputs: a sigmoid "completion" head and a linear
# "problems" head. The `*_model_hook` tensors (taken before the final dropout)
# are concatenated to feed the combined model, so the combined model shares
# those layers with the three partial models.

# Action Model
action_input_layer = Input(shape=action_remnant_input[0].shape, name='action')
action_features = Dropout(rate=0.5)(action_input_layer)
action_model_hook = LSTM(units=64, return_sequences=False, activation='relu', dropout=0.5, recurrent_dropout=0.5)(action_features)
action_features = Dropout(rate=0.5)(action_model_hook)

action_completion_output_layer = Dense(units=1, activation='sigmoid', name='action_completion')(action_features)
action_problems_output_layer = Dense(units=1, activation='linear', name='action_problems')(action_features)

action_model = Model(action_input_layer, [action_completion_output_layer, action_problems_output_layer])
action_model.compile(optimizer='adam', loss={'action_completion': 'binary_crossentropy', 'action_problems': 'mse'})


# Recurrent Model (all-zero timesteps are masked out before the LSTM)
recurrent_input_layer = Input(shape=recurrent_remnant_input[0].shape, name='recurrent')
recurrent_features = Masking(mask_value=0.0)(recurrent_input_layer)
recurrent_features = Dropout(rate=0.5)(recurrent_features)
recurrent_model_hook = LSTM(units=64, return_sequences=False, dropout=0.5, recurrent_dropout=0.5)(recurrent_features)
recurrent_features = Dropout(rate=0.5)(recurrent_model_hook)

recurrent_completion_output_layer = Dense(units=1, activation='sigmoid', name='recurrent_completion')(recurrent_features)
recurrent_problems_output_layer = Dense(units=1, activation='linear', name='recurrent_problems')(recurrent_features)

recurrent_model = Model(recurrent_input_layer, [recurrent_completion_output_layer, recurrent_problems_output_layer])
recurrent_model.compile(optimizer='adam', loss={'recurrent_completion': 'binary_crossentropy', 'recurrent_problems': 'mse'})


# Prior Model (a single dense hidden layer instead of an LSTM)
prior_input_layer = Input(shape=prior_remnant_input[0].shape, name='prior')
prior_features = Dropout(rate=0.5)(prior_input_layer)
prior_model_hook = Dense(units=64, activation='sigmoid')(prior_features)
prior_features = Dropout(rate=0.5)(prior_model_hook)

prior_completion_output_layer = Dense(units=1, activation='sigmoid', name='prior_completion')(prior_features)
prior_problems_output_layer = Dense(units=1, activation='linear', name='prior_problems')(prior_features)

prior_model = Model(prior_input_layer, [prior_completion_output_layer, prior_problems_output_layer])
prior_model.compile(optimizer='adam', loss={'prior_completion': 'binary_crossentropy', 'prior_problems': 'mse'})


# Combined Model: concatenates the three hook representations
combined_features = Concatenate()([action_model_hook, recurrent_model_hook, prior_model_hook])
combined_features = Dropout(rate=0.5)(combined_features)
combined_completion_output_layer = Dense(units=1, activation='sigmoid', name='combined_completion')(combined_features)
combined_problems_output_layer = Dense(units=1, activation='linear', name='combined_problems')(combined_features)

combined_model = Model([action_input_layer, recurrent_input_layer, prior_input_layer], [combined_completion_output_layer, combined_problems_output_layer])
combined_model.compile(optimizer='adam', loss={'combined_completion': 'binary_crossentropy', 'combined_problems': 'mse'})


# Train Models
# One early-stopping configuration is shared by every fit below: stop once
# val_loss has not improved for 10 epochs and restore the best weights.
es = [EarlyStopping(monitor='val_loss', patience=10, min_delta=0, restore_best_weights=True)]


def _prediction_frame(model_label, completion_output, problems_output):
    """Build the results table for one model's predictions on the experiment data.

    Parameters
    ----------
    model_label : str
        Value written to the 'model' column of every row.
    completion_output, problems_output : array-like with ``.flatten()``
        The two prediction arrays returned by ``predict`` for that model.

    Returns
    -------
    pandas.DataFrame
        One row per experiment sample, holding the label, the identifiers,
        the targets, and the two predictions.
    """
    return pd.DataFrame(zip(np.array([model_label] * experiment_sequence.size),
                            experiment_sequence.flatten(),
                            experiment_assignment_log_id.flatten(),
                            completion_experiment_target.flatten(),
                            problems_experiment_target.flatten(),
                            completion_output.flatten(),
                            problems_output.flatten()),
                        columns=['model',
                                 'sequence_id',
                                 'assignment_log_id',
                                 'completion_target',
                                 'problems_target',
                                 'completion_prediction',
                                 'problems_prediction'])


# (label stored in the results, output-name prefix, model, training inputs,
#  experiment inputs). Note the recurrent model is reported as 'assignment'
# and the prior model as 'student'.
# The order matters: the combined model is built on the partial models' hook
# layers, so each partial model's predictions are taken right after its own
# fit and before the combined model is trained.
training_plan = [
    ('action', 'action', action_model,
     {'action': action_remnant_input},
     {'action': action_experiment_input}),
    ('assignment', 'recurrent', recurrent_model,
     {'recurrent': recurrent_remnant_input},
     {'recurrent': recurrent_experiment_input}),
    ('student', 'prior', prior_model,
     {'prior': prior_remnant_input},
     {'prior': prior_experiment_input}),
    ('combined', 'combined', combined_model,
     {'action': action_remnant_input, 'recurrent': recurrent_remnant_input, 'prior': prior_remnant_input},
     {'action': action_experiment_input, 'recurrent': recurrent_experiment_input, 'prior': prior_experiment_input}),
]

for result_label, output_prefix, network, remnant_inputs, experiment_inputs in training_plan:
    completion_name = f'{output_prefix}_completion'
    problems_name = f'{output_prefix}_problems'

    # NOTE(review): the problems output is given all-zero sample weights, so it
    # presumably contributes nothing to the training loss and the stored
    # problems predictions come from an effectively untrained head -- confirm
    # this is intended before relying on the problems_* metrics.
    weights = {completion_name: np.ones_like(completion_remnant_target),
               problems_name: np.zeros_like(problems_remnant_target)}
    network.fit(x=remnant_inputs,
                y={completion_name: completion_remnant_target, problems_name: problems_remnant_target},
                epochs=1000,
                validation_split=0.25,
                callbacks=es,
                sample_weight=weights,
                verbose=1)

    # Store model predictions on the experiment data
    completion_experiment_output, problems_experiment_output = network.predict(experiment_inputs)
    df = _prediction_frame(result_label, completion_experiment_output, problems_experiment_output)
    results.append(df)

# Stack the per-model prediction tables, attach the experiment conditions by
# assignment_log_id (left join keeps every prediction row), and save to CSV.
stacked_predictions = pd.concat(results)
experiment_conditions = pd.read_csv('data/experiment_information/experiment_conditions.csv')
results = stacked_predictions.merge(experiment_conditions, on='assignment_log_id', how='left')
results.to_csv('experiment_results.csv', index=False)

In [None]:
# Read the saved experiment results back from disk.
results = pd.read_csv(filepath_or_buffer='experiment_results.csv')

In [None]:
# Evaluate the results

def _score_group(group):
    """Compute the completion and problems metrics for one slice of `results`.

    Completion metrics use every row of ``group``. Problems metrics use only
    the rows whose ``completion_target`` equals 1.

    A metric is ``None`` when it cannot be computed for the slice:
    AUC needs more than one distinct target value, R^2 / explained variance
    need more than one row, and all problems metrics need at least one row
    with ``completion_target == 1`` (previously an empty subset made the
    scikit-learn calls raise).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with 'completion_target', 'completion_prediction',
        'problems_target' and 'problems_prediction' columns.

    Returns
    -------
    list
        ``[sample_size, completion_auc, completion_acc, completion_r2,
        completion_ev, completion_mse, problems_mae, problems_mape,
        problems_r2, problems_ev, problems_mse]``.
    """
    completion_target = group['completion_target']
    completion_prediction = group['completion_prediction']

    has_several_classes = len(completion_target.unique()) > 1
    has_several_rows = len(completion_target) > 1

    completion_auc = roc_auc_score(completion_target, completion_prediction) if has_several_classes else None
    # Predictions above 0.5 count as "completed".
    completion_acc = accuracy_score(completion_target, completion_prediction > 0.5)
    completion_r2 = r2_score(completion_target, completion_prediction) if has_several_rows else None
    completion_ev = explained_variance_score(completion_target, completion_prediction) if has_several_rows else None
    completion_mse = mean_squared_error(completion_target, completion_prediction)

    completed = group[group['completion_target'] == 1]
    problems_target = completed['problems_target']
    problems_prediction = completed['problems_prediction']

    problems_mae = problems_mape = problems_mse = None
    problems_r2 = problems_ev = None
    if len(problems_target) > 0:
        problems_mae = mean_absolute_error(problems_target, problems_prediction)
        problems_mape = mean_absolute_percentage_error(problems_target, problems_prediction)
        problems_mse = mean_squared_error(problems_target, problems_prediction)
    if len(problems_target) > 1:
        problems_r2 = r2_score(problems_target, problems_prediction)
        problems_ev = explained_variance_score(problems_target, problems_prediction)

    return [len(group),
            completion_auc,
            completion_acc,
            completion_r2,
            completion_ev,
            completion_mse,
            problems_mae,
            problems_mape,
            problems_r2,
            problems_ev,
            problems_mse]


metrics = []

# Calculate the metrics for each sequence
for (model, sequence), df in results.groupby(['model', 'target_sequence']):
    metrics.append([model, sequence] + _score_group(df))

# Calculate the metrics for everything combined
for model, df in results.groupby('model'):
    metrics.append([model, 'all_data'] + _score_group(df))

metrics = pd.DataFrame(metrics,
                       columns=['model',
                                'group',
                                'sample_size',
                                'completion_auc',
                                'completion_acc',
                                'completion_r2',
                                'completion_ev',
                                'completion_mse',
                                'problems_mae',
                                'problems_mape',
                                'problems_r2',
                                'problems_ev',
                                'problems_mse'])

metrics.to_csv('experiment_metrics.csv', index=False)

In [None]:
# Show the last ten rows of the metrics table.
metrics.tail(10)