In [1]:
import os
import numpy as np
import pandas as pd
import hickle as hkl

from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Constants

MDL = 60                # passed as max_day_length to process_action_input
MSL = 20                # passed as max_sequence_length to process_recurrent_input
N_SPLITS = 5            # number of GroupKFold folds written out
SPLIT_ON = 'class_id'   # remnant_targets column used as the GroupKFold group

In [3]:
# Load the raw data

# All remnant/experiment CSVs live in the same folder; only the file name varies.
_raw_dir = 'raw_data/all_skill_builders_with_skill_info_in_remnant'

remnant_actions = pd.read_csv(f'{_raw_dir}/remnant_actions.csv')
remnant_inputs = pd.read_csv(f'{_raw_dir}/remnant_inputs.csv')
remnant_targets = pd.read_csv(f'{_raw_dir}/remnant_targets.csv')

experiment_actions = pd.read_csv(f'{_raw_dir}/experiment_actions.csv')
experiment_inputs = pd.read_csv(f'{_raw_dir}/experiment_inputs.csv')
experiment_targets = pd.read_csv(f'{_raw_dir}/experiment_targets.csv')

# Mapping between normal and experiment sequence ids (columns normal_id / experiment_id are used later).
exp_norm_map = pd.read_csv('experiment_information/exp_norm_map.csv')

In [4]:
# Begin to partition the features that will be processed
#
# The lists below are built from their naming patterns instead of being spelled
# out one name per line; contents and ordering match the explicit lists exactly.

# Action ids run 1..35, with 20 split into the two variants 20a and 20b.
_action_ids = [str(i) for i in range(1, 20)] + ['20a', '20b'] + [str(i) for i in range(21, 36)]
action_continuous_features = [f'ln_action_{action_id}_count' for action_id in _action_ids]

recurrent_categorical_features = [
    # 'directory_1',
    # 'directory_2',
    # 'directory_3',
    'is_skill_builder',
    'has_due_date',
    'assignment_completed',
]

# Metrics whose un-suffixed form does not exist: they come as _raw / _normalized / _class_percentile.
_raw_suffixed_metrics = [
    'session_count',
    'day_count',
    'completed_problem_count',
    'median_ln_problem_time_on_task',
    'median_ln_problem_first_response_time',
]
# Metrics whose first variant is the bare name, followed by _normalized / _class_percentile.
_bare_metrics = [
    'average_problem_attempt_count',
    'average_problem_answer_first',
    'average_problem_correctness',
    'average_problem_hint_count',
    'average_problem_answer_given',
]
recurrent_continuous_features = (
    ['time_since_last_assignment_start']
    + [metric + suffix
       for metric in _raw_suffixed_metrics
       for suffix in ('_raw', '_normalized', '_class_percentile')]
    + [metric + suffix
       for metric in _bare_metrics
       for suffix in ('', '_normalized', '_class_percentile')]
)

prior_categorical_features = ['has_due_date']

# Statistics available only at the student level ...
_student_only_priors = [
    'assignments_started',
    'assignments_percent_completed',
    'median_ln_assignment_time_on_task',
]
# ... and statistics available both per student and per student+skill.
_shared_priors = [
    'average_problems_per_assignment',
    'median_ln_problem_time_on_task',
    'median_ln_problem_first_response_time',
    'average_problem_correctness',
    'average_problem_attempt_count',
    'average_answer_first',
    'average_problem_hint_count',
]
prior_continuous_features = (
    ['student_prior_' + stat for stat in _student_only_priors + _shared_priors]
    + ['student_skill_prior_' + stat for stat in _shared_priors]
)

In [5]:
# Functions

def process_action_input(df_list, target_times, max_day_length, scaler=None, pca=None, n_components=None):
    """Turn per-sample action histories into fixed-length, PCA-reduced daily series.

    Parameters
    ----------
    df_list : list of DataFrame
        One frame per sample; each must hold a 'timestamp' column and the
        columns named in ``action_continuous_features``.
    target_times : sequence of numbers
        One timestamp (seconds) per sample; the window for a sample ends at
        this time rounded down to a multiple of 86400.
    max_day_length : int
        Number of 86400-second steps in each window.
    scaler, pca, n_components : optional
        Previously fitted MinMaxScaler / PCA and the number of components to
        keep. When ``scaler`` is None all three are fitted from ``df_list``.

    Returns
    -------
    tuple
        ``(stacked_array, scaler, pca, n_components)`` where ``stacked_array``
        is ``np.stack`` of the per-sample windows.
    """
    seconds_per_day = 86400

    if scaler is None:
        # Fit on every action row of every sample, stacked vertically.
        stacked_rows = np.concatenate([frame[action_continuous_features].values for frame in df_list])
        scaler = MinMaxScaler().fit(stacked_rows)
        pca = PCA().fit(scaler.transform(stacked_rows))
        # Smallest number of leading components whose cumulative explained variance exceeds 90%.
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        n_components = np.flatnonzero(cumulative_variance > 0.9).min() + 1

    def _daily_window(frame, target_time):
        """Build the scaled, PCA-projected window that ends just before target_time's day."""
        window_end = target_time - (target_time % seconds_per_day)
        window_start = window_end - max_day_length * seconds_per_day
        day_grid = pd.DataFrame(data=np.arange(window_start, window_end, seconds_per_day),
                                columns=['timestamp'])
        # Left join keeps every grid day; days without a matching row become zeros.
        joined = day_grid.merge(frame, how='left', on='timestamp')
        daily_values = joined[action_continuous_features].fillna(0).values
        return pca.transform(scaler.transform(daily_values))[:, :n_components]

    windows = [_daily_window(frame, target_time)
               for frame, target_time in zip(df_list, target_times)]
    return np.stack(windows), scaler, pca, n_components


def process_recurrent_input(df_list, max_sequence_length, one_hot_encoder=None, normalizer=None, pca=None, n_components=None):
    """Encode each assignment history as a fixed-length, PCA-reduced sequence.

    Parameters
    ----------
    df_list : list of DataFrame
        One frame per sample holding the columns named in
        ``recurrent_categorical_features`` and ``recurrent_continuous_features``.
    max_sequence_length : int
        Number of time steps in every output sequence. Longer histories keep
        only their last ``max_sequence_length`` rows; shorter ones are
        left-padded with zero rows.
    one_hot_encoder, normalizer, pca, n_components : optional
        Previously fitted OneHotEncoder / StandardScaler / PCA and the number
        of components to keep. When ``one_hot_encoder`` is None all four are
        fitted from ``df_list``.

    Returns
    -------
    tuple
        ``(stacked_array, one_hot_encoder, normalizer, pca, n_components)``;
        ``stacked_array`` has one ``(max_sequence_length, n_components)``
        block per sample.
    """
    if one_hot_encoder is None:
        categorical_data = np.concatenate([df[recurrent_categorical_features].values for df in df_list])
        continuous_data = np.concatenate([df[recurrent_continuous_features].values for df in df_list])
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(categorical_data)
        normalizer = StandardScaler().fit(continuous_data)

        categorical_data = one_hot_encoder.transform(categorical_data).toarray()
        # NaNs survive standardisation; nan_to_num turns them into 0.
        continuous_data = np.nan_to_num(normalizer.transform(continuous_data))
        combined_data = np.concatenate([categorical_data, continuous_data], axis=1)
        pca = PCA().fit(combined_data)
        # Smallest number of leading components whose cumulative explained variance exceeds 90%.
        n_components = np.min(np.where(np.cumsum(pca.explained_variance_ratio_) > 0.9)) + 1

    processed_input = []
    for df in df_list:
        categorical_data = one_hot_encoder.transform(df[recurrent_categorical_features]).toarray()
        continuous_data = np.nan_to_num(normalizer.transform(df[recurrent_continuous_features]))
        combined_data = np.concatenate([categorical_data, continuous_data], axis=1)

        # Keep only the most recent steps when the history is too long.
        if combined_data.shape[0] > max_sequence_length:
            combined_data = combined_data[-max_sequence_length:, :]

        # Previously the PCA projection was computed and then thrown away, so the
        # un-reduced features were returned. Project the real rows first and pad
        # afterwards so that padding rows remain exactly zero.
        reduced_data = pca.transform(combined_data)[:, :n_components]
        resized_data = np.zeros((max_sequence_length, reduced_data.shape[1]))
        resized_data[max_sequence_length - reduced_data.shape[0]:, :] = reduced_data
        processed_input.append(resized_data)
    return np.stack(processed_input), one_hot_encoder, normalizer, pca, n_components


def process_prior_input(df, one_hot_encoder=None, normalizer=None, pca=None, n_components=None):
    """Encode the per-sample prior features and reduce them with PCA.

    Parameters
    ----------
    df : DataFrame
        One row per sample holding the columns named in
        ``prior_categorical_features`` and ``prior_continuous_features``.
    one_hot_encoder, normalizer, pca, n_components : optional
        Previously fitted OneHotEncoder / StandardScaler / PCA and the number
        of components to keep. When ``one_hot_encoder`` is None all four are
        fitted from ``df``.

    Returns
    -------
    tuple
        ``(reduced_array, one_hot_encoder, normalizer, pca, n_components)``;
        ``reduced_array`` has ``n_components`` columns.
    """
    if one_hot_encoder is None:
        categorical_data = df[prior_categorical_features].values
        continuous_data = df[prior_continuous_features].values
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(categorical_data)
        normalizer = StandardScaler().fit(continuous_data)

        categorical_data = one_hot_encoder.transform(categorical_data).toarray()
        # NaNs survive standardisation; nan_to_num turns them into 0.
        continuous_data = np.nan_to_num(normalizer.transform(continuous_data))
        combined_data = np.concatenate([categorical_data, continuous_data], axis=1)
        pca = PCA().fit(combined_data)
        # Smallest number of leading components whose cumulative explained variance exceeds 90%.
        n_components = np.min(np.where(np.cumsum(pca.explained_variance_ratio_) > 0.9)) + 1

    categorical_data = one_hot_encoder.transform(df[prior_categorical_features]).toarray()
    continuous_data = np.nan_to_num(normalizer.transform(df[prior_continuous_features]))
    combined_data = np.concatenate([categorical_data, continuous_data], axis=1)
    # The PCA was fitted (and returned) but its projection was never applied, unlike
    # process_action_input. Apply it so the output really is the reduced representation.
    reduced_data = pca.transform(combined_data)[:, :n_components]
    return reduced_data, one_hot_encoder, normalizer, pca, n_components

In [6]:
# Create additional features and target variables

def _register_feature(feature_list, name):
    """Add `name` to `feature_list` once, so re-running this cell does not create duplicates."""
    if name not in feature_list:
        feature_list.append(name)


# Create variable for the corresponding experiment sequence ID
exp_norm = exp_norm_map.dropna()
norm_exp_dict = dict(zip(exp_norm['normal_id'], exp_norm['experiment_id']))
exp_exp_dict = dict(zip(exp_norm['experiment_id'], exp_norm['experiment_id']))
# Sequences without a mapped experiment share the placeholder category 'None'.
remnant_targets['shared_sequence_id'] = remnant_targets['target_sequence'].map(norm_exp_dict).fillna('None')
experiment_targets['shared_sequence_id'] = experiment_targets['target_sequence'].map(exp_exp_dict).fillna('None')
_register_feature(prior_categorical_features, 'shared_sequence_id')

# An explicit feature for which cluster the time_since_last_assignment_start falls into
remnant_times = remnant_inputs['time_since_last_assignment_start'].values.reshape(-1, 1)
experiment_times = experiment_inputs['time_since_last_assignment_start'].values.reshape(-1, 1)
# Fixed random_state: without it the cluster ids (used as a categorical feature) change between runs.
# The mixture is fitted on remnant data only and then applied to both data sets.
gmm = GaussianMixture(n_components=4, random_state=0).fit(remnant_times)
remnant_inputs['time_since_last_assignment_start_cluster'] = gmm.predict(remnant_times)
experiment_inputs['time_since_last_assignment_start_cluster'] = gmm.predict(experiment_times)
_register_feature(recurrent_categorical_features, 'time_since_last_assignment_start_cluster')

# A feature for whether or not there is a folder path (1 when directory_1 is missing)
remnant_inputs['custom_assignment'] = remnant_inputs['directory_1'].isna().astype(int)
experiment_inputs['custom_assignment'] = experiment_inputs['directory_1'].isna().astype(int)
_register_feature(recurrent_categorical_features, 'custom_assignment')

# A feature for whether or not there is any problem level data (1 when it is missing)
remnant_inputs['no_problem_statistics'] = remnant_inputs['median_ln_problem_time_on_task_raw'].isna().astype(int)
experiment_inputs['no_problem_statistics'] = experiment_inputs['median_ln_problem_time_on_task_raw'].isna().astype(int)
_register_feature(recurrent_categorical_features, 'no_problem_statistics')

# feature for whether or not the student did problems of the same skill previously (1 when there are none)
remnant_targets['no_skill_priors'] = remnant_targets['student_skill_prior_average_problems_per_assignment'].isna().astype(int)
experiment_targets['no_skill_priors'] = experiment_targets['student_skill_prior_average_problems_per_assignment'].isna().astype(int)
_register_feature(prior_categorical_features, 'no_skill_priors')

# Features for the year and month of the assignments.
# The start time is converted once per frame (it is interpreted as seconds since the epoch).
for frame in (remnant_inputs, experiment_inputs, remnant_targets, experiment_targets):
    start_times = pd.DatetimeIndex(pd.to_datetime(frame['assignment_start_time'], unit='s'))
    frame['year'] = start_times.year
    frame['month'] = start_times.month
for feature_list in (recurrent_categorical_features, prior_categorical_features):
    _register_feature(feature_list, 'year')
    _register_feature(feature_list, 'month')

# Replace NaN categorical values with -1
remnant_inputs[recurrent_categorical_features] = remnant_inputs[recurrent_categorical_features].fillna(-1)
remnant_targets[prior_categorical_features] = remnant_targets[prior_categorical_features].fillna(-1)
experiment_inputs[recurrent_categorical_features] = experiment_inputs[recurrent_categorical_features].fillna(-1)
experiment_targets[prior_categorical_features] = experiment_targets[prior_categorical_features].fillna(-1)

In [7]:
# Determine the previous assignments for each sample

def _attach_history(targets, actions, inputs):
    """Attach each target row's earlier actions and assignments, then drop rows lacking either.

    For every row of `targets`, two frames restricted to the same student_id are stored:
      * 'previous_actions': rows of `actions` whose timestamp + 86400 is before the
        row's assignment_start_time, sorted by timestamp;
      * 'previous_assignments': rows of `inputs` whose assignment_start_time is before
        the row's assignment_start_time, sorted by assignment_start_time.
    An empty selection is stored as None. `targets` is modified in place and the
    returned frame omits rows where either column is None.
    """
    targets['previous_assignments'] = None
    targets['previous_actions'] = None

    for idx, target in targets.iterrows():
        student = target['student_id']
        start_time = target['assignment_start_time']

        is_earlier = actions['timestamp'] + 86400 < start_time
        is_student = actions['student_id'] == student
        earlier_actions = actions[is_earlier & is_student].sort_values('timestamp')
        targets.at[idx, 'previous_actions'] = earlier_actions if len(earlier_actions) > 0 else None

        is_earlier = inputs['assignment_start_time'] < start_time
        is_student = inputs['student_id'] == student
        earlier_inputs = inputs[is_earlier & is_student].sort_values('assignment_start_time')
        targets.at[idx, 'previous_assignments'] = earlier_inputs if len(earlier_inputs) > 0 else None

    return targets.dropna(subset=['previous_actions', 'previous_assignments'])


remnant_targets = _attach_history(remnant_targets, remnant_actions, remnant_inputs)
experiment_targets = _attach_history(experiment_targets, experiment_actions, experiment_inputs)

In [8]:
# Split the data into training and testing sets for validating the model

outdir = f'processed_data/pca_cv_data__{N_SPLITS}_{SPLIT_ON}_folds__{MSL}_steps__{MDL}_days'
# makedirs(exist_ok=True) also creates a missing 'processed_data' parent and lets this
# cell be re-run; os.mkdir raised in both situations.
os.makedirs(outdir, exist_ok=True)

# Folds are grouped on SPLIT_ON so that one group never appears in both train and test.
fold_splits = GroupKFold(N_SPLITS).split(remnant_targets, groups=remnant_targets[SPLIT_ON])
for count, (train_index, test_index) in enumerate(fold_splits):
    train_targets = remnant_targets.iloc[train_index]
    test_targets = remnant_targets.iloc[test_index]

    # --- Training side: every transformer is fitted here ---
    action_training_input = train_targets['previous_actions'].tolist()
    target_times = train_targets['assignment_start_time'].tolist()
    action_training_content = process_action_input(action_training_input, target_times, MDL)
    action_training_input, action_scaler, action_pca, action_n_components = action_training_content

    recurrent_training_input = train_targets['previous_assignments'].tolist()
    recurrent_training_content = process_recurrent_input(recurrent_training_input, MSL)
    recurrent_training_input, recurrent_encoder, recurrent_normalizer, recurrent_pca, recurrent_n_components = recurrent_training_content

    prior_training_content = process_prior_input(train_targets)
    prior_training_input, prior_encoder, prior_normalizer, prior_pca, prior_n_components = prior_training_content

    completion_training_target = train_targets[['assignment_completed']].values
    problems_training_target = train_targets[['problems_completed']].values
    training_target_sequence = train_targets[['target_sequence']].values

    # --- Testing side: reuse the transformers fitted on the training fold ---
    action_testing_input = test_targets['previous_actions'].tolist()
    target_times = test_targets['assignment_start_time'].tolist()
    action_testing_input, _, _, _ = process_action_input(action_testing_input, target_times, MDL, action_scaler, action_pca, action_n_components)

    recurrent_testing_input = test_targets['previous_assignments'].tolist()
    recurrent_testing_input, _, _, _, _ = process_recurrent_input(recurrent_testing_input, MSL, recurrent_encoder, recurrent_normalizer, recurrent_pca, recurrent_n_components)

    prior_testing_input, _, _, _, _ = process_prior_input(test_targets, prior_encoder, prior_normalizer, prior_pca, prior_n_components)

    completion_testing_target = test_targets[['assignment_completed']].values
    problems_testing_target = test_targets[['problems_completed']].values
    testing_target_sequence = test_targets[['target_sequence']].values

    data = {'action_training_input': action_training_input,
            'recurrent_training_input': recurrent_training_input,
            'prior_training_input': prior_training_input,
            'completion_training_target': completion_training_target,
            'problems_training_target': problems_training_target,
            'training_target_sequence': training_target_sequence,
            'action_testing_input': action_testing_input,
            'recurrent_testing_input': recurrent_testing_input,
            'prior_testing_input': prior_testing_input,
            'completion_testing_target': completion_testing_target,
            'problems_testing_target': problems_testing_target,
            'testing_target_sequence': testing_target_sequence}
    # One file per fold, numbered from 0.
    hkl.dump(data, f'{outdir}/fold_{count}.hkl', mode='w')

In [9]:
# Fit the preprocessing on the full remnant data, apply it to the experiment data,
# and write both sets to a single file

# --- Remnant: every transformer is fitted here ---
action_remnant_input, action_scaler, action_pca, action_n_components = process_action_input(
    remnant_targets['previous_actions'].tolist(),
    remnant_targets['assignment_start_time'].tolist(),
    MDL,
)

(recurrent_remnant_input,
 recurrent_encoder,
 recurrent_normalizer,
 recurrent_pca,
 recurrent_n_components) = process_recurrent_input(remnant_targets['previous_assignments'].tolist(), MSL)

(prior_remnant_input,
 prior_encoder,
 prior_normalizer,
 prior_pca,
 prior_n_components) = process_prior_input(remnant_targets)

completion_remnant_target = remnant_targets[['assignment_completed']].values
problems_remnant_target = remnant_targets[['problems_completed']].values
remnant_sequence = remnant_targets[['target_sequence']].values

# --- Experiment: reuse the transformers fitted on the remnant data ---
action_experiment_input = process_action_input(
    experiment_targets['previous_actions'].tolist(),
    experiment_targets['assignment_start_time'].tolist(),
    MDL,
    action_scaler,
    action_pca,
    action_n_components,
)[0]

recurrent_experiment_input = process_recurrent_input(
    experiment_targets['previous_assignments'].tolist(),
    MSL,
    recurrent_encoder,
    recurrent_normalizer,
    recurrent_pca,
    recurrent_n_components,
)[0]

prior_experiment_input = process_prior_input(
    experiment_targets,
    prior_encoder,
    prior_normalizer,
    prior_pca,
    prior_n_components,
)[0]

completion_experiment_target = experiment_targets[['assignment_completed']].values
problems_experiment_target = experiment_targets[['problems_completed']].values
experiment_sequence = experiment_targets[['target_sequence']].values

data = {
    'action_remnant_input': action_remnant_input,
    'recurrent_remnant_input': recurrent_remnant_input,
    'prior_remnant_input': prior_remnant_input,
    'completion_remnant_target': completion_remnant_target,
    'problems_remnant_target': problems_remnant_target,
    'remnant_sequence': remnant_sequence,
    'action_experiment_input': action_experiment_input,
    'recurrent_experiment_input': recurrent_experiment_input,
    'prior_experiment_input': prior_experiment_input,
    'completion_experiment_target': completion_experiment_target,
    'problems_experiment_target': problems_experiment_target,
    'experiment_sequence': experiment_sequence,
}

hkl.dump(data, f'processed_data/pca_experiment_data__{MSL}_steps__{MDL}_days.hkl', mode='w')

In [10]:
# Simple completion marker for the preprocessing run.
print('done')

done


In [11]:
# For each fitted PCA (action, recurrent, prior): total number of components
# followed by the number kept to exceed 90% cumulative explained variance.
for fitted_pca, kept_components in (
    (action_pca, action_n_components),
    (recurrent_pca, recurrent_n_components),
    (prior_pca, prior_n_components),
):
    print(len(fitted_pca.explained_variance_ratio_), kept_components)

36 7
61 22
63 17
