In [108]:
import json
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import tensorflow as tf
import pprint as pp

This code can work with either tf.VERSION = '1.4.1' (for MacOS High Sierra) or tf.VERSION = '0.12.1' (for RedHat based SuperComputer), functions may change for other versions. Also, this model does not have mini-batches.

Model Parameters:

In [166]:
training_set_split = 0.8
validation_set_split = 0.1
learning_rate = 0.1
num_units = 5 #number of units in RNN cell
training_steps = 20 #number of epochs
display_step = 1 #number of epochs after which to display progress
optimize_using = "adagrad" #other option: "momentum"

In [110]:
print(tf.VERSION)

1.4.1


Loading JSON file into dictionary called 'student_vectors'

In [86]:
filepath = "student_vectors_n_task_10_n_limit_10000.json"
student_vectors = json.load(open(filepath))

Collecting unique CCSSM labels and Task IDs

In [87]:
ccssm_labels = []
task_ids = []
for i in student_vectors:
    for j in student_vectors[i]:
        if j['ccssm'] not in ccssm_labels:
            ccssm_labels.append(j['ccssm'])
        if j['task_id'] not in task_ids:
            task_ids.append(j['task_id'])
print("Number of unique CCSSM Labels: " + str(len(ccssm_labels)))
print("Number of unique task IDs: " + str(len(task_ids)))
print("Number of students: " + str(len(student_vectors)))

Number of unique CCSSM Labels: 4
Number of unique task IDs: 10
Number of students: 1255


Creating 1-hot encoding for Task IDs and CCSSM Labels

In [88]:
#pre-processing for using MultiLabelBinarizer
temp_ids = []
for i in task_ids:
    temp_ids.append([i])
temp_labels = []
for i in ccssm_labels:
    temp_labels.append([i])
    
#generating encodings
enc = MultiLabelBinarizer()
task_ids_1hot = (enc.fit_transform(temp_ids)).astype(float)
task_ids_classes = enc.classes_
task_ids_dict = dict(zip(task_ids, task_ids_1hot))
labels_1hot = enc.fit_transform(temp_labels).astype(float)
labels_classes = enc.classes_
labels_dict = dict(zip(ccssm_labels,labels_1hot))
#pp.pprint(labels_dict)
#pp.pprint(task_ids_dict)

Generating input sequences of interactions to feed the network. Say we have 3 task IDs and 3 labels; here is an example of interaction vectors generated:
1. User correctly solves task 2 of label 3: [010   000   001 000]
2. User incorrectly solves task 1 of label 2: [000   100   000   010]

1-hot representation of task IDs: 
task ID 1: 1,0,0 ; 
task ID 2: 0,1,0 ; 
task ID 3: 0,0,1 ; 
and similarly for labels!

In the interaction vector, first 3 bits belong to taskID that user solved correctly; next 3 bits belong to taskID that user solved incorrectly; next 3 bits belong to label corresponding to task ID solved by user correctly and last 3 bits belong to label corresponding to the task ID solved by the user incorrectly.

In [141]:
sequences = []
output_y_ccssm = []
output_y_taskid = []
output_y = []
seqlen = []
incorrect_tid_vec = np.zeros((len(task_ids)), dtype=np.float)
incorrect_csm_vec = np.zeros((len(ccssm_labels)),dtype=np.float)
for i in student_vectors:
    temp_seq = []
    for j in student_vectors[i]:
        if(j['second_try'] == False): #ignoring second_try
            if(j['correct'] == True):
                vec = np.concatenate([task_ids_dict[j['task_id']],incorrect_tid_vec,labels_dict[j['ccssm']],incorrect_csm_vec])
                temp_seq.append(vec)
            else:
                vec = np.concatenate([incorrect_tid_vec,task_ids_dict[j['task_id']],incorrect_csm_vec,labels_dict[j['ccssm']]])
                temp_seq.append(vec)
    seqlen.append(len(temp_seq))
    last_one = temp_seq.pop()
    output_y.append(last_one)
    output_y_ccssm.append(last_one[2*len(task_ids):])
    output_y_taskid.append(last_one[:2*len(task_ids)])
    sequences.append(temp_seq)
#pp.pprint(sequences[0])
length_interaction_vector = 2*(len(task_ids)+len(ccssm_labels)) #length of interaction vector

Finding maximum sequence length.

In [142]:
max_seqlen = max(seqlen)
print(max_seqlen)

186


Padding the sequences according to maximum sequence length. Making padded sequences of shape: number of students, maximum sequence length, length of interaction vector.

In [143]:
padded_sequences = np.zeros(shape=(len(student_vectors),max_seqlen,length_interaction_vector),dtype=float)
for i in range(len(sequences)):
    for j in range(len(sequences[i])):
        padded_sequences[i][j] = sequences[i][j]

Split the data into training and testing sets. Will take random validation sets at the time of training.

In [157]:
split = round((training_set_split+validation_set_split)*len(student_vectors))
training_x = padded_sequences[:split]
training_y = np.asarray(output_y)[:split]
training_y_ccssm = np.asarray(output_y_ccssm)[:split] #for validation set
training_y_taskid = np.asarray(output_y_taskid)[:split] #for validation set
test_x = padded_sequences[split:]
test_y = np.asarray(output_y)[split:]
test_y_ccssm = np.asarray(output_y_ccssm)[split:]
test_y_taskid = np.asarray(output_y_taskid)[split:]
training_x.shape

(1130, 186, 28)

Building the model

In [155]:
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, max_seqlen, length_interaction_vector]) #(<batch_size>, <max_time>, <num_features>)
y = tf.placeholder(tf.float32, [None, length_interaction_vector]) #(<batch_size>, <num_features>)

def dynamicRNN(x):
    rnn_cell = tf.nn.rnn_cell.BasicRNNCell(num_units)
    outputs, states = tf.nn.dynamic_rnn(rnn_cell, x, dtype=tf.float32)
    #transformation on outputs needed, otherwise auc=0
    outputs = tf.transpose(outputs, [1, 0, 2])
    outputs = tf.gather(outputs, int(outputs.get_shape()[0]) - 1)
    out_size = length_interaction_vector
    logit = tf.contrib.layers.fully_connected(outputs, out_size, activation_fn=None)
    if tf.VERSION == '0.12.1': #summit's tensorflow version API doc: https://www.tensorflow.org/versions/r0.12/api_docs/
        outputs = tf.sigmoid(logit)
    else:
        outputs = tf.nn.sigmoid(logit)
    return outputs

#making predictions
pred = dynamicRNN(x)
if tf.VERSION == '0.12.1': #summit's tensorflow version API doc: https://www.tensorflow.org/versions/r0.12/api_docs/
    pred_task,pred_ccssm = tf.split_v(value=pred,size_splits=[2*len(task_ids),2*len(ccssm_labels)],split_dim=1)
else:
    pred_task,pred_ccssm = tf.split(value=pred,num_or_size_splits=[2*len(task_ids),2*len(ccssm_labels)],axis=1)
    
# Define loss and optimizer
if tf.VERSION == '0.12.1': #summit's tensorflow version API doc: https://www.tensorflow.org/versions/r0.12/api_docs/
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, targets=y))
else:
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y))

if(optimize_using == "momentum"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=0.9).minimize(cost)
elif (optimize_using == "adagrad"):
    optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model - use AUC to evaluate model
if tf.VERSION == '0.12.1': #summit's tensorflow version API doc: https://www.tensorflow.org/versions/r0.12/api_docs/
    auc,  opts = tf.contrib.metrics.streaming_auc(labels = test_y_taskid, predictions = pred_task, curve='ROC')
    auc_ccssm,  opts_ccssm = tf.contrib.metrics.streaming_auc(labels = test_y_ccssm, predictions = pred_ccssm, curve='ROC')
else:
    auc,  opts = tf.metrics.auc(labels = test_y_taskid, predictions = pred_task, curve='ROC')
    auc_ccssm,  opts_ccssm = tf.metrics.auc(labels = test_y_ccssm, predictions = pred_ccssm, curve='ROC')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training the model

In [167]:
with tf.Session() as sess:
    # Initialize the variables (i.e. assign their default value)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    for step in range(1, training_steps+1):
        batch_x = training_x
        batch_y = training_y
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch accuracy & loss
            loss= sess.run([cost], feed_dict={x: batch_x, y: batch_y})
            print("Step " + str(step) + ", Loss= " + str(loss))
    print("Optimization Finished!")

    # Calculate test auc
    temp_auc_ccssm, temp_opts_ccssm = sess.run([auc_ccssm,  opts_ccssm], feed_dict={x: test_x, y: test_y})
    temp_auc_taskid, temp_opts_taskid = sess.run([auc, opts], feed_dict={x: test_x, y: test_y})
    print("Testing auc for taskid: " + str(temp_auc_taskid) + ", " + str(temp_opts_taskid))
    print("Testing auc for ccssm: " + str(temp_auc_ccssm) + ", " + str(temp_opts_ccssm))

Step 1, Loss= [0.9380963]
Step 2, Loss= [0.93782026]
Step 3, Loss= [0.93754596]
Step 4, Loss= [0.9372898]
Step 5, Loss= [0.9370236]
Step 6, Loss= [0.9367578]
Step 7, Loss= [0.9364927]
Step 8, Loss= [0.9362311]
Step 9, Loss= [0.93594396]
Step 10, Loss= [0.93569267]
Step 11, Loss= [0.9354228]
Step 12, Loss= [0.93513983]
Step 13, Loss= [0.93489236]
Step 14, Loss= [0.93461674]
Step 15, Loss= [0.93435353]
Step 16, Loss= [0.9340883]
Step 17, Loss= [0.9338316]
Step 18, Loss= [0.9335591]
Step 19, Loss= [0.9332989]
Step 20, Loss= [0.933027]
Optimization Finished!
Testing auc for taskid: 0.0, 0.84063154
Testing auc for ccssm: 0.0, 0.9028572
