In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from data_provider import ASSISTDataProvider

%matplotlib inline

In [None]:
# Directory handed to the data provider. The leading '~' is expanded here
# because Python's file APIs do not expand it on their own; expanding an
# already-expanded path is a no-op, so this is safe either way.
DATA_DIR = os.path.expanduser('~/Dropbox/mlp-group-project/')
# Batch size handed to the data provider.
BATCH_SIZE = 100

In [None]:
def embed(array, dimensions=100, rng=None):
    """Project array onto `dimensions` axes with a random Gaussian linear map.

    The map is a matrix of shape (array.shape[-1], dimensions) whose entries
    are drawn from a Standard Normal. Only the last dimension of the data is
    affected; all leading dimensions are preserved.

    Parameters
    ----------
    array : array_like
        Data to embed. Must have at least one dimension.
    dimensions : int (default=100)
        DKT paper embeds one-hot inputs to 100 dims
    rng : numpy.random.RandomState (default=None)
        Source of randomness. A fresh, unseeded RandomState is created
        when None is given.

    Returns
    -------
    numpy.ndarray
        Array of shape array.shape[:-1] + (dimensions,).
    """
    # `is None` rather than truthiness, so only a missing rng triggers the
    # creation of a new (unseeded) generator.
    if rng is None:
        rng = np.random.RandomState()
    array = np.asarray(array)
    linear_map = rng.randn(array.shape[-1], dimensions)
    return np.dot(array, linear_map)

In [None]:
class LstmModel:
    """LSTM network for knowledge tracing, built as a TensorFlow 1.x graph.

    Call `build_graph` once after construction. It populates the tensors
    and ops used for training and inference: `inputs`, `targets`,
    `target_ids`, `logits`, `predictions`, `loss`, `global_step` and
    `training`.
    """
    
    
    def __repr__(self):
        return "LstmModel"
    
    
    def __init__(self, max_time_steps=973, feature_len=293, n_distinct_questions=146):
        """Initialise task-specific parameters.

        Parameters
        ----------
        max_time_steps : int (default=973)
            Length of the time axis of the input placeholder
        feature_len : int (default=293)
            Size of the feature vector at each time step
        n_distinct_questions : int (default=146)
            Number of outputs of the linear layer at each time step
        """
        self.max_time_steps = max_time_steps
        self.feature_len = feature_len
        self.n_distinct_questions = n_distinct_questions
        
        
    def build_graph(self, n_hidden_layers=1, n_hidden_units=200, keep_prob=1.0,
                    learning_rate=0.01, clip_norm=20.0, decay_exp=0.98):
        """Build the model and then the training ops on top of it.

        Parameters
        ----------
        n_hidden_layers, n_hidden_units, keep_prob
            Forwarded to `_build_model`.
        learning_rate, clip_norm, decay_exp
            Forwarded to `_build_training`.
        """
        self._build_model(n_hidden_layers=n_hidden_layers,
                          n_hidden_units=n_hidden_units,
                          keep_prob=keep_prob)
        # Keyword arguments are required here: `_build_training` takes
        # `decay_exp` before `clip_norm`, so passing `clip_norm` positionally
        # would silently set the decay rate instead.
        self._build_training(learning_rate=learning_rate,
                             decay_exp=decay_exp,
                             clip_norm=clip_norm)
        

    def _build_model(self, n_hidden_layers=1, n_hidden_units=200, keep_prob=1.0):
        """Build a TensorFlow computational graph for an LSTM network.

        Model based on "DKT paper" (see section 3): 
            Piech, Chris, et al. "Deep knowledge tracing." 
            Advances in Neural Information Processing Systems. 2015.
            
        Implementation based on "GD paper" (see section 3): 
            Xiong, Xiaolu, et al. "Going Deeper with Deep Knowledge Tracing."
            EDM. 2016.

        Resets the default graph before building.

        Parameters
        ----------
        n_hidden_layers : int (default=1)
            A single hidden layer was used in DKT paper
        n_hidden_units : int (default=200)
            200 hidden units were used in DKT paper
        keep_prob : float in [0, 1] (default=1.0)
            Probability a unit is kept in dropout layer
        """
        tf.reset_default_graph()

        # data. 'None' means any length batch size accepted
        time_major_inputs = tf.placeholder(
            tf.float32, 
            shape=[self.max_time_steps, None, self.feature_len], 
            name='inputs')
        # NOTE(review): `self.inputs` is the transposed tensor, not the
        # placeholder, so values fed to it must be shaped
        # [batch, max_time_steps, feature_len] -- confirm this matches what
        # callers feed.
        self.inputs = tf.transpose(time_major_inputs, [1, 0, 2])
        
        # 'None' because may have answered any number of questions
        self.targets = tf.placeholder(tf.float32, 
                                      shape=[None], 
                                      name='targets')
        
        # int type required for tf.gather function
        self.target_ids = tf.placeholder(tf.int32, 
                                         shape=[None], 
                                         name='target_ids')

        # model. LSTM layer(s) then linear layer (sigmoid applied in loss)
        def make_cell():
            """Create one LSTM cell, wrapped in dropout when keep_prob < 1."""
            layer_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden_units)
            if keep_prob < 1:
                layer_cell = tf.nn.rnn_cell.DropoutWrapper(
                    layer_cell, input_keep_prob=keep_prob)
            return layer_cell

        if n_hidden_layers > 1:
            # Each layer needs its own cell object; one shared cell object
            # cannot be stacked on itself.
            cell = tf.nn.rnn_cell.MultiRNNCell(
                [make_cell() for _ in range(n_hidden_layers)])
        else:
            cell = make_cell()

        self.outputs, self.state = tf.nn.dynamic_rnn(cell=cell, 
                                                     inputs=self.inputs,
                                                     dtype=tf.float32)
        
        sigmoid_w = tf.get_variable(dtype=tf.float32,
                                    name="sigmoid_w", 
                                    shape=[n_hidden_units, 
                                           self.n_distinct_questions])
        sigmoid_b = tf.get_variable(dtype=tf.float32,
                                    name="sigmoid_b", 
                                    shape=[self.n_distinct_questions])
        
        # reshaping as done in GD paper code
        # first dim now batch_size times max_time_steps
        self.outputs = tf.reshape(self.outputs, 
                                  shape=[-1, n_hidden_units])
        
        logits = tf.matmul(self.outputs, sigmoid_w) + sigmoid_b
        # flatten so that `target_ids` can index individual
        # (sequence, time step, question) logits
        logits = tf.reshape(logits, [-1])
        self.logits = tf.gather(logits, self.target_ids)
        self.predictions = tf.sigmoid(self.logits)

        
    def _build_training(self, learning_rate=0.001, decay_exp=0.98, clip_norm=20.0):
        """Define the loss and parameter updates, with optional learning
        rate decay and gradient clipping.

        Applies exponential learning rate decay (optional). See:
        https://www.tensorflow.org/versions/r0.12/api_docs/python/train
        /decaying_the_learning_rate
        
        Applies gradient clipping by global norm (optional). See:
        https://www.tensorflow.org/versions/r0.12/api_docs/python/train
        /gradient_clipping

        Parameters
        ----------
        learning_rate : float (default=0.001)
            Initial learning rate of the Adam optimizer
        decay_exp : float (default=0.98)
            Decay rate applied every 100 steps. A falsy value (None or 0)
            disables decay
        clip_norm : float (default=20.0)
            Maximum global norm of the gradients. A falsy value (None or 0)
            disables clipping
        """
        # Each gathered logit is an independent binary prediction (see
        # `self.predictions`), so the matching loss is sigmoid cross-entropy.
        # A softmax over the flat vector would treat the whole batch as one
        # multi-class example.
        loss_per_example = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.targets, logits=self.logits)
        self.loss = tf.reduce_mean(loss_per_example)
        
        # track number of batches seen
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        
        if decay_exp:
            learning_rate = tf.train.exponential_decay(learning_rate=learning_rate, 
                                                       global_step=self.global_step,
                                                       decay_rate=decay_exp,
                                                       decay_steps=100,
                                                       staircase=True)

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads, trainable_vars = zip(*optimizer.compute_gradients(self.loss))
        if clip_norm:
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
        
        # applying the gradients also increments global_step
        self.training = optimizer.apply_gradients(zip(grads, trainable_vars), 
                                                  global_step=self.global_step)


In [None]:
# Build the network with 200 hidden units and an initial learning rate of
# 0.1; every other graph option keeps its default.
Model = LstmModel()
Model.build_graph(learning_rate=0.1, n_hidden_units=200)

## Train

In [None]:
# Data provider constructed from DATA_DIR with batches of BATCH_SIZE.
# The training loop iterates over it, unpacking each item as
# (inputs, targets, target_ids).
# NOTE(review): presumably it reads the data set from DATA_DIR and can be
# iterated once per epoch -- confirm against data_provider.
TrainingSet = ASSISTDataProvider(DATA_DIR, batch_size=BATCH_SIZE)

In [None]:
experiment_name = 'first'
n_epochs = 100
with tf.Session() as sess:
    train_saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    # loss of the final batch of each epoch
    losses = []

    for epoch in range(n_epochs):
        # Reset every epoch so that an empty pass over the data cannot
        # silently reuse a loss left over from an earlier epoch or from a
        # previous run of this cell.
        loss = None
        for inputs, targets, target_ids in TrainingSet:
            # ensure shapes and types as model expects
            # assumes the provider's inputs squeeze to a 3-D array -- TODO confirm
            inputs = np.squeeze(np.array(inputs, dtype=np.float32))
            inputs = np.transpose(inputs, [1, 0, 2])
            targets = np.array(targets, dtype=np.float32)
            target_ids = np.array(target_ids, dtype=np.int32)

            # Train!
            _, loss = sess.run(
                [Model.training, Model.loss],
                feed_dict={Model.inputs: inputs,
                           Model.targets: targets,
                           Model.target_ids: target_ids})

        if loss is None:
            raise RuntimeError(
                "TrainingSet yielded no batches in epoch {}".format(epoch))

        print(loss)
        losses.append(loss)
            
        # save model each epoch
        save_path = "./{}_{}.ckpt".format(
            experiment_name, epoch)
        train_saver.save(sess, save_path)            
    print("Saved model at", save_path)

In [None]:
# One point per epoch: the training loop appends the loss of the final
# batch of each epoch to `losses`.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(xlabel='Epoch',
       ylabel='Training loss (last batch of epoch)',
       title='Training loss per epoch');