In [1]:
# Python libraries
import numpy as np
import theano
import theano.tensor as Tensor
import lasagne
import random
import sys
import csv
import time
import matplotlib.pyplot as plt
# allows plots to show inline in ipython notebook
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Import our own modules
import utils
import model
import visualize

In [3]:
# Known data sets: short name -> path of the CSV file (paths are relative).
DATA_SETS_MAP = dict(
    synth="../syntheticDetailed/naive_c5_q50_s4000_v0.csv",
    code_org="../data/hoc_1-9_binary_input.csv",
)

# Key into DATA_SETS_MAP selecting the data set to load.
# The other available choice is 'synth'.
DATA_SET = 'code_org'

# Number of rows to keep from the start of the file.
# if DATA_SZ = -1, use entire data set (500000 was another value tried here).
DATA_SZ = 50000

In [4]:
# Read in the data set
# This function can be moved to utils.py
#
# The file is opened in a `with` block so the handle is closed as soon as the
# rows have been read. "rb" is the mode Python 2's csv module expects; under
# Python 3 this would have to become text mode with newline=''.
with open(DATA_SETS_MAP[DATA_SET], "rb") as csv_file:
    data_array = np.array(list(csv.reader(csv_file, delimiter=','))).astype('int')

if DATA_SZ != -1:
    # Keep only the first DATA_SZ rows; note the truncation happens BEFORE
    # the shuffle, so the subset is always the head of the file.
    data_array = data_array[:DATA_SZ]

# Shuffles the rows in place using NumPy's global RNG (no seed is set in this
# cell, so the split differs between runs).
np.random.shuffle(data_array)
num_samples = data_array.shape[0]
num_problems = data_array.shape[1]

# time steps is number of problems - 1 because we cannot predict on the last problem.
num_timesteps = num_problems - 1

# Split the shuffled rows into train / validation / test:
#   first 7/8 -> train, next 1/16 -> validation, final 1/16 -> test.
# Floor division (//) keeps the slice bounds integral regardless of whether
# true division is in effect.
train_end = 7 * num_samples // 8
val_end = 15 * num_samples // 16

train_data = data_array[0:train_end, :]
val_data = data_array[train_end:val_end, :]
test_data = data_array[val_end:num_samples, :]

In [5]:
# Print the number of problems, then the mean entry value over all splits
# combined and over each split separately.
print(num_problems)

# Per-problem breakdown, kept disabled:
# for prob in xrange(num_problems):
#     print ('Train Prob {} : percent correct {}'.format(prob, np.mean(train_data[:,prob]) ))
#     print ('Val Prob {} : percent correct {}'.format(prob, np.mean(val_data[:,prob]) ))
#     print ('Test Prob {} : percent correct {}'.format(prob, np.mean(test_data[:,prob]) ))

# Stack the three splits row-wise in one call to get the overall mean.
all_rows = np.concatenate((train_data, val_data, test_data), axis=0)

print('Total : percent correct {}'.format(all_rows.mean()))
print('Train : percent correct {}'.format(train_data.mean()))
print('Val : percent correct {}'.format(val_data.mean()))
print('Test : percent correct {}'.format(test_data.mean()))


9
Total : percent correct 0.848597777778
Train : percent correct 0.848515555556
Val : percent correct 0.848071111111
Test : percent correct 0.850275555556


In [6]:
# Record split sizes before the *_data names are rebound below.
num_train = train_data.shape[0]
num_test = test_data.shape[0]

print('Vectorization...')
# Vectorize each split exactly once and keep the full result under the
# *_data name (that is what the training / accuracy cells are given).
# NOTE: this rebinds train_data / val_data / test_data from the raw arrays to
# the vectorized results, so re-running this cell without first re-running the
# split cell would pass already-vectorized data to utils.vectorize_data.
train_data = utils.vectorize_data(train_data)
val_data = utils.vectorize_data(val_data)
test_data = utils.vectorize_data(test_data)

# Unpack the three components of each result for direct inspection.
X_train, next_problem_train, truth_train = train_data
X_val, next_problem_val, truth_val = val_data
X_test, next_problem_test, truth_test = test_data

print("Vectorization done!")
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

Vectorization...
Vectorization done!
(43750, 8, 18)
(3125, 8, 18)
(3125, 8, 18)


In [7]:
# hyperparameters

# -- network --
hidden_size = 200        # size of hidden layer of neurons
num_lstm_layers = 1
dropout_p = 0.2

# -- optimisation --
learning_rate = 1e-2
lr_decay = 1.0
reg_strength = 0.0
grad_clip = 10

# -- training loop --
batchsize = 32
num_epochs = 20

# -- Theano settings (see the Theano config documentation for these flags) --
theano.config.optimizer = 'fast_compile'
theano.config.exception_verbosity = 'high'

In [None]:
# create model
# model.create_model returns a pair, unpacked here into the two names that the
# training and accuracy-check cells are given.
train_acc_fn, compute_cost_acc = model.create_model(
    num_timesteps,
    num_problems,
    hidden_size,
    learning_rate,
    grad_clip,
    dropout_p,
    num_lstm_layers,
)

Computing updates ...
Compiling functions ...
Compiling done!




In [None]:
# Training!!!
# model.train returns three values, unpacked as the training losses and the
# training / validation accuracies.
train_losses, train_accuracies, val_accuracies = model.train(
    train_data,
    val_data,
    train_acc_fn,
    compute_cost_acc,
    num_epochs=num_epochs,
    batchsize=batchsize,
)

Starting training...
  Epoch 0 	batch 1 	loss 0.693057948408 	train acc 48.05 	val acc 85.23 
  Epoch 0 	batch 2 	loss 0.659701252469 	train acc 91.80 	val acc 84.47 
  Epoch 0 	batch 3 	loss 0.599327606375 	train acc 87.89 	val acc 83.11 
  Epoch 0 	batch 4 	loss 0.586372828012 	train acc 74.61 	val acc 83.94 
  Epoch 0 	batch 5 	loss 0.513062263777 	train acc 81.64 	val acc 81.20 
  Epoch 0 	batch 6 	loss 0.423424546851 	train acc 80.86 	val acc 83.37 
  Epoch 0 	batch 7 	loss 0.472739867124 	train acc 81.25 	val acc 83.50 
  Epoch 0 	batch 8 	loss 0.380546985824 	train acc 82.42 	val acc 85.57 
  Epoch 0 	batch 9 	loss 0.390641874915 	train acc 86.33 	val acc 84.75 
  Epoch 0 	batch 10 	loss 0.36816992182 	train acc 84.38 	val acc 85.51 
  Epoch 0 	batch 11 	loss 0.378334993304 	train acc 85.16 	val acc 85.51 
  Epoch 0 	batch 12 	loss 0.265770617614 	train acc 90.23 	val acc 85.78 
  Epoch 0 	batch 13 	loss 0.421738068554 	train acc 82.03 	val acc 86.18 
  Epoch 0 	batch 14 	loss 0

In [None]:
# Accuracy check on the training split.
model.check_accuracy(
    train_data,
    compute_cost_acc,
    dataset_name='train',
)

In [None]:
# Visualize the loss and the accuracies for both training and validation sets for each epoch
plot_label = DATA_SET + '_train'
visualize.plot_loss_acc(
    plot_label,
    train_losses,
    val_accuracies,
    train_accuracies,
    learning_rate,
    reg_strength,
    num_epochs,
    num_train,
)

In [None]:
# Accuracy check on the held-out test split.
model.check_accuracy(
    test_data,
    compute_cost_acc,
    dataset_name='test',
)