In [4]:
# Python libraries
import numpy as np
import theano
import theano.tensor as Tensor
import lasagne
import random
import sys
import csv
import time
import matplotlib.pyplot as plt
# allows plots to show inline in ipython notebook
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
# Import our own modules
import utils
import model
import visualize

In [8]:
# Known data sets, keyed by a short name, each mapped to the CSV path it is read from.
DATA_SETS_MAP = dict([
    ('synth', "../syntheticDetailed/naive_c5_q50_s4000_v0.csv"),
    ('code_org', "../data/hoc_1-9_binary_input.csv"),
])

# Which entry of DATA_SETS_MAP to load.
# (Alternative configuration kept for reference: DATA_SET = 'code_org', DATA_SZ = 500000.)
DATA_SET = 'synth'

# Maximum number of rows to keep from the data set.
# The sentinel value -1 means "use the entire data set".
DATA_SZ = 50000

In [9]:
# Read in the data set: every CSV row becomes one row of an integer array.
# This function can be moved to utils.py
# The file is opened in a `with` block so the handle is closed as soon as the
# rows have been read ("rb" is the mode the Python 2 csv module expects).
with open(DATA_SETS_MAP[DATA_SET], "rb") as csv_file:
    data_array = np.array(list(csv.reader(csv_file, delimiter=','))).astype('int')

# DATA_SZ == -1 means "keep every row"; otherwise keep only the first DATA_SZ rows.
if DATA_SZ != -1:
    data_array = data_array[:DATA_SZ]

# Shuffle the rows in place before splitting.
# NOTE(review): no random seed is set in the visible cells, so the split
# presumably differs from run to run -- confirm whether that is intended.
np.random.shuffle(data_array)
num_samples = data_array.shape[0]
num_problems = data_array.shape[1]

# time steps is number of problems - 1 because we cannot predict on the last problem.
num_timesteps = num_problems - 1

# Split the shuffled rows into train / validation / test:
# the first 7/8 for training, the next 1/16 for validation, the last 1/16 for testing.
# Floor division (//) keeps the slice bounds integers regardless of how `/` behaves.
train_end = 7 * num_samples // 8
val_end = 15 * num_samples // 16

train_data = data_array[0:train_end, :]
val_data = data_array[train_end:val_end, :]
test_data = data_array[val_end:num_samples, :]

In [10]:
# Sanity check: print the number of problems, then the mean of the entries
# over the whole data set and over each split.
print (num_problems)
# for prob in xrange(num_problems):
#     print ('Train Prob {} : percent correct {}'.format(prob, np.mean(train_data[:,prob]) ))
#     print ('Val Prob {} : percent correct {}'.format(prob, np.mean(val_data[:,prob]) ))
#     print ('Test Prob {} : percent correct {}'.format(prob, np.mean(test_data[:,prob]) ))

# Stack the three splits back together to report the overall figure.
# This line used to be labelled 'Train', which produced two different
# "Train" numbers in the output; it is the mean over all splits combined.
all_data = np.concatenate((train_data, val_data, test_data), axis=0)
print ('All : percent correct {}'.format( np.mean(all_data) ))
print ('Train : percent correct {}'.format( np.mean(train_data) ))
print ('Val : percent correct {}'.format( np.mean(val_data) ))
print ('Test : percent correct {}'.format( np.mean(test_data) ))


50
Train : percent correct 0.6078
Train : percent correct 0.608548571429
Val : percent correct 0.60456
Test : percent correct 0.60056


In [20]:
# Row counts must be read here, while train_data / test_data are still the raw
# arrays; the names are rebound to the vectorized results below.
num_train = train_data.shape[0]
num_test = test_data.shape[0]

print('Vectorization...')
# Vectorize each split exactly once and reuse the result (the previous version
# called utils.vectorize_data twice per split).
# NOTE(review): this assumes utils.vectorize_data returns a re-iterable
# 3-element sequence and gives the same result for the same input -- confirm.
# NOTE: after this cell the *_data names hold vectorized data, so re-running the
# cell without first re-running the data-loading cell will pass already
# vectorized data back into utils.vectorize_data.
train_data = utils.vectorize_data(train_data)
val_data = utils.vectorize_data(val_data)
test_data = utils.vectorize_data(test_data)

# Unpack the three components of each vectorized split.
X_train, next_problem_train, truth_train = train_data
X_val, next_problem_val, truth_val = val_data
X_test, next_problem_test, truth_test = test_data

print ("Vectorization done!")
# Parenthesised prints match the print style used in the other cells.
print (X_train.shape)
print (X_val.shape)
print (X_test.shape)

Vectorization...
Vectorization done!
(3500, 49, 100)
(250, 49, 100)
(250, 49, 100)


In [13]:
# ---- Hyperparameters ----

# Model settings
num_lstm_layers = 1
hidden_size = 200  # size of hidden layer of neurons
dropout_p = 0.2

# Training settings
num_epochs = 20
batchsize = 32
learning_rate = 0.01
lr_decay = 1.0
reg_strength = 0.0
grad_clip = 10

# Theano configuration flags
theano.config.optimizer = 'fast_compile'
theano.config.exception_verbosity = 'high'

In [15]:
# Build the model from the hyperparameters above. create_model hands back two
# callables: one used by the training loop and one that computes cost/accuracy.
train_acc_fn, compute_cost_acc = model.create_model(
    num_timesteps,
    num_problems,
    hidden_size,
    learning_rate,
    grad_clip,
    dropout_p,
    num_lstm_layers
)

Computing updates ...
Compiling functions ...
Compiling done!




In [24]:
# Run training on the train split, validating against the val split.
# The three returned histories are what the plotting cell consumes later.
train_losses, train_accuracies, val_accuracies = model.train(
    train_data,
    val_data,
    train_acc_fn,
    compute_cost_acc,
    num_epochs=num_epochs,
    batchsize=batchsize
)

Starting training...
  Epoch 0 	batch 1 	loss 0.681061551559 	train acc 68.56 	val acc 62.09 
  Epoch 0 	batch 2 	loss 0.664734489732 	train acc 61.16 	val acc 63.41 
  Epoch 0 	batch 3 	loss 0.617464936279 	train acc 68.49 	val acc 63.08 
  Epoch 0 	batch 4 	loss 0.677049149298 	train acc 63.01 	val acc 61.87 
  Epoch 0 	batch 5 	loss 0.652063395124 	train acc 62.76 	val acc 62.18 
  Epoch 0 	batch 6 	loss 0.650960206837 	train acc 61.35 	val acc 61.79 


KeyboardInterrupt: 

In [27]:
# Report loss and accuracy of the current model on the training split.
model.check_accuracy(
    train_data,
    compute_cost_acc,
    dataset_name='train'
)

Testing...
Final results:
  train loss:			0.643258
  train accuracy:		63.21 %


In [28]:
# Plot the training loss together with the training and validation accuracies.
# train_losses / train_accuracies / val_accuracies are bound only when the
# training cell runs to completion; if training is interrupted they are
# undefined and this cell raises a NameError.
plot_title = DATA_SET + '_train'
visualize.plot_loss_acc(
    plot_title,
    train_losses,
    train_accuracies,
    val_accuracies,
    learning_rate,
    reg_strength,
    num_epochs,
    num_train
)

NameError: name 'train_losses' is not defined