In [1]:
from __future__ import print_function
from __future__ import division
import json
import py_crepe
import datetime
import numpy as np
import data_helpers
import data
import string
import pandas as pd
# Seed NumPy's global RNG for reproducibility.
# The seed is written with the explicit octal prefix: the legacy literal
# `0123` is octal (decimal 83, not 123) on Python 2 and a SyntaxError on
# Python 3. `0o123` is the same value and is accepted by Python 2.6+ and 3.
np.random.seed(0o123)


Using Theano backend.


In [2]:
# ---- Run configuration ----

# Optional cap on the number of training rows used in each epoch.
# A falsy value (None) means the full training set is used.
subset = None

# When True, the model architecture is written to `model_name_path` as JSON.
save = False
model_name_path = 'params/crepe_model.json'
# File the trained weights are written to and later reloaded from.
model_weights_path = 'params/crepe_model_weights.h5'

# Maximum input length: longer texts get chopped, shorter ones get padded.
maxlen = 1014

# ---- Model parameters ----

# Number of filters in the conv layers (initially 256).
nb_filter = 128
# Number of units in the dense layer (initially 1024).
dense_outputs = 512
# Kernel size of each conv layer.
filter_kernels = [7, 7, 3, 3, 3, 3]
# The size of the final output layer (number of classes) is not set here;
# it is computed from `authorlist` in the cell that loads the data.

# ---- Compile/fit parameters ----
batch_size = 32
nb_epoch = 3

In [3]:
print('Loading data...')
# Expected layout of the loaded data: x is a list of sentences and y is a
# one-hot encoding of the categories.

# Authors to discriminate between, and the document id handed to the loaders.
# Disabled alternative configuration kept from the original notebook
# ("515-1122-122 and 1573 with remove 6 layers"):
#   authorlist = [121, 479, 649]; doc_id = 14706
authorlist = [55, 75, 80]
doc_id = 80
# Number of output categories: one per entry in `authorlist`.
cat_output = len(authorlist)

((trainX, trainY), (valX, valY)) = data_helpers.load_ag_data(authors=authorlist,
                                                             docID=doc_id)

print('Creating vocab...')
vocab, reverse_vocab, vocab_size, check = data_helpers.create_vocab_set()

# The texts are not encoded up front here (the encode_data calls for trainX
# and valX are disabled); the training cell passes `vocab`, `vocab_size` and
# `check` to the mini-batch generator instead.

print('Build model...')

classes = len(authorlist)
model, sgd = py_crepe.model(classes,
                            filter_kernels,
                            dense_outputs,
                            maxlen,
                            vocab_size,
                            nb_filter)

Loading data...
Execution completed
Read completed
Number of rows: 70
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (20370, 2)
Author:    55  Size:  2701
Author:    75  Size: 13694
Author:    80  Size:  3975
Min: 2701
Max: 13694
Authors [55, 75, 80].
Found 8103 texts.
Found 8103 labels.
Creating vocab...
Build model...


In [4]:
# Display the vocabulary size returned by data_helpers.create_vocab_set().
vocab_size

1280

In [5]:
# Train for `nb_epoch` epochs with manual mini-batching, evaluate on the
# validation split after every epoch, then persist the model.
print('Fit model...')
initial = datetime.datetime.now()
# `range` instead of `xrange`: identical iteration here and also valid on
# Python 3.
for e in range(nb_epoch):
    # Both splits are reshuffled at the start of every epoch.
    xi, yi = data_helpers.shuffle_matrix(trainX, trainY)
    xi_test, yi_test = data_helpers.shuffle_matrix(valX, valY)

    # A truthy `subset` caps the number of training rows for this epoch;
    # a falsy value (None or 0) keeps the full training set.
    if subset:
        xi, yi = xi[:subset], yi[:subset]

    batches = data_helpers.mini_batch_generator(xi, yi, vocab, vocab_size,
                                                check, maxlen,
                                                batch_size=batch_size)

    test_batches = data_helpers.mini_batch_generator(xi_test, yi_test, vocab,
                                                     vocab_size, check, maxlen,
                                                     batch_size=batch_size)

    # Running sums of the per-batch metrics; `step` is the 1-based index of
    # the batch being processed, so sum / step is the running mean.
    accuracy = 0.0
    loss = 0.0
    step = 1
    start = datetime.datetime.now()
    print('Epoch: {}'.format(e))
    for x_train, y_train in batches:
        # Index 0 is read as the loss and index 1 as the accuracy.
        train_metrics = model.train_on_batch(x_train, y_train)
        loss += train_metrics[0]
        loss_avg = loss / step
        accuracy += train_metrics[1]
        accuracy_avg = accuracy / step
        if step % 100 == 0:
            print('  Step: {}'.format(step))
            print('\tLoss: {}. Accuracy: {}'.format(loss_avg, accuracy_avg))
        step += 1

    # Validation pass: accumulate sums and count batches, then average once
    # after the loop so the averages are always defined for this epoch.
    test_accuracy = 0.0
    test_loss = 0.0
    test_step = 0

    for x_test_batch, y_test_batch in test_batches:
        eval_metrics = model.test_on_batch(x_test_batch, y_test_batch)
        test_loss += eval_metrics[0]
        test_accuracy += eval_metrics[1]
        test_step += 1

    if test_step:
        test_loss_avg = test_loss / test_step
        test_accuracy_avg = test_accuracy / test_step
    else:
        # No validation batches were produced: report NaN instead of raising
        # NameError (first epoch) or reusing the previous epoch's averages.
        test_loss_avg = test_accuracy_avg = float('nan')

    stop = datetime.datetime.now()
    e_elap = stop - start
    t_elap = stop - initial
    print('Epoch {}. Loss: {}. Accuracy: {}\nEpoch time: {}. Total time: {}\n'.format(e, test_loss_avg, test_accuracy_avg, e_elap, t_elap))

if save:
    print('Saving model params...')
    json_string = model.to_json()
    # NOTE(review): the value from to_json() is passed through json.dump
    # unchanged; if it is already a JSON string the file ends up
    # double-encoded. Kept as-is so existing readers of this file still work.
    with open(model_name_path, 'w') as json_file:
        json.dump(json_string, json_file)

# NOTE(review): the weights are written regardless of `save`. This is kept
# because a later cell calls model.load_weights(model_weights_path); confirm
# whether the unconditional save is intended.
model.save_weights(model_weights_path)

Fit model...
Epoch: 0
  Step: 100
	Loss: 0.764290315956. Accuracy: 0.6571875
  Step: 200
	Loss: 0.519613031894. Accuracy: 0.779375
Epoch 0. Loss: 0.24589606758. Accuracy: 0.902865312847
Epoch time: 0:09:59.219471. Total time: 0:09:59.382526

Epoch: 1
  Step: 100
	Loss: 0.203406570442. Accuracy: 0.92375
  Step: 200
	Loss: 0.194800769584. Accuracy: 0.92640625
Epoch 1. Loss: 0.168923568346. Accuracy: 0.947274743342
Epoch time: 0:09:54.631171. Total time: 0:19:54.184923

Epoch: 2
  Step: 100
	Loss: 0.125617066966. Accuracy: 0.95
  Step: 200
	Loss: 0.12723420152. Accuracy: 0.95203125
Epoch 2. Loss: 0.207729942804. Accuracy: 0.924924136377
Epoch time: 0:09:36.873742. Total time: 0:29:31.210857



In [6]:
# Drop the training and validation data from the namespace now that the
# fitting cell has run.
del trainX, trainY, valX, valY

In [7]:
# Restore the weights from the file written at the end of the training cell.
model.load_weights(model_weights_path)

# Compile the model again before predicting, reusing the `sgd` optimizer
# returned by py_crepe.model().
# Disabled alternative kept from the original notebook:
#   from keras.optimizers import SGD
#   sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [8]:
# Load the data for document `doc_id` (same author list as training), then
# encode its texts with the vocabulary and maximum length used for the model.
testX, testY = data_helpers.load_doc_data(authors=authorlist, docID=doc_id)
testX = data_helpers.encode_data(testX, maxlen, vocab, vocab_size, check)

Execution completed
Read completed
Number of rows: 1
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (176, 2)
Found 176 texts.


In [9]:
# Run the model over the encoded test texts and keep the outputs as a
# NumPy array.
predY = np.array(model.predict(testX, batch_size=batch_size))

In [10]:
# Display the raw model outputs for the test document.
predY


array([[  9.96111631e-01,   3.22959828e-03,   6.58770441e-04],
       [  9.99935567e-01,   6.06235335e-05,   3.82875623e-06],
       [  9.99912083e-01,   8.10839119e-05,   6.83532789e-06],
       [  9.99967515e-01,   3.20289510e-05,   4.46864306e-07],
       [  9.99951184e-01,   3.67558459e-05,   1.20756558e-05],
       [  9.99915481e-01,   6.57606070e-05,   1.87502992e-05],
       [  9.99930978e-01,   6.57179917e-05,   3.32806167e-06],
       [  9.99993145e-01,   3.86129386e-06,   3.00916099e-06],
       [  9.99979317e-01,   1.25179258e-05,   8.16534066e-06],
       [  9.99997199e-01,   1.52635982e-06,   1.28878150e-06],
       [  9.99995589e-01,   3.16619594e-06,   1.21529229e-06],
       [  9.99998331e-01,   9.21945457e-07,   7.18176182e-07],
       [  9.99988854e-01,   1.03585426e-05,   7.93583638e-07],
       [  9.99986887e-01,   6.66240567e-06,   6.42273017e-06],
       [  9.99995768e-01,   3.04454124e-06,   1.19496406e-06],
       [  9.99867082e-01,   1.30398039e-04,   2.5241290