In [1]:
from __future__ import print_function
from __future__ import division
import json
import py_crepe
import datetime
import numpy as np
import data_helpers
import fixeddocdata
import string
import pandas as pd
# NOTE: the original literal `0123` is a Python 2 octal literal (decimal 83) and
# is a SyntaxError on Python 3. `0o123` spells the same value explicitly, so the
# seed -- and therefore the random stream -- is unchanged.
np.random.seed(0o123)  # for reproducibility (decimal 83)


Using Theano backend.


In [2]:
# ---- Run configuration ----

# Optional cap on the number of (shuffled) training samples used per epoch.
# A falsy value such as None means "use everything".
subset = None

# Persistence. The model architecture JSON is written only when `save` is True.
save = False
model_name_path = 'params/crepe_model.json'
model_weights_path = 'params/crepe_model_weights.h5'

# Maximum input length. Longer gets chopped. Shorter gets padded.
maxlen = 1014

# ---- Model shape ----
nb_filter = 128             # filters for the conv layers (initially 256)
dense_outputs = 512         # units in the dense layer (initially 1024)
filter_kernels = [3] * 6    # conv layer kernel sizes
# The size of the final output layer (number of classes) is not set here; it is
# derived from the author list in a later cell.

# ---- Compile/fit ----
batch_size = 32
nb_epoch = 15

In [3]:
print('Loading data...')
# x is expected to be a list of sentences, y a one-hot encoding of the
# categories.

### 515-1122-122 and 1573 with remove 6 layers
# Earlier configuration, kept for reference:
#authorlist=[121, 479 , 649 ]
#doc_id = 14706

# Candidate authors; the number of authors is the number of output classes.
authorlist = [80, 53, 1680, 1097, 1103, 114, 132, 176, 1472, 1416, 55, 104]
# Documents handed to the loader as `docID` here and scored one by one in a
# later cell.
doc_id = [3270, 3489, 12599, 2549]
# Document ids handed to the loader as `tdoc`.
tdoc = [
    74, 279, 12641, 12596, 23260, 3395, 2746, 3277,
    117, 3270, 2034, 20055, 218, 6979, 3405, 3489,
    2537, 281, 21567, 12599, 745, 2549, 2548, 3755,
]

classes = len(authorlist)
cat_output = classes  # same value under its older name

train_split, val_split = data_helpers.load_ag_data(authors=authorlist,
                                                   docID=doc_id,
                                                   tdoc=tdoc)
(trainX, trainY) = train_split
(valX, valY) = val_split

print('Creating vocab...')
vocab, reverse_vocab, vocab_size, check = data_helpers.create_vocab_set()

# The up-front encode_data calls are disabled; the raw texts are passed,
# together with the vocab, to the mini-batch generator during training.
#trainX = data_helpers.encode_data(trainX, maxlen, vocab, vocab_size, check)
#test_data = data_helpers.encode_data(valX, maxlen, vocab, vocab_size, check)

print('Build model...')

# NOTE: the third return value rebinds `model_weights_path`, replacing whatever
# value it had before this cell ran.
(model, sgd, model_weights_path) = py_crepe.build_model(
    classes, filter_kernels, dense_outputs, maxlen, vocab_size, nb_filter)

Loading data...
SELECT author_id, doc_content FROM aman_content WHERE author_id IN (80, 53, 1680, 1097, 1103, 114, 132, 176, 1472, 1416, 55, 104) AND doc_id NOT IN (3270, 3489, 12599, 2549)  AND doc_id IN (74, 279, 12641, 12596, 23260, 3395, 2746, 3277, 117, 3270, 2034, 20055, 218, 6979, 3405, 3489, 2537, 281, 21567, 12599, 745, 2549, 2548, 3755) ;
Execution completed
Read completed
Number of rows: 20
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (9232, 2)
Author:    53  Size:  1079
Author:    80  Size:   820
Author:   132  Size:  1329
Author:   176  Size:   867
Author:   114  Size:  1131
Author:  1097  Size:   659
Author:  1103  Size:   942
Author:   104  Size:   264
Author:  1472  Size:   579
Author:  1416  Size:  1051
Author:    55  Size:   147
Author:  1680  Size:   364
Min: 147
Max: 1329
Authors [53, 80, 132, 176, 114, 1097, 1103, 104, 1472, 1416, 55, 1680].
Found 1764 texts.
Found 1764 labels.
Creating vocab...
Build model...


In [4]:
# Show the vocabulary size returned by data_helpers.create_vocab_set().
vocab_size

255

In [5]:
print('Fit model...')
initial = datetime.datetime.now()
# `range` instead of `xrange` so the cell runs on both Python 2 and 3.
for e in range(nb_epoch):
    # Reshuffle both splits at the start of every epoch.
    xi, yi = data_helpers.shuffle_matrix(trainX, trainY)
    xi_test, yi_test = data_helpers.shuffle_matrix(valX, valY)

    # Optionally train on only the first `subset` shuffled samples.
    epoch_x = xi[:subset] if subset else xi
    epoch_y = yi[:subset] if subset else yi
    batches = data_helpers.mini_batch_generator(epoch_x, epoch_y, vocab,
                                                vocab_size, check, maxlen,
                                                batch_size=batch_size)

    test_batches = data_helpers.mini_batch_generator(xi_test, yi_test, vocab,
                                                     vocab_size, check, maxlen,
                                                     batch_size=batch_size)

    # Running sums over the training batches of this epoch. The averages start
    # as NaN so they are defined even when the generator yields nothing.
    accuracy = 0.0
    loss = 0.0
    loss_avg = accuracy_avg = float('nan')
    start = datetime.datetime.now()
    print('Epoch: {}'.format(e))
    for step, (x_train, y_train) in enumerate(batches, 1):
        # f[0] is accumulated as loss and f[1] as accuracy.
        f = model.train_on_batch(x_train, y_train)
        loss += f[0]
        loss_avg = loss / step
        accuracy += f[1]
        accuracy_avg = accuracy / step
        if step % 100 == 0:
            print('  Step: {}'.format(step))
            print('\tLoss: {}. Accuracy: {}'.format(loss_avg, accuracy_avg))

    # Same bookkeeping for the validation batches. Without the NaN defaults the
    # summary print below raised NameError when there were no validation batches.
    test_accuracy = 0.0
    test_loss = 0.0
    test_loss_avg = test_accuracy_avg = float('nan')

    for test_step, (x_test_batch, y_test_batch) in enumerate(test_batches, 1):
        f_ev = model.test_on_batch(x_test_batch, y_test_batch)
        test_loss += f_ev[0]
        test_loss_avg = test_loss / test_step
        test_accuracy += f_ev[1]
        test_accuracy_avg = test_accuracy / test_step
    stop = datetime.datetime.now()
    e_elap = stop - start
    t_elap = stop - initial
    # The loss/accuracy reported here are the validation averages.
    print('Epoch {}. Loss: {}. Accuracy: {}\nEpoch time: {}. Total time: {}\n'.format(e, test_loss_avg, test_accuracy_avg, e_elap, t_elap))

if save:
    print('Saving model params...')
    json_string = model.to_json()
    with open(model_name_path, 'w') as f:
        json.dump(json_string, f)

# Weights are written regardless of `save`; a later cell reloads them from this
# same path.
model.save_weights(model_weights_path)

# Persist the optimizer object returned by build_model.
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3
with open('sgd.pickle', 'wb') as handle:
    pickle.dump(sgd, handle, protocol=pickle.HIGHEST_PROTOCOL)



Fit model...
Epoch: 0
Epoch 0. Loss: 2.48112207651. Accuracy: 0.0833333333333
Epoch time: 0:00:24.563080. Total time: 0:00:24.608400

Epoch: 1
Epoch 1. Loss: 2.46972413858. Accuracy: 0.140625
Epoch time: 0:00:21.090064. Total time: 0:00:45.741279

Epoch: 2
Epoch 2. Loss: 2.36108263334. Accuracy: 0.166666666667
Epoch time: 0:00:21.196528. Total time: 0:01:06.981930

Epoch: 3
Epoch 3. Loss: 1.96348934372. Accuracy: 0.419270833333
Epoch time: 0:00:21.577465. Total time: 0:01:28.601956

Epoch: 4
Epoch 4. Loss: 1.7992956837. Accuracy: 0.3515625
Epoch time: 0:00:21.095828. Total time: 0:01:49.742703

Epoch: 5
Epoch 5. Loss: 2.06305861473. Accuracy: 0.317708333333
Epoch time: 0:00:21.153123. Total time: 0:02:10.939075

Epoch: 6
Epoch 6. Loss: 1.73783878485. Accuracy: 0.356770833333
Epoch time: 0:00:21.271865. Total time: 0:02:32.265066

Epoch: 7
Epoch 7. Loss: 1.73823209604. Accuracy: 0.442708333333
Epoch time: 0:00:21.130239. Total time: 0:02:53.438944

Epoch: 8
Epoch 8. Loss: 2.16135884511.

In [6]:
# Drop the train/validation data once training is done.
# NOTE: after this cell runs, the training cell cannot be re-run without
# reloading the data first.
del trainX
del trainY
del valX
del valY

In [7]:
# Restore the weights stored at `model_weights_path`.
model.load_weights(model_weights_path)

# Alternative (disabled): build a fresh optimizer instead of reusing `sgd`.
#from keras.optimizers import SGD
#sgd = SGD(lr=0.01, momentum=0.9, nesterov= True)

# Compile model again (required to make predictions)
compile_options = dict(loss='categorical_crossentropy',
                       optimizer=sgd,
                       metrics=['accuracy'])
model.compile(**compile_options)



In [8]:
def predictModel2(model, textX):
    """Return the model's predictions for an input and their plain average.

    Args:
        model: object exposing ``predict(x)``.
        textX: input passed unchanged to ``model.predict``. The parameter name
            is kept as-is so existing keyword callers keep working.

    Returns:
        tuple ``(predYList, predY)``: ``predYList`` is the array of all
        predictions and ``predY`` is their mean over axis 0.
    """
    # Use the argument. The original body read a notebook-global `testX`
    # instead, so the parameter was silently ignored (and the call raised
    # NameError whenever no such global existed).
    predYList = np.array(model.predict(textX))
    predY = np.mean(predYList, axis=0)
    return (predYList, predY)

In [9]:
def predictModel(model, testX, keep_fraction=0.3):
    """Predict with ``model`` and average only the lowest-entropy rows.

    Each prediction row gets a base-2 entropy ``-sum(p * log2(p))``. Rows are
    ranked by ascending entropy and the first ``keep_fraction`` of them are
    averaged. If any value in any row is <= 0, the ranking is skipped and all
    rows are averaged instead.

    Args:
        model: object exposing ``predict(x)``.
        testX: input passed unchanged to ``model.predict``.
        keep_fraction: share of the lowest-entropy rows to average. Defaults to
            0.3, the value that used to be hard-coded.

    Returns:
        tuple ``(predYList, predY)``: ``predYList`` is the array of all
        predictions and ``predY`` is the averaged prediction (mean over axis 0).
    """
    # Local import: `math` is not among the notebook's top-level imports.
    import math

    predYList = np.array(model.predict(testX))

    entropies = []
    has_nonpositive = False
    for row in predYList:
        weighted_logs = 0
        for p in row:
            if p <= 0:
                # log is undefined here; remember it and fall back below.
                has_nonpositive = True
            else:
                weighted_logs += p * math.log(p, 2)
        entropies.append(-1 * weighted_logs)

    n_rows = len(entropies)
    if has_nonpositive or n_rows == 0:
        predY = np.mean(predYList, axis=0)
    else:
        # sorted() is stable, so rows with equal entropy keep their input order.
        ranked = sorted(range(n_rows), key=lambda idx: entropies[idx])
        # Keep at least one row: with fewer than four rows int(n * 0.3) is 0,
        # which previously produced the mean of an empty list (nan).
        n_keep = max(1, int(n_rows * keep_fraction))
        predY = np.mean([predYList[idx] for idx in ranked[:n_keep]], axis=0)

    return (predYList, predY)

In [18]:
from IPython.display import clear_output

# One 1/0 entry per document in `doc_id`:
#   test_binary  -> based on predictModel's averaged prediction
#   test_binary2 -> based on predictModel2's averaged prediction
test_binary = []
test_binary2 = []
for docs in doc_id:
    (testX, testY) = data_helpers.load_doc_data(authors=authorlist, docID=docs)
    testX = data_helpers.encode_data(testX, maxlen, vocab, vocab_size, check)

    (predYListDoc, predYDoc) = predictModel(model, testX)
    (predYListFrag, predYFrag) = predictModel2(model, testX)

    print(predYListFrag)

    # Collapse the loaded labels to their mean over axis 0.
    testY = np.array(testY).mean(axis=0)

    # Index of the largest averaged score, compared against the collapsed label.
    doc_scores = predYDoc.tolist()
    predLocationDoc = doc_scores.index(max(predYDoc))
    test_binary.append(1 if predLocationDoc == testY else 0)

    frag_scores = predYFrag.tolist()
    predLocationFrag = frag_scores.index(max(predYFrag))
    test_binary2.append(1 if predLocationFrag == testY else 0)

    # Wipe the cell output (including the print above) after each document.
    clear_output()
        



In [11]:
#predY = np.array(model.predict(testX, batch_size=batch_size))

In [12]:
#predY


In [13]:
#predY.mean(axis = 0)

In [14]:
# Per-document flags (1 = predicted index equalled testY, 0 = it did not),
# computed from predictModel's averaged prediction.
test_binary

[0, 1, 0, 0]

In [15]:
# Per-document flags (1 = predicted index equalled testY, 0 = it did not),
# computed from predictModel2's averaged prediction.
test_binary2

[0, 1, 0, 0]

In [16]:
"""
feature_model = py_crepe.build_feature_model()
feature_trainX = feature_model.predict(trainX)
feature_testX = feature_model.predict(testX)
"""

'\nfeature_model = py_crepe.build_feature_model()\nfeature_trainX = feature_model.predict(trainX)\nfeature_testX = feature_model.predict(testX)\n'