In [7]:
import pickle

# Read the pickled train and test dictionaries from ../data.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
train_data = None
test_data = None
with open("../data/train_data.pickle", "rb") as train_file:
    train_data = pickle.load(train_file)
with open("../data/test_data.pickle", "rb") as test_file:
    test_data = pickle.load(test_file)

In [8]:
# Pull the texts and their labels out of the loaded dictionaries.
x_train, y_train = train_data['train_texts'], train_data['train_labels']
x_test, y_test = test_data['test_texts'], test_data['test_labels']

In [10]:
# vectorization - chars to ints
import string
import random
import sys

import numpy as np

from tensorflow import keras
# keras.models import load_model

def sample(preds, temperature=1.0):
    """Sample one index from a probability array, reshaped by a temperature.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Divides the log-probabilities before re-normalising.
            Values below 1 sharpen the distribution, values above 1 flatten
            it. A temperature of 0 selects the most likely index directly.

    Returns:
        The sampled index (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # The zero-temperature limit is a point mass on the mode; handling it
        # here also avoids a division by zero below.
        return np.argmax(preds)
    # The small epsilon keeps log() finite for entries that are exactly 0.
    logits = np.log(preds + 1e-6) / temperature
    # Shift by the maximum before exponentiating. Without this, very low
    # temperatures make every exp() underflow to 0, the normalisation becomes
    # 0/0 = NaN, and np.random.multinomial rejects the result.
    logits = logits - np.max(logits)
    exp_logits = np.exp(logits)
    probs = exp_logits / np.sum(exp_logits)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def generate(model, diversity=0.5, text=""):
    """Generate text from a model

    A random window of ``maxlen`` characters is taken from ``text`` as the
    seed. The model is then asked 5000 times for the next character; each
    sampled character is written to stdout and appended to the sliding window.

    NOTE(review): a later cell defines ``generate`` again, reading the
    prediction with ``[0]`` instead of ``[0][0]``; whichever definition ran
    last is the one in effect. This version presumably targets a model whose
    output has an extra leading axis - confirm before relying on it.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=0)`` method.
        diversity: Sampling temperature passed to ``sample``.
        text: Source of the seed; needs at least ``maxlen + 1`` characters.

    Returns:
        None. The seed and the generated characters are written to stdout.

    Raises:
        ValueError: If ``text`` is too short to cut a seed window from.
    """
    if len(text) <= maxlen:
        raise ValueError(
            "text must contain at least {} characters to seed generation".format(maxlen + 1)
        )
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index: start_index + maxlen]
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(sentence)

    for _ in range(5000):
        # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
        x = np.zeros((1, maxlen), dtype=int)
        for t, char in enumerate(sentence):
            try:
                x[0, t] = char_indices[char]
            except KeyError:
                # Character outside the vocabulary: report the window and leave
                # this position at 0 (the index of " ").
                print(sentence)
        preds = model.predict(x, verbose=0)[0][0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    return

def vectorize(text, step=3):
    """Convert text into character sequences

    Slides a window of ``maxlen`` characters over ``text`` in strides of
    ``step`` and pairs every window with the character that follows it.

    Args:
        text: String made up only of characters present in ``char_indices``
            (a character outside the vocabulary raises ``KeyError``).
        step: Stride between consecutive windows.

    Returns:
        Tuple ``(X, y)``. ``X`` is an integer array of shape
        ``(n_windows, maxlen)`` holding character indices. ``y`` is a boolean
        one-hot array of shape ``(n_windows, len(chars))`` marking the
        character after each window.
    """
    starts = range(0, len(text) - maxlen, step)
    # Builtin int/bool replace the np.int/np.bool aliases, which NumPy 1.24 removed.
    X = np.zeros((len(starts), maxlen), dtype=int)
    y = np.zeros((len(starts), len(chars)), dtype=bool)
    for row, start in enumerate(starts):
        for col, char in enumerate(text[start: start + maxlen]):
            X[row, col] = char_indices[char]
        y[row, char_indices[text[start + maxlen]]] = 1
    return X, y

def clean_text(text, charset):
    """Collapse whitespace runs to single spaces, then drop characters not in ``charset``.

    Filtering happens after the collapse, so removing an out-of-vocabulary
    token that sat between two spaces leaves both spaces in the result.
    """
    collapsed = " ".join(text.split())
    return "".join(ch for ch in collapsed if ch in charset)

def get_model(modelfile, freeze=False):
    """Load a saved Keras model; with ``freeze=True`` mark its first six layers non-trainable."""
    loaded = keras.models.load_model(modelfile)
    if not freeze:
        return loaded
    for frozen_layer in loaded.layers[:6]:
        frozen_layer.trainable = False
    return loaded

# Vocabulary: a space, the ASCII letters, then ASCII punctuation. A character's
# position in this string is its integer index, so the order must stay fixed.
chars = " " + string.ascii_letters + string.punctuation
charset = set(chars)  # fast membership tests
indices_char = dict(enumerate(chars))
char_indices = {c: i for i, c in indices_char.items()}

# Window length fed to the models; has to match the length the model was built with.
maxlen = 100

# Path of a pretrained language model.
modelfile = "charlm2/model_middlemarch_cnn.hdf5"

In [14]:
# from keras.models import Sequential
# from keras.layers import Embedding, Dropout, BatchNormalization, GRU, Dense

# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
print('Processing pretrained character embeds...')
# Map every token in the GloVe file to its 300-d vector.
embedding_vectors = {}
# Open with an explicit encoding so the read does not depend on the platform's
# default codec.
with open("../data/glove/glove.6B.300d.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <v300>", space separated.
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        char = line_split[0]
        embedding_vectors[char] = vec

# One row per vocabulary character; characters with no single-character GloVe
# token keep an all-zero row.
# NOTE(review): the 6B GloVe vocabulary is reportedly lowercase-only, so
# uppercase letters would stay zero - confirm.
embedding_matrix = np.zeros((len(chars), 300))
for char, i in char_indices.items():
    embedding_vector = embedding_vectors.get(char)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

def get_gru_model(use_embeddings=False):
    """Build and compile the character-level GRU next-character model.

    Layers: Embedding(300) -> Dropout(0.1) -> BatchNorm -> GRU(256) ->
    Dropout(0.3) -> BatchNorm -> softmax over the vocabulary.

    Args:
        use_embeddings: When True, the embedding layer is initialised from
            the module-level ``embedding_matrix``; otherwise it uses Keras'
            default initialisation.

    Returns:
        A compiled ``keras.models.Sequential`` (categorical cross-entropy, Adam).
    """
    # Derive both the input and output width from the vocabulary rather than
    # hard-coding 85, so the model stays in step with vectorize()'s one-hot width.
    vocab_size = len(chars)
    embedding_kwargs = {"weights": [embedding_matrix]} if use_embeddings else {}

    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=300, **embedding_kwargs))
    model.add(keras.layers.Dropout(0.1))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.GRU(256))
    model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

Processing pretrained character embeds...


In [16]:
%%time
# Trial run: fit one model on the first training text, holding out 10% for validation.
first_text_clean = clean_text(x_train[0], charset)
X, y = vectorize(first_text_clean)
test_model = get_gru_model()
test_model.fit(X, y, epochs=10, batch_size=128, validation_split=0.1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 14731 samples, validate on 1637 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 55min 5s, sys: 16min 48s, total: 1h 11min 53s
Wall time: 11min 2s


<tensorflow.python.keras.callbacks.History at 0x14c969cd0>

In [18]:
%%time
# Train one GRU language model per training text.
# author_models collects (author_model, author_id) pairs - ids are ints.
author_models = []
n_train_texts = len(x_train)
for i, train_text in enumerate(x_train):
    print("{} / {}".format(i, n_train_texts))
    X, y = vectorize(clean_text(train_text, charset))
    am = get_gru_model()
    am.fit(X, y, epochs=10, batch_size=128, verbose=0)
    author_models.append((am, y_train[i]))

0 / 50
1 / 50
2 / 50
3 / 50
4 / 50
5 / 50
6 / 50
7 / 50
8 / 50
9 / 50
10 / 50
11 / 50
12 / 50
13 / 50
14 / 50
15 / 50
16 / 50
17 / 50
18 / 50
19 / 50
20 / 50
21 / 50
22 / 50
23 / 50
24 / 50
25 / 50
26 / 50
27 / 50
28 / 50
29 / 50
30 / 50
31 / 50
32 / 50
33 / 50
34 / 50
35 / 50
36 / 50
37 / 50
38 / 50
39 / 50
40 / 50
41 / 50
42 / 50
43 / 50
44 / 50
45 / 50
46 / 50
47 / 50
48 / 50
49 / 50
CPU times: user 1d 22h 20min 20s, sys: 16h 25min 8s, total: 2d 14h 45min 28s
Wall time: 9h 38min 3s


In [20]:
import os

# Create the output directory from Python instead of `!mkdir -p`, which only
# behaves as intended on POSIX shells.
os.makedirs("gru_models", exist_ok=True)
# Persist every trained model as gru_models/author<id>.h5.
for author_model, author_id in author_models:
    author_model.save(os.path.join("gru_models", "author" + str(author_id) + ".h5"))

In [24]:
from statistics import mean
# Approximate each test text's length by its number of spaces, then average.
word_counts = list(map(lambda doc: doc.count(" "), x_test))
mean(word_counts)

182.5416

In [26]:
def get_chunks(l, n):
    """Split sequence ``l`` into consecutive slices of length ``n`` (the last may be shorter).

    A chunk size below 1 is treated as 1.
    """
    size = n if n >= 1 else 1
    chunks = []
    for start in range(0, len(l), size):
        chunks.append(l[start:start + size])
    return chunks

In [27]:
# A mean of ~182 words per test text is quite short, so group every
# 5 consecutive test texts (and their labels) together.
texts_per_group = 5
longer_test_texts = get_chunks(x_test, texts_per_group)
longer_test_labels = get_chunks(y_test, texts_per_group)

In [28]:
# Sanity check: every group of labels must contain exactly one distinct label.
all(len(set(group_labels)) == 1 for group_labels in longer_test_labels)

True

In [29]:
# Merge each group of texts into one newline-separated string.
# The cell overwrites its own input, so entries that are already strings are
# left alone; otherwise re-running the cell would join individual characters.
longer_test_texts = [
    chunk if isinstance(chunk, str) else '\n'.join(chunk)
    for chunk in longer_test_texts
]

In [30]:
# Reduce each group of (identical) labels to a single label.
# The cell overwrites its own input, so entries that are already scalar labels
# are left alone; otherwise re-running the cell would index into a label.
longer_test_labels = [
    chunk if np.isscalar(chunk) else chunk[0]
    for chunk in longer_test_labels
]

In [31]:
# Number of combined test texts after grouping.
len(longer_test_texts)

500

In [32]:
%%time
from random import shuffle
from datetime import datetime

def get_predictions(author_models, test_texts, test_labels, batch_size=None):
    """Evaluate each text for each author_model and collect the resulting losses.

    Progress is printed as "<i> / <total>", one dot per model, and the
    elapsed time for that text.

    Args:
        author_models: Iterable of ``(model, label)`` pairs; each model needs
            a Keras-style ``evaluate(X, y, ...)`` method.
        test_texts: Sequence of raw text strings.
        test_labels: Not used by the computation; kept so existing calls
            continue to work.
        batch_size: Forwarded to ``model.evaluate``. ``None`` keeps Keras'
            default; a larger value can shorten the run.

    Returns:
        A list with one entry per text. Each entry is a list of
        ``(loss, label)`` tuples in the same order as ``author_models``.
    """
    predictions = []
    total = len(test_texts)
    for i, text in enumerate(test_texts):
        t1 = datetime.now()
        print("{} / {}".format(i, total), end=" ")
        X, y = vectorize(clean_text(text, charset))

        losses = []
        for model, label in author_models:
            print(".", end="")
            loss = model.evaluate(X, y, batch_size=batch_size, verbose=0)
            losses.append((loss, label))
        print(" {}".format(datetime.now() - t1))
        predictions.append(losses)
    return predictions
    

# Score every combined test text under every author model.
# The recorded output shows this taking roughly 2m40s per text.
predictions_long = get_predictions(author_models, longer_test_texts, longer_test_labels)

0 / 500 .................................................. 0:06:09.549061
1 / 500 .................................................. 0:02:46.813733
2 / 500 .................................................. 0:02:32.493920
3 / 500 .................................................. 0:02:41.309483
4 / 500 .................................................. 0:02:40.868753
5 / 500 .................................................. 0:02:44.367589
6 / 500 .................................................. 0:02:44.037099
7 / 500 .................................................. 0:02:41.305864
8 / 500 .................................................. 0:02:41.873435
9 / 500 .................................................. 0:02:43.311144
10 / 500 .................................................. 0:02:47.156874
11 / 500 .................................................. 0:02:51.836480
12 / 500 .................................................. 0:02:48.796290
13 / 500 ..........................

109 / 500 .................................................. 0:02:46.799730
110 / 500 .................................................. 0:02:46.454167
111 / 500 .................................................. 0:02:46.676036
112 / 500 .................................................. 0:02:45.593574
113 / 500 .................................................. 0:02:46.189554
114 / 500 .................................................. 0:02:46.286644
115 / 500 .................................................. 0:02:42.170269
116 / 500 .................................................. 0:02:41.326881
117 / 500 .................................................. 0:02:43.465138
118 / 500 .................................................. 0:02:43.539063
119 / 500 .................................................. 0:02:43.692139
120 / 500 .................................................. 0:02:45.451741
121 / 500 .................................................. 0:02:47.148762
122 / 500 ..

217 / 500 .................................................. 0:02:43.272471
218 / 500 .................................................. 0:02:44.276975
219 / 500 .................................................. 0:02:45.590680
220 / 500 .................................................. 0:02:44.909275
221 / 500 .................................................. 0:02:45.141765
222 / 500 .................................................. 0:02:44.831397
223 / 500 .................................................. 0:02:45.450754
224 / 500 .................................................. 0:02:44.870965
225 / 500 .................................................. 0:02:46.108186
226 / 500 .................................................. 0:02:47.886656
227 / 500 .................................................. 0:02:46.399219
228 / 500 .................................................. 0:02:51.591976
229 / 500 .................................................. 0:02:49.855405
230 / 500 ..

325 / 500 .................................................. 0:02:40.179973
326 / 500 .................................................. 0:02:40.333022
327 / 500 .................................................. 0:02:40.822895
328 / 500 .................................................. 0:02:41.890635
329 / 500 .................................................. 0:02:40.599891
330 / 500 .................................................. 0:02:39.865381
331 / 500 .................................................. 0:02:36.892445
332 / 500 .................................................. 0:02:37.836433
333 / 500 .................................................. 0:02:39.957629
334 / 500 .................................................. 0:02:40.166074
335 / 500 .................................................. 0:02:30.966993
336 / 500 .................................................. 0:02:40.003310
337 / 500 .................................................. 0:02:40.175436
338 / 500 ..

433 / 500 .................................................. 0:02:41.065433
434 / 500 .................................................. 0:02:41.545407
435 / 500 .................................................. 0:02:40.594795
436 / 500 .................................................. 0:02:40.595735
437 / 500 .................................................. 0:02:40.613574
438 / 500 .................................................. 0:02:40.654888
439 / 500 .................................................. 0:02:40.639372
440 / 500 .................................................. 0:02:40.012210
441 / 500 .................................................. 0:02:42.242331
442 / 500 .................................................. 0:02:40.247816
443 / 500 .................................................. 0:02:40.345711
444 / 500 .................................................. 0:02:40.130571
445 / 500 .................................................. 0:02:40.347318
446 / 500 ..

In [33]:
# One entry per combined test text.
len(predictions_long)

500

In [34]:
# Keep only the loss from every (loss, label) pair: one list of losses per text.
pred_is = [[pair[0] for pair in text_losses] for text_losses in predictions_long]

In [35]:
# Predicted author for each text: the label attached to the model with the lowest loss.
# np.argmin alone would give a position in author_models, which only equals the
# author label when the labels happen to be 0..N-1 in list order.
pred_labs = [
    min(text_losses, key=lambda pair: pair[0])[1]
    for text_losses in predictions_long
]

In [36]:
from sklearn.metrics import accuracy_score
# Fraction of combined test texts whose predicted label equals the true label.
accuracy_score(longer_test_labels, pred_labs)

0.948

In [47]:
# One JSON-serialisable record per combined test text.
json_res = [
    {"text": test_text, "actual_label": int(actual_label), "predicted_label": int(pred_label)}
    for test_text, actual_label, pred_label in zip(longer_test_texts, longer_test_labels, pred_labs)
]

In [45]:
# Show only the first few records; printing the whole list floods the
# notebook with every test text.
json_res[:3]



In [48]:
import json
# Write the records as pretty-printed JSON. The context manager guarantees the
# handle is flushed and closed, which the bare open() never did.
with open("rnn-gru-results.json", "w") as file:
    file.write(json.dumps(json_res, sort_keys=True, indent=4))

2568980

In [40]:
def generate(model, diversity=0.5, text=""):
    """Generate text from a model

    A random window of ``maxlen`` characters is taken from ``text`` as the
    seed. The model is then asked 5000 times for the next character; each
    sampled character is written to stdout and appended to the sliding window.

    NOTE(review): this redefines the ``generate`` from the earlier helper
    cell; the only difference in the original code is that the prediction is
    read with ``[0]`` here rather than ``[0][0]``.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=0)`` method
            whose first output row is the next-character distribution.
        diversity: Sampling temperature passed to ``sample``.
        text: Source of the seed; needs at least ``maxlen + 1`` characters.

    Returns:
        None. The seed and the generated characters are written to stdout.

    Raises:
        ValueError: If ``text`` is too short to cut a seed window from.
    """
    if len(text) <= maxlen:
        raise ValueError(
            "text must contain at least {} characters to seed generation".format(maxlen + 1)
        )
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index: start_index + maxlen]
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(sentence)

    for _ in range(5000):
        # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
        x = np.zeros((1, maxlen), dtype=int)
        for t, char in enumerate(sentence):
            try:
                x[0, t] = char_indices[char]
            except KeyError:
                # Character outside the vocabulary: report the window and leave
                # this position at 0 (the index of " ").
                print(sentence)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    return

In [None]:
# author_models holds (model, author_id) pairs, so the model is element 0 of the
# pair; element 1 is the label and has no predict() method.
generate(author_models[0][0], diversity=0.5, text="I had a good day at school today " * 30)

In [43]:
# Variant without dropout and with a 100-d embedding:
# Embedding -> BatchNorm -> GRU(256) -> BatchNorm -> softmax
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(charset), output_dim=100),
    keras.layers.BatchNormalization(),
    keras.layers.GRU(256),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=5, batch_size=128, validation_split=0.1)

Train on 14871 samples, validate on 1653 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x17fea9590>

In [73]:
# Reload one saved author model from disk.
# NOTE(review): the save cell writes to gru_models/author<id>.h5, while this reads
# from ../models/gru_models/ - presumably the files were moved afterwards; confirm.
new_model = keras.models.load_model('../models/gru_models/author1.h5')

In [74]:
# Print the layer-by-layer architecture of the reloaded model.
new_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         25500     
_________________________________________________________________
dropout_8 (Dropout)          (None, None, 300)         0         
_________________________________________________________________
batch_normalization_8 (Batch (None, None, 300)         1200      
_________________________________________________________________
gru_4 (GRU)                  (None, 256)               427776    
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_9 (Batch (None, 256)               1024      
_________________________________________________________________
dense_4 (Dense)              (None, 85)               

In [92]:
# Take a single combined test text (position 100) and vectorize it for a one-off evaluation.
test_texts = np.array(longer_test_texts)
test_labels = np.array(longer_test_labels)

text, label = test_texts[100], test_labels[100]
X, y = vectorize(clean_text(text, charset))

In [None]:
# Score the vectorized text (X, y) under every author model and time the whole pass.
t1 = datetime.now()
losses = []
for am in author_models:
    print(".", end="")
    model, label = am[0], am[1]  # note: rebinds the notebook-level `model` and `label`
    loss = model.evaluate(X, y, verbose=0)
    losses.append((loss, label))
print(" {}".format(datetime.now() - t1))
# Same nesting as get_predictions' result: a list holding one list of (loss, label) pairs.
predictions = [losses]

..............................................

In [90]:
# Losses only, one list per evaluated text.
pred_is = [[pair[0] for pair in text_losses] for text_losses in predictions]
# Predicted author per text: the label attached to the lowest-loss model.
# np.argmin alone would give a position in author_models rather than the label.
pred = [
    min(text_losses, key=lambda pair: pair[0])[1]
    for text_losses in predictions
]

In [91]:
# Display the prediction for the single evaluated text.
pred

[0]