In [2]:
import numpy as np

import tensorflow as tf

In [9]:
import collections
import pathlib

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization

In [5]:
# Create a list of all genre tags
allGenres = []
with open("../data/hierarchy.txt") as hierarchyFile:
    for line in hierarchyFile:
        tab = line.find("\t")
        g1 = line[:tab]
        g2 = line[tab+1:len(line)-1]
        if allGenres.count(g1) == 0:
            allGenres.append(g1)
        if allGenres.count(g2) == 0:
            allGenres.append(g2)

print(allGenres[:4])

['Biography & Memoir', 'Arts & Entertainment Biographies & Memoirs', 'Political Figure Biographies & Memoirs', 'Historical Figure Biographies & Memoirs']


In [3]:
def clean(fname):

    # Separate file into books with all metadata
    startBook = "<book"
    books = []
    with open(fname) as openFile:
        for line in openFile:
            if line[:len(startBook)] == startBook:
                books.append(line)
            else:
                books[-1] += line
    openFile.close()

    books = np.array(books)

    # Collect synopses and all genre tags for each book
    # X_data found here (synopses)
    synopses = []
    genres = []

    startBody = "<body>"
    endBody = "</body>"

    startTopic = "<topics>"
    endTopic = "</topics>"

    for book in books:
        start = book.index(startBody) + len(startBody)
        end = book.index(endBody)
        synopses.append(book[start:end])

        start = book.index(startTopic) + len(startTopic)
        end = book.index(endTopic)
        genres.append(book[start:end])

    X_data = np.array(synopses)


    # Create matrix of genre tags to book
    # Y_data found here
    start = ">"
    end = "</"

    Y_data = np.zeros((len(books), len(allGenres)))

    for i in range(len(books[:200])):
        entry = genres[i]
        while entry.find(end) != -1:
            s = entry.index(start) + 1
            e = entry.index(end)
            genreTag = entry[s:e]
            Y_data[i][allGenres.index(genreTag)] = 1
            entry = entry[e+5:]

    return (X_data, Y_data)

In [7]:
syn_train, Y_train = clean("../data/BlurbGenreCollection_EN_train.txt")
syn_test, Y_test = clean("../data/BlurbGenreCollection_EN_test.txt")
syn_valid, Y_valid = clean("../data/BlurbGenreCollection_EN_dev.txt")

In [24]:
import re
import string

In [39]:
# Create Word2Vec word embeddings for each synopsis
# Start by indexing all words that appear into a vocuabulary list

# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 400 # approximately the largest synopsis we have

# Use the TextVectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [40]:
# Create global vocabulary list based off of train data
vectorize_layer.adapt(syn_train)
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'of', 'a', 'to', 'in', 'is', 'for', 'with', 'that', 'her', 'his', 'as', 'on', 'from', 'this', 'an', 'by']


In [41]:
# Model to encode synopses to arrays of indices corresponding to words in the global vocab
word2index = tf.keras.models.Sequential()
word2index.add(tf.keras.Input(shape=(1,), dtype=tf.string))
word2index.add(vectorize_layer)

In [42]:
# encoding the synopses
X_train = word2index.predict(syn_train)
X_test = word2index.predict(syn_test)
X_valid = word2index.predict(syn_valid)

In [30]:
for i in range(20, 40):
    print(inverse_vocab[X_train[0][i]])

this
[UNK]
[UNK]
—
you’re
a
[UNK]
millions
of
people
the
new
york
times
crossword
puzzles
are
as
essential
to


In [29]:
vocab_2_index = dict((c,i) for i, c in enumerate(inverse_vocab)) 
index_2_vocab = dict((i,c) for i, c in enumerate(inverse_vocab))

In [44]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(inverse_vocab),
        output_dim=128,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(128, activation='sigmoid'),
    tf.keras.layers.Dense(len(allGenres))
])

In [45]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [48]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [47]:
es = EarlyStopping(monitor='val_loss', 
                   patience=10, verbose=1,
                   restore_best_weights=True)

In [49]:
tb = TensorBoard(log_dir='logs', histogram_freq=1, write_graph=1)

In [50]:
model.fit(X_train, Y_train, epochs=10000000, 
          callbacks=[es,tb],
          batch_size=128, shuffle=True,  
          validation_data=(X_valid, Y_valid))

Epoch 1/10000000
 23/459 [>.............................] - ETA: 17:09 - loss: 0.6908 - accuracy: 0.0000e+00

KeyboardInterrupt: 