In [1]:
import numpy as np
import tensorflow as tf

In [8]:
import re
import string

In [37]:
sequence_length=600
pad_symbol = "*"

In [34]:
def clean(fname):

    # Separate file into books with all metadata
    startBook = "<book"
    books = []
    with open(fname) as openFile:
        for line in openFile:
            if line[:len(startBook)] == startBook:
                books.append(line)
            else:
                books[-1] += line
    openFile.close()

    books = np.array(books)

    # Collect synopses and all genre tags for each book
    # X_data found here (synopses)
    synopses = []
    genres = []

    startBody = "<body>"
    endBody = "</body>"

    startTopic = "<topics>"
    endTopic = "</topics>"

    for book in books:
        start = book.index(startBody) + len(startBody)
        end = book.index(endBody)

        blurb = book[start:end].lower()
        blurb = re.sub(r'[^\w\s]', '', blurb)

        # standardize length
        if len(blurb) > sequence_length:
            blurb = blurb[:sequence_length]
        else:
            blurb = blurb + pad_symbol*(sequence_length-len(blurb))
            
        synopses.append(blurb)

        start = book.index(startTopic) + len(startTopic)
        end = book.index(endTopic)
        genres.append(book[start:end])

    X_data = np.array(synopses)



    # Create matrix of genre tags to book
    # Y_data found here
    start = ">"
    end = "</"

    Y_data = np.zeros((len(books), len(allGenres)))

    for i in range(len(books[:200])):
        entry = genres[i]
        while entry.find(end) != -1:
            s = entry.index(start) + 1
            e = entry.index(end)
            genreTag = entry[s:e]
            Y_data[i][allGenres.index(genreTag)] = 1
            entry = entry[e+5:]

    return (X_data, Y_data)

In [35]:
# Create a list of all genre tags
allGenres = []
with open("../data/hierarchy.txt") as hierarchyFile:
    for line in hierarchyFile:
        tab = line.find("\t")
        g1 = line[:tab]
        g2 = line[tab+1:len(line)-1]
        if allGenres.count(g1) == 0:
            allGenres.append(g1)
        if allGenres.count(g2) == 0:
            allGenres.append(g2)

print(allGenres[:4])

['Biography & Memoir', 'Arts & Entertainment Biographies & Memoirs', 'Political Figure Biographies & Memoirs', 'Historical Figure Biographies & Memoirs']


In [38]:
blurb_train, Y_train = clean("../data/BlurbGenreCollection_EN_train.txt")
blurb_test, Y_test = clean("../data/BlurbGenreCollection_EN_test.txt")
blurb_valid, Y_valid = clean("../data/BlurbGenreCollection_EN_dev.txt")

print(blurb_train[0])
print(Y_train[0])

mondays crosswords do with easetuesdays crosswords not a breezewednesdays crosswords harder stillthursdays crosswords take real skillfridays crosswords  youve come this farsaturdays crosswords  youre a starfor millions of people the new york times crossword puzzles are as essential to each day as the first cup of coffee in the morning now for the first time ever these premier puzzles are available in six clever installments with each day of the week the puzzles increase gradually in skill level mondays the easiest but saturdays sure to challenge push your mental muscles a little harder each da
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [39]:
chars = list(string.ascii_lowercase + string.digits + " ")
print(chars)
print("Number of characters:", len(chars))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ']
Number of characters: 37


In [40]:
char_2_index = dict((c,i) for i, c in enumerate(chars)) 
index_2_char = dict((i,c) for i, c in enumerate(chars))

In [48]:
def makeOneHot(blurbs):
    X_data = np.zeros((len(blurbs), sequence_length, len(chars))) 
    for i, blurb in enumerate(blurbs):
        for t, char in enumerate(blurb):
            if char_2_index.get(char):
                X_data[i,t,char_2_index[char]] = 1
    return X_data

In [49]:
X_train = makeOneHot(blurb_train)
X_train[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [50]:
X_valid = makeOneHot(blurb_valid)
X_test = makeOneHot(blurb_test)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [52]:
len(allGenres)

152

In [53]:
# Modeling

model = Sequential()
model.add(LSTM(256, input_shape=(sequence_length,len(chars),)))
model.add(Dense(len(allGenres), activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
es = EarlyStopping(monitor='val_loss', 
                   patience=10, verbose=1,
                   restore_best_weights=True)

tb = TensorBoard(log_dir='logs', histogram_freq=1, write_graph=1)

model.fit(X_train, Y_train, epochs=10000000, 
          callbacks=[es,tb],
          batch_size=128, shuffle=True,  
          validation_data=(X_valid, Y_valid))

KeyboardInterrupt: 