In [1]:
import glob
import numpy as np

In [2]:
file_names = glob.glob('./lovecraft/*.txt')
corpus = ''
for i in file_names:
    with open(i, 'r') as f:
        corpus+= f.read()

In [3]:
import unicodedata
def remove_accent(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [4]:
import re
def remove_special_characters(text):
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

In [5]:
def remove_numbers(text):
    pat = r'[^a-zA-Z\"\s]'
    return re.sub(pat, '', text) 

In [6]:
import string
def remove_punctuation(text):
    return ''.join([c for c in text if c not in string.punctuation])

In [7]:
import nltk
def stem(text):
    stemmer = nltk.PorterStemmer()
    return ' '.join([stemmer.stem(c) for c in text.split()])

In [10]:
from nltk.tokenize import ToktokTokenizer
stopwords_list = nltk.corpus.stopwords.words('english')
stopwords_list.remove('not')
tokenizer = ToktokTokenizer()
def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    t = [token for token in tokens if token.lower() not in stopwords_list]
    text = ' '.join(t)
    return text

In [11]:
def to_lower(text):
    return text.lower()

In [12]:
def preprocess_text(text):
    text = remove_accent(text)
    text = remove_special_characters(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = stem(text)
    text = remove_stopwords(text)
    text = to_lower(text)
    return text

In [14]:
corpus = preprocess_text(corpus)

In [15]:
len(corpus)

1730257

In [72]:
import numpy as np
seq_length = 100
dataX = []
dataY = []
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
for i in range(0, len(corpus) - seq_length, 1):
    seq_in = corpus[i:i + seq_length]
    seq_out = corpus[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = tf.keras.utils.to_categorical(dataY)

In [75]:
len(X[1])

100

VOCAB_SIZE = len(chars)
char_to_ix = {char:ix for ix, char in enumerate(c)}
SEQ_LENGTH = 50
slices = len(corpus)//SEQ_LENGTH

X = np.zeros((slices, SEQ_LENGTH, VOCAB_SIZE))
y = np.zeros((slices, SEQ_LENGTH, VOCAB_SIZE))

for i in range(0, slices):
    X_sequence = corpus[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_to_ix[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence
                                                          
    y_sequence = corpus[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_to_ix[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

In [17]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed, Reshape, Dropout
from tensorflow.keras.models import Sequential

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [85]:
model = Sequential()
model.add(LSTM(100, input_shape = (X.shape[1], X.shape[2]), return_sequences=True))
model.add(LSTM(100, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(200, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(200, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(100, return_sequences = True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Dense(y.shape[1], activation = 'softmax'))

In [86]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 100, 100)          40800     
_________________________________________________________________
lstm_11 (LSTM)               (None, 100, 100)          80400     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 100, 200)          240800    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 100, 200)          320800    
_________________________________________________________________
dropout_8 (Dropout)          (None, 100, 200)         

In [88]:
opt = tf.keras.optimizers.Adam(lr = 0.0001)
model.compile(optimizer = opt,
             loss = 'categorical_crossentropy',
             metrics = ['mse', 'categorical_crossentropy'])

es = tf.keras.callbacks.EarlyStopping(monitor = 'loss', verbose = 1, patience = 3)
bar = tf.keras.callbacks.ProgbarLogger()
lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'categorical_crossentropy', factor = 0.1)

In [89]:
with tf.device('/gpu:0'):
    model.fit(X, y,
         batch_size = 512,
         epochs = 10,
         callbacks= [es,bar, lr])

ValueError: A target array with shape (1730157, 27) was passed for an output of shape (None, 100, 27) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [49]:
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print "Seed:"
print "\"", ''.join([int_to_char[value] for value in pattern]), "\""
# generate characters
for i in range(1000):
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print "\nDone."

array([[[0.03076054, 0.03719701, 0.04217692, ..., 0.03265793,
         0.03653637, 0.03604594],
        [0.01749284, 0.03665666, 0.05407253, ..., 0.0238058 ,
         0.0348198 , 0.03401399],
        [0.00493379, 0.03215248, 0.06742826, ..., 0.01221828,
         0.03011895, 0.02685007],
        ...,
        [0.00089945, 0.0325891 , 0.06084775, ..., 0.00998829,
         0.03193987, 0.02159048],
        [0.00089944, 0.03258901, 0.06084799, ..., 0.00998826,
         0.03193988, 0.02159053],
        [0.00089943, 0.03258904, 0.06084803, ..., 0.00998824,
         0.0319399 , 0.02159058]],

       [[0.03029737, 0.03721276, 0.04254015, ..., 0.03238272,
         0.03650689, 0.0360213 ],
        [0.01621891, 0.03646531, 0.05539059, ..., 0.02283008,
         0.03454903, 0.03364086],
        [0.00442563, 0.0317672 , 0.0677501 , ..., 0.01162199,
         0.02983518, 0.02627056],
        ...,
        [0.00089944, 0.03258904, 0.06084783, ..., 0.00998816,
         0.0319401 , 0.02159043],
        [0.0

In [34]:
chars_np

array(['alchemist', 'high', 'crown', ..., 'south', 'came', 'never'],
      dtype='<U40')

In [56]:
chars_np = np.asarray(chars)

def output_idx(i):
    return np.argmax(model.predict([[X[i]]])[0], 1)

def output_str(i):
    return ''.join(list(chars_np[output_idx(i)]))

output_str(4)

'                                                  '

In [57]:
chars

['j',
 'c',
 'o',
 'b',
 'l',
 's',
 'a',
 'g',
 'i',
 'e',
 'w',
 'z',
 'h',
 'v',
 'f',
 ' ',
 't',
 'q',
 'm',
 'u',
 'x',
 'y',
 'n',
 'r',
 'k',
 'd',
 'p']

In [46]:
start = np.random.randint(0, len(corpus)-1)
pattern = corpus[start]
n_vocab = len(corpus)
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]


ValueError: Error when checking input: expected lstm_input to have shape (50, 27) but got array with shape (1, 1)

In [39]:
model.predict(chars_np)

ValueError: Error when checking input: expected lstm_input to have 3 dimensions, but got array with shape (277498, 1)

In [61]:
corpus

'alchemist high crown grassi summit swell mount whose side wood near base gnarl tree primev forest stand old chateau ancestor centuri lofti battlement frown upon wild rug countrysid serv home stronghold proud hous whose honor line older even mossgrown castl wall ancient turret stain storm gener crumbl slow yet mighti pressur time form age feudal one dread formid fortress franc machicol parapet mount battlement baron count even king defi yet never spaciou hall resound footstep invad sinc gloriou year chang poverti littl abov level dire want togeth pride name forbid allevi pursuit commerci life prevent scion line maintain estat pristin splendour fall stone wall overgrown veget park dri dusti moat illpav courtyard toppl tower without well sag floor wormeaten wainscot fade tapestri within tell gloomi tale fallen grandeur age pass first one anoth four great turret left ruin last singl tower hous sadli reduc descend onc mighti lord estat wa one vast gloomi chamber thi remain tower antoin las