In [1]:
import numpy as np
import os, random, sys, io, re, string
import matplotlib.pyplot as plt
import tqdm
import itertools

In [2]:
shakes_lines = []
poem_starts = []
next_ln = False
min_ = 100

with open("data/shakespeare.txt") as f:
    
    # Read in all lines
    lines = f.readlines()
    for line in lines[1:]:
        
        # replace poem breaks with ~
        if re.match('\s+\d+', line):
            shakes_lines.append('\n')
            next_ln = True
            continue
            
        # get rid of blank lines
        seq = line.strip()
        if len(seq) < 3:
            continue
        else:
            min_ = len(seq)
        # remove punctuation
        seq = seq.translate(str.maketrans('', '', string.punctuation))
        # make lowercase
        seq = seq.lower()
        #print(seq)
        shakes_lines.append(seq)
        
        if next_ln:
            poem_starts.append(seq)
            next_ln = False


In [3]:
spens_lines = []
pstarts = []

with open("data/spenser.txt") as f:
    
    # Read in all lines
    lines = f.readlines()
    for line in lines[1:]:
        
        # replace poem breaks with ~
        if re.match('(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\n', line):
            spens_lines.append('\n')
            next_ln = True
            continue
            
        # get rid of blank lines
        seq = line.strip()
        if len(seq) < 3:
            continue
        else:
            min_ = len(seq)
        # remove punctuation
        seq = seq.translate(str.maketrans('', '', string.punctuation))
        # make lowercase
        seq = seq.lower()
        #print(seq)
        spens_lines.append(seq)
        
        if next_ln:
            pstarts.append(seq)
            next_ln = False
            

In [21]:
processed_text = ' @ '.join(shakes_lines).replace(' @\n @', '\n')\
                 + ' @ '.join(spens_lines[:-1]).replace(' @\n @\n @\n @', '\n')
# print(processed_text[:60*20])
# print(poem_starts[:5])

poems = processed_text.split('~')
maxlen = max([len(ln) for ln in shakes_lines]) + 1
print('max line length: ', maxlen)
print('min line length: ', min_)
#print(poems[:2])

#print('total words', len(set(processed_text.split(' '))))

max line length:  58
min line length:  46


In [22]:
fn = 'data/processed_shakespeare.txt'
vocab_size = 1000

with open(fn, 'w') as f:
    f.write(processed_text)
    f.close()

This bit kind of abuses SentencePiece's representation system a bit - it is designed to handle "one sentence per line" in the input, so I've modified the prepreprocessing to regard a poem as a "sentence" (i.e. one poem per line of input) with the line breaks in the poem converted to the @ symbol, which is made a user-defined symbol in the SentencePiece training so it will never become part of the representation of any other subword. We can convert back later.

In [23]:
import sentencepiece as spm

spm.SentencePieceTrainer.Train(f"--input={fn} --model_prefix=m --user_defined_symbols='<n>','@' --vocab_size={vocab_size} --model_type=bpe --pad_id=3")

sp = spm.SentencePieceProcessor()
sp.load('m.model')

#sp.encode_as_pieces(poems[0])

True

In [24]:

# returns vocab size
print(sp.get_piece_size())

# id <=> piece conversion
print(sp.id_to_piece(209))
print(sp.piece_to_id('@'))
print(sp.piece_to_id('▁@'))

# returns 0 for unknown tokens (we can change the id for UNK)
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

# <unk>, <s>, </s> are defined by default. Their ids are (0, 1, 2)
# <s> and </s> are defined as 'control' symbol.
for id in range(3):
  print(sp.id_to_piece(id), sp.is_control(id))

sp.pad_id()

1000
ok
985
7
0
<unk> False
<s> True
</s> True


3

In [47]:
# cut the text in semi-redundant sequences of window_size Sentence Pieces
spec = np.array(processed_text.split('\n'))

print(spec[0])
print(sp.encode_as_pieces(spec[0]))

spec = [sp.encode_as_ids(str(p)) for p in spec]

# reduced step size from 3 to 1 because the model now learns by word pieces instead of chars
# so stepping by 3 usually goes all the way to the next word
step = 1
window_size = 20

X = []
y = []

# do it this way so training data does not cross poem boundaries
for poem in spec:
    for i in range(0, len(poem)-window_size, step):
        X.append(poem[i:i+window_size])
        y.append(poem[i+window_size])

X = np.array(X)
y = np.array(y)
X = X.astype(np.float32)
y = y.astype(np.float32)

print('len of data is', X.shape, y.shape)
print('Encoding...')

from fairest creatures we desire increase @ that thereby beautys rose might never die @ but as the riper should by time decease @ his tender heir might bear his memory @ but thou contracted to thine own bright eyes @ feedst thy lights flame with selfsubstantial fuel @ making a famine where abundance lies @ thy self thy foe to thy sweet self too cruel @ thou that art now the worlds fresh ornament @ and only herald to the gaudy spring @ within thine own bud buriest thy content @ and tender churl makst waste in niggarding @ pity the world or else this glutton be @ to eat the worlds due by the grave and thee @ 
['▁from', '▁fairest', '▁cre', 'ature', 's', '▁we', '▁desire', '▁inc', 'rea', 'se', '▁@', '▁that', '▁there', 'b', 'y', '▁beautys', '▁ro', 'se', '▁might', '▁never', '▁die', '▁@', '▁but', '▁as', '▁the', '▁ri', 'p', 'er', '▁should', '▁by', '▁time', '▁dec', 'ease', '▁@', '▁his', '▁tend', 'er', '▁he', 'ir', '▁might', '▁bear', '▁his', '▁mem', 'ory', '▁@', '▁but', '▁thou', '▁cont', 'ra', 'c

In [48]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def number_to_text(s):
    return ''.join(sp.id_to_piece(int(i)) for i in s)

def generate_seq(seed, n_pieces=50, temperature=1.0, use_sample=False):
    
    def reverse_formatting(s):
        return s.replace('▁', ' ').replace('@', '\n').replace('<n>', '')
    
    in_text = seed.copy()
    result = ''
    for _ in range(n_pieces):
        yhat = model.predict(np.array([in_text[-window_size:]]), verbose=0)
        yy = sample(yhat[0], temperature)
        
        if yy != 0:
            in_text.append(yy)
            result += (number_to_text([yy]))
        
    r_seed = reverse_formatting(number_to_text(seed))
    print('seed: ' + r_seed)
    print('temperature:', temperature)
    print('generated:\n' + r_seed + reverse_formatting(result), '\n')

In [59]:
from keras.layers import Input, Bidirectional, LSTM, Dense, Dropout, Embedding
#from keras.optimizers import Adam
# tfa's LazyAdam is used because it is supposed to be more efficient for sparse
# data. since we are using our own word vectors from the embedding layer this
# should yield better results
from tensorflow_addons.optimizers import LazyAdam
from keras import Model
from keras.regularizers import l2

units = 512

input_ = Input(shape=(window_size,))

emb = Embedding(vocab_size, units//2, input_length=window_size, 
                mask_zero=True, activity_regularizer=l2(1e-4))(input_)

bidirectional = Bidirectional(LSTM(units, activation='relu', recurrent_dropout=0.1,
                                   activity_regularizer=l2(1e-4)))(emb)

predictor = Dense(vocab_size, activation='softmax', 
                  activity_regularizer=l2(1e-4))(bidirectional)

model = Model(inputs=input_, outputs=predictor)
optimizer = LazyAdam(clipnorm=1)
model.compile(optimizer, 'sparse_categorical_crossentropy', 
              metrics=['sparse_categorical_crossentropy'])

model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 20, 256)           256000    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 1024)              3149824   
_________________________________________________________________
dense_7 (Dense)              (None, 1000)              1025000   
Total params: 4,430,824
Trainable params: 4,430,824
Non-trainable params: 0
_________________________________________________________________


In [60]:
from keras.callbacks import LambdaCallback, EarlyStopping, ModelCheckpoint

n_epochs = 250

def on_epoch_end(epoch, _, epochs_split=10):
    if epoch % epochs_split == 0 and epoch > 0:
        
        seed = sp.encode_as_ids(np.random.choice(poems))[:window_size]
        
        print(len(seed))
        
        print('generating poems:')
        
        for temp in [0.25, 0.75, 1.5]:
            generate_seq(seed, temperature=temp, use_sample=True)
            

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
es_callback = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, 
                            verbose=1, mode='auto')

filepath="model-ckpt-epoch{epoch:08d}.hdf5"
checkpoint_callback = ModelCheckpoint(filepath, save_weights_only=True, period=10)

model.fit(X, y, batch_size=256, epochs=n_epochs,
          callbacks=[print_callback, es_callback, checkpoint_callback])

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
20
generating poems:
seed:  from fairest creatures we desire increase 
 that thereby beautys rose might never
temperature: 0.25
generated:
 from fairest creatures we desire increase 
 that thereby beautys rose might never 
 and be is a self and loper of a heart 
 and i i i and mine eye in me 
 and you with the love and my love in me 
 and all the marper of my love to me 
 but 

seed:  from fairest creatures we desire increase 
 that thereby beautys rose might never
temperature: 0.75
generated:
 from fairest creatures we desire increase 
 that thereby beautys rose might never live 
 than i you do not so bo lovelyment 
 then i with thee not be to love of moing 
 and so her world o my hards prolinters and 
 nor thou and stord to theted in your fill 
 

seed:  from fairest creatures we desire increase 
 that thereby beautys rose might never
temperature: 1.5


KeyboardInterrupt: 

In [77]:
def write_poems(first_line, temps=[1.0]):
    
    output = ''
    
    def reverse_formatting(s):
        return s.replace('▁', ' ').replace('@', '\n').replace('<unk>', '')
    
    seed = sp.encode_as_ids(first_line.lower().replace('\n', ' @'))    
    print('seed: ' + reverse_formatting(first_line))

    output += 'seed: ' + reverse_formatting(first_line) + '\n'
  
    for temperature in temps:
        

        result = ''
        lines = 1
        
        in_text = seed.copy()
        while len(in_text) < window_size:
            in_text = [sp.pad_id()] + in_text
            
        while lines < 14 and len(result) < 1000:
            yhat = model.predict(np.array([in_text[-window_size:]]), verbose=0)
            yhat = sample(yhat[0], temperature)
            
            in_text.append(yhat)
            result += (number_to_text([yhat]))

            lines = result.count('@')

        r_seed = reverse_formatting(number_to_text(seed))
        print('temperature:', temperature)
        print('generated:\n' + r_seed + reverse_formatting(result), '\n')
        
        output += 'temperature: ' + str(temperature) + '\n'
        output += 'generated:\n' + r_seed + reverse_formatting(result) + '\n'
        
    return output

In [119]:
model.load_weights('model-ckpt-epoch00000030.hdf5')

prompt = "from fairest creatures we desire increase\n that thereby beautys rose might never"
tlist = [0.25, 0.75, 1.5]

for _ in range(1):
    write_poems(prompt, temps=tlist)

seed: from fairest creatures we desire increase
 that thereby beautys rose might never
temperature: 0.25
generated:
 from fairest creatures we desire increase 
 that thereby beautys rose might never die 
 but as the thing they would have suvel 
 whereof a lose thee that i do abuse 
 so all the world doth live in doupy thought 
 and my love for they are still with thee 
 and thou art more than thy self dost looking 
 as thou art therefore and my self dost be 
 thou dost be not let me give them to be 
 thou mayst thou art not for my verse to been 
 to make them love to my self ill show 
 and thou art that which i will be dwelled 
 in my great triumphor and thou dost thou use 
 and to the hands of all the world doth please 
 spending on the world which they did shrive 
 

temperature: 0.75
generated:
 from fairest creatures we desire increase 
 that thereby beautys rose might never be 
 the means of my love and darter not thy sits 
 so by that i force but thou dost thou prove 
 mine eye t

In [115]:
def perplexity(y_true, y_pred):
    ce = -(1.0/y_true.shape[0]) * np.sum(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))
    perplexity = np.exp(ce)
    return perplexity

print(y[:10])
y_pred = model.predict(X)
yy = y.astype(int)
yy = np.eye(np.max(yy) + 1)[yy]

z = np.zeros((yy.shape[0], 1000))
z[:,:-1] = yy

print(y_pred.shape)
print(z.shape)

print(perplexity(z, y_pred))

[525.   7.  81. 103.  17. 301. 994.  20. 260. 142.]


KeyboardInterrupt: 