# An LSTM that generates text character by character

In [2]:
import numpy as np
import keras

In [3]:
# A long, freely available text (Kafka, via projekt-gutenberg.org):
# https://www.projekt-gutenberg.org/kafka/verwandl/verwandl.html
with open("verwandlung.txt", "r", encoding="utf-8") as file:
    contents = file.read()

# Keep only lines 59..1951 of the file
# (presumably this strips header/footer text around the story - TODO confirm).
contents = "\n".join(contents.split("\n")[59:1952])

# Vocabulary: every distinct character that occurs in the text.
unique_chars = set(contents)

# Build the index <-> character lookup tables from the *sorted* vocabulary.
# Iterating the set directly would give a different order in every interpreter
# run (string hashes are randomised per process), so checkpoints saved by one
# kernel session could not be decoded correctly in another.
int_to_char = dict(enumerate(sorted(unique_chars)))
char_to_int = {char: index for index, char in int_to_char.items()}

In [4]:
# Display the distinct characters that make up the vocabulary.
unique_chars

{'\n',
 ' ',
 '!',
 "'",
 ',',
 '-',
 '.',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'z',
 '«',
 '»',
 'Ä',
 'Ö',
 'Ü',
 'ß',
 'ä',
 'ö',
 'ü'}

In [5]:
length = 40  # number of characters in every input window

# Translate the whole text into integer codes once, then cut windows out of it.
encoded = [char_to_int[char] for char in contents]
n_windows = len(encoded) - length

# X[k] holds `length` consecutive character codes,
# y[k] is the code of the character that directly follows that window.
X = [encoded[start:start + length] for start in range(n_windows)]
y = encoded[length:]

In [6]:
# 0 = [1, 0, 0, 0, 0, 0, ....]
# 1 = [0, 1, 0, 0, 0, 0, ...]

from keras.utils import to_categorical

X = to_categorical(X, num_classes=len(unique_chars))
y = to_categorical(y, num_classes=len(unique_chars))

In [7]:
# Shape of the one-hot input array: (windows, window length, vocabulary size).
X.shape

(121090, 40, 68)

In [8]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Take the layer sizes from the data instead of repeating the literals 40 and
# 68, so the model keeps matching X and y if the text or window length changes.
n_chars = len(char_to_int)

model = Sequential()
# 128 LSTM units read one window of `length` one-hot encoded characters.
model.add(LSTM(128, input_shape=(length, n_chars)))
# Softmax over the vocabulary: one probability per possible next character.
model.add(Dense(n_chars, activation="softmax"))

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [9]:
# Print the layers of the model together with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               100864    
                                                                 
 dense (Dense)               (None, 68)                8772      
                                                                 
Total params: 109,636
Trainable params: 109,636
Non-trainable params: 0
_________________________________________________________________


In [10]:
from keras.callbacks import ModelCheckpoint

# File name template for the checkpoints; the placeholders are the epoch number
# (two digits) and the training loss (two decimals).
checkpoint_path = "weights.{epoch:02d}-{loss:.2f}.hdf5"
save_model = ModelCheckpoint(filepath=checkpoint_path)

In [11]:
# Train for 10 epochs in batches of 32 windows, with the checkpoint callback attached.
# The returned History object is kept so the per-epoch loss stays available
# (history.history["loss"]) instead of only echoing an opaque object repr.
history = model.fit(X, y, batch_size=32, epochs=10, callbacks=[save_model])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b54ce08eb0>

# Second part: generating text

In [14]:
# Read the source text again and keep the same line range (59..1951) as in the
# first part, so character positions refer to the same text.
with open("verwandlung.txt", mode="r", encoding="utf-8") as file:
    all_lines = file.read().split("\n")

contents = "\n".join(all_lines[59:1952])

In [16]:
# Seed text for the generator. It has to contain exactly `length` characters,
# because that is the window size the model was trained on; deriving the end
# of the slice from `length` keeps the two in sync.
seed_start = 100
line = contents[seed_start:seed_start + length]
line

'ungeheueren Ungeziefer verwandelt. Er la'

In [17]:
import numpy as np
from keras.utils import to_categorical

# One-hot encode the seed text: one row per character, one column per vocabulary entry.
seed = [char_to_int[l] for l in line]
seed = to_categorical(seed, num_classes=len(char_to_int))

# Read the sizes from the encoded seed instead of hard-coding 40 and 68.
window, n_chars = seed.shape

for i in range(0, 100):
    # The model expects a leading batch dimension; take the single result row.
    prediction = model.predict(seed.reshape(1, window, n_chars), verbose=0)
    # The softmax output is float32 and may miss a sum of exactly 1 by a rounding
    # error, which np.random.choice rejects ("probabilities do not sum to 1").
    # Renormalise in float64 before sampling.
    prediction = prediction[0].astype(np.float64)
    prediction /= prediction.sum()

    # Sample the next character from the predicted distribution (not argmax),
    # then emit it without a trailing newline.
    pos = np.random.choice(n_chars, 1, p=prediction)[0]
    print(int_to_char[pos], end="")

    new_char = to_categorical([pos], num_classes=n_chars)

    # Slide the window: drop the oldest character, append the sampled one.
    seed = seed[1:, :]
    seed = np.append(seed, new_char, axis=0)

nge
olss mit, wo was gewührt und
über dieses Holgen? Lach zu schöben, auf der Zimmermackte diese Kre