In [1]:
# Inspired by https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [7]:
import tensorflow as tf

# Open a short-lived session only to enumerate the devices TensorFlow sees;
# the session is closed again as soon as the list has been fetched.
with tf.Session() as sess:
    devices = sess.list_devices()
print(devices)

# Report whether a GPU is usable, followed by the GPU device name.
print('is_gpu', tf.test.is_gpu_available())
print(tf.test.gpu_device_name())

[_DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 268435456, 18176836799721446975)]
is_gpu False



In [2]:
# Load the corpus and keep only its first 500,000 characters.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
# NOTE(review): the encoding is assumed to be UTF-8 (the text contains
# accented French characters) -- confirm against the actual file. Without an
# explicit encoding the decoded text depends on the platform locale.
with open('legfrance.txt', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text[:500_000]

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(raw_text))
# Lookup table from each character to its position in the sorted vocabulary.
char_to_int = {char: position for position, char in enumerate(chars)}

In [4]:
# Corpus length (in characters) and vocabulary size, printed for reference.
n_vocab = len(chars)
n_chars = len(raw_text)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  500000
Total Vocab:  95


In [5]:
# Prepare the dataset of input -> output pairs encoded as integers.
# Each sample is a window of `seq_length` consecutive characters and its
# target is the single character that immediately follows that window.
#seq_length = 100
seq_length = 20
# dataX stays a plain list of lists (not an array): the generation cell
# takes one of these lists as its seed pattern.
dataX = []
dataY = []
for i in range(n_chars - seq_length):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# Reshape X to be [samples, time steps, features].
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# Normalize the integer codes into [0, 1).
X = X / float(n_vocab)
# One-hot encode the output variable. num_classes is pinned to the vocabulary
# size: left to be inferred, the width would be max(dataY) + 1, which is
# smaller than n_vocab whenever the highest-indexed character never occurs
# as a target, leaving the output layer narrower than the vocabulary.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

Total Patterns:  499980


In [6]:
# Network: a single 256-unit LSTM over the input window, dropout at rate 0.2,
# then a softmax layer with one unit per one-hot output column.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpointing: monitor the training loss and, with save_best_only, write a
# weights file only for epochs that improve on the best loss so far. The
# epoch number and loss are embedded in the file name.
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

# Train the model (alternative setting kept for reference: epochs=20, batch_size=128).
model.fit(X, y, epochs=5, batch_size=64, callbacks=callbacks_list)

Epoch 1/5

KeyboardInterrupt: 

In [16]:
# Restore weights from a checkpoint file written by an earlier training run.
# The name is hard-coded, so this cell only works if that exact file exists
# in the working directory.
filename = "weights-improvement-05-1.8063.hdf5"
model.load_weights(filename)
# Re-compile with the same loss and optimizer used when the model was defined.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [17]:
# Inverse lookup: vocabulary position -> character, used to decode predictions.
int_to_char = dict(enumerate(chars))

In [30]:
# Pick a random seed pattern. randint's upper bound is exclusive, so passing
# len(dataX) makes every pattern, including the last one, eligible.
start = numpy.random.randint(0, len(dataX))
# Work on a copy of the seed: the loop appends to `pattern`, and without the
# copy the first append would permanently lengthen the list stored in dataX.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# Generate 1000 characters, always taking the most probable next character.
generated = []
for i in range(1000):
    # Shape and scale the current window the same way as the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    generated.append(result)
    # Slide the window: add the predicted character, drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]
# Join once at the end instead of growing a string inside the loop.
res = ''.join(generated)
print(res, "\nDone.")

Seed:
" eurs et les parrains "
 de la pomsron de la dommission de crnmtis de lr des cenentes eu den conettisns de le dommission du cinéma et de l'image animée  
Article L212-2

Le crmiri d'exploitation den œuvres cinématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographiques du aenématographique