- A language model predicts the next word in the sequence based on the specific words that have come before it in the sequence.

## Load Text 

In [1]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Read the rhyme from a hard-coded local path and echo it to check the load.
raw_text = load_doc('D:/NLP _Deep_Learning/Language Models/Rhyme.txt')
print(raw_text)

Sing a song of sixpence,
A pocket full of rye.
Four and twenty blackbirds,
Baked in a pie.

When the pie was opened
The birds began to sing;
Wasn't that a dainty dish,
To set before the king.

The king was in his counting house,
Counting out his money;
The queen was in the parlour,
Eating bread and honey.

The maid was in the garden,
Hanging out the clothes,
When down came a blackbird
And pecked off her nose.


## Clean Text 

In [2]:
# split() with no arguments breaks on any run of whitespace, newlines included
tokens = raw_text.split()
# rejoin with single spaces so the whole rhyme becomes one long line
raw_text = ' '.join(tokens)
print(raw_text)

Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king. The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey. The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose.


## Create Sequences 

In [4]:
# Slide a window over the text, one character at a time.
length = 10
# Every window is `length` input characters followed by the one character
# the model has to predict, hence length + 1 characters per slice.
sequences = [
    raw_text[start:start + length + 1]
    for start in range(len(raw_text) - length)
]
print('Total Sequences : %d' % len(sequences))

Total Sequences : 399


In [12]:
sequences[0:5] # each item is 11 characters: 10 inputs plus the character to predict ('g' in the first one)

['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']

## Save Sequences 

In [13]:
# save sequences to file, one sequence per line
def save_doc(lines,filename):
    """Write an iterable of strings to a text file, one item per line.

    Args:
        lines: Strings to write; they are joined with a newline, so no
            trailing newline is added after the last item.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write(data)
# save sequences to file (save_doc joins them with newlines, so one window per line)
out_filename = 'char_sequences.txt'
save_doc(sequences,out_filename)

# Train Language Model 

## Load Data 

In [14]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()
# load the saved sequences back as one newline-separated string
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
print(raw_text)

Sing a song
ing a song 
ng a song o
g a song of
 a song of 
a song of s
 song of si
song of six
ong of sixp
ng of sixpe
g of sixpen
 of sixpenc
of sixpence
f sixpence,
 sixpence, 
sixpence, A
ixpence, A 
xpence, A p
pence, A po
ence, A poc
nce, A pock
ce, A pocke
e, A pocket
, A pocket 
 A pocket f
A pocket fu
 pocket ful
pocket full
ocket full 
cket full o
ket full of
et full of 
t full of r
 full of ry
full of rye
ull of rye.
ll of rye. 
l of rye. F
 of rye. Fo
of rye. Fou
f rye. Four
 rye. Four 
rye. Four a
ye. Four an
e. Four and
. Four and 
 Four and t
Four and tw
our and twe
ur and twen
r and twent
 and twenty
and twenty 
nd twenty b
d twenty bl
 twenty bla
twenty blac
wenty black
enty blackb
nty blackbi
ty blackbir
y blackbird
 blackbirds
blackbirds,
lackbirds, 
ackbirds, B
ckbirds, Ba
kbirds, Bak
birds, Bake
irds, Baked
rds, Baked 
ds, Baked i
s, Baked in
, Baked in 
 Baked in a
Baked in a 
aked in a p
ked in a pi
ed in a pie
d in a pie.
 in a pie. 
in a pie. W
n a pie. Wh
 a p

In [15]:
# one list entry per saved sequence; split on '\n' only so spaces inside a sequence survive
lines = raw_text.split('\n')
lines

['Sing a song',
 'ing a song ',
 'ng a song o',
 'g a song of',
 ' a song of ',
 'a song of s',
 ' song of si',
 'song of six',
 'ong of sixp',
 'ng of sixpe',
 'g of sixpen',
 ' of sixpenc',
 'of sixpence',
 'f sixpence,',
 ' sixpence, ',
 'sixpence, A',
 'ixpence, A ',
 'xpence, A p',
 'pence, A po',
 'ence, A poc',
 'nce, A pock',
 'ce, A pocke',
 'e, A pocket',
 ', A pocket ',
 ' A pocket f',
 'A pocket fu',
 ' pocket ful',
 'pocket full',
 'ocket full ',
 'cket full o',
 'ket full of',
 'et full of ',
 't full of r',
 ' full of ry',
 'full of rye',
 'ull of rye.',
 'll of rye. ',
 'l of rye. F',
 ' of rye. Fo',
 'of rye. Fou',
 'f rye. Four',
 ' rye. Four ',
 'rye. Four a',
 'ye. Four an',
 'e. Four and',
 '. Four and ',
 ' Four and t',
 'Four and tw',
 'our and twe',
 'ur and twen',
 'r and twent',
 ' and twenty',
 'and twenty ',
 'nd twenty b',
 'd twenty bl',
 ' twenty bla',
 'twenty blac',
 'wenty black',
 'enty blackb',
 'nty blackbi',
 'ty blackbir',
 'y blackbird',
 ' black

## Encode Sequences 
- The sequences of characters must be encoded as integers. This means that each unique character will be assigned a specific integer value and each sequence of characters will be encoded as a sequence of integers. We can create the mapping given a sorted set of unique characters in the raw input data. The mapping is a dictionary of character values to integer values.

In [22]:
# Vocabulary: every distinct character of the loaded text, in sorted order.
# raw_text is the whole sequence file here, so the '\n' separator is part of it.
chars = sorted(set(raw_text))
chars

['\n',
 ' ',
 "'",
 ',',
 '.',
 ';',
 'A',
 'B',
 'C',
 'E',
 'F',
 'H',
 'S',
 'T',
 'W',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'w',
 'x',
 'y']

In [28]:
# Character -> integer index, numbered in the sorted order of chars.
mapping = {symbol: position for position, symbol in enumerate(chars)}
mapping

{'\n': 0,
 ' ': 1,
 "'": 2,
 ',': 3,
 '.': 4,
 ';': 5,
 'A': 6,
 'B': 7,
 'C': 8,
 'E': 9,
 'F': 10,
 'H': 11,
 'S': 12,
 'T': 13,
 'W': 14,
 'a': 15,
 'b': 16,
 'c': 17,
 'd': 18,
 'e': 19,
 'f': 20,
 'g': 21,
 'h': 22,
 'i': 23,
 'k': 24,
 'l': 25,
 'm': 26,
 'n': 27,
 'o': 28,
 'p': 29,
 'q': 30,
 'r': 31,
 's': 32,
 't': 33,
 'u': 34,
 'w': 35,
 'x': 36,
 'y': 37}

In [31]:
# Integer-encode every line: each character is replaced by its index in mapping.
sequences = [
    [mapping[symbol] for symbol in text_line]
    for text_line in lines
]
sequences

[[12, 23, 27, 21, 1, 15, 1, 32, 28, 27, 21],
 [23, 27, 21, 1, 15, 1, 32, 28, 27, 21, 1],
 [27, 21, 1, 15, 1, 32, 28, 27, 21, 1, 28],
 [21, 1, 15, 1, 32, 28, 27, 21, 1, 28, 20],
 [1, 15, 1, 32, 28, 27, 21, 1, 28, 20, 1],
 [15, 1, 32, 28, 27, 21, 1, 28, 20, 1, 32],
 [1, 32, 28, 27, 21, 1, 28, 20, 1, 32, 23],
 [32, 28, 27, 21, 1, 28, 20, 1, 32, 23, 36],
 [28, 27, 21, 1, 28, 20, 1, 32, 23, 36, 29],
 [27, 21, 1, 28, 20, 1, 32, 23, 36, 29, 19],
 [21, 1, 28, 20, 1, 32, 23, 36, 29, 19, 27],
 [1, 28, 20, 1, 32, 23, 36, 29, 19, 27, 17],
 [28, 20, 1, 32, 23, 36, 29, 19, 27, 17, 19],
 [20, 1, 32, 23, 36, 29, 19, 27, 17, 19, 3],
 [1, 32, 23, 36, 29, 19, 27, 17, 19, 3, 1],
 [32, 23, 36, 29, 19, 27, 17, 19, 3, 1, 6],
 [23, 36, 29, 19, 27, 17, 19, 3, 1, 6, 1],
 [36, 29, 19, 27, 17, 19, 3, 1, 6, 1, 29],
 [29, 19, 27, 17, 19, 3, 1, 6, 1, 29, 28],
 [19, 27, 17, 19, 3, 1, 6, 1, 29, 28, 17],
 [27, 17, 19, 3, 1, 6, 1, 29, 28, 17, 24],
 [17, 19, 3, 1, 6, 1, 29, 28, 17, 24, 19],
 [19, 3, 1, 6, 1, 29, 28, 17, 

In [32]:
# vocabulary size: number of distinct characters, i.e. the width of each one-hot vector
vocab_size = len(mapping)
print("Vocabulary Size : %d" % vocab_size)

Vocabulary Size : 38


## Split Inputs and Output 

In [36]:
from numpy import array
# turn the list of integer lists into a 2-D array so it can be sliced by column below
sequences = array(sequences)
sequences

array([[12, 23, 27, ..., 28, 27, 21],
       [23, 27, 21, ..., 27, 21,  1],
       [27, 21,  1, ..., 21,  1, 28],
       ...,
       [28, 20, 20, ..., 27, 28, 32],
       [20, 20,  1, ..., 28, 32, 19],
       [20,  1, 22, ..., 32, 19,  4]])

In [37]:
# inputs: every column except the last one of each encoded sequence
X = sequences[:,:-1]
X

array([[12, 23, 27, ..., 32, 28, 27],
       [23, 27, 21, ..., 28, 27, 21],
       [27, 21,  1, ..., 27, 21,  1],
       ...,
       [28, 20, 20, ...,  1, 27, 28],
       [20, 20,  1, ..., 27, 28, 32],
       [20,  1, 22, ..., 28, 32, 19]])

In [39]:
# target: the last column of each encoded sequence, i.e. the character to predict
y = sequences[:,-1]
y

array([21,  1, 28, 20,  1, 32, 23, 36, 29, 19, 27, 17, 19,  3,  1,  6,  1,
       29, 28, 17, 24, 19, 33,  1, 20, 34, 25, 25,  1, 28, 20,  1, 31, 37,
       19,  4,  1, 10, 28, 34, 31,  1, 15, 27, 18,  1, 33, 35, 19, 27, 33,
       37,  1, 16, 25, 15, 17, 24, 16, 23, 31, 18, 32,  3,  1,  7, 15, 24,
       19, 18,  1, 23, 27,  1, 15,  1, 29, 23, 19,  4,  1, 14, 22, 19, 27,
        1, 33, 22, 19,  1, 29, 23, 19,  1, 35, 15, 32,  1, 28, 29, 19, 27,
       19, 18,  1, 13, 22, 19,  1, 16, 23, 31, 18, 32,  1, 16, 19, 21, 15,
       27,  1, 33, 28,  1, 32, 23, 27, 21,  5,  1, 14, 15, 32, 27,  2, 33,
        1, 33, 22, 15, 33,  1, 15,  1, 18, 15, 23, 27, 33, 37,  1, 18, 23,
       32, 22,  3,  1, 13, 28,  1, 32, 19, 33,  1, 16, 19, 20, 28, 31, 19,
        1, 33, 22, 19,  1, 24, 23, 27, 21,  4,  1, 13, 22, 19,  1, 24, 23,
       27, 21,  1, 35, 15, 32,  1, 23, 27,  1, 22, 23, 32,  1, 17, 28, 34,
       27, 33, 23, 27, 21,  1, 22, 28, 34, 32, 19,  3,  1,  8, 28, 34, 27,
       33, 23, 27, 21,  1

- Next, we need to one-hot encode each character. That is, each character becomes a vector as long as the vocabulary (38 elements) with a 1 marked for the specific character. This provides a more precise input representation for the network. It also provides a clear objective for the network to predict, where a probability distribution over characters can be output by the model and compared to the ideal case of all 0 values with a 1 for the actual next character. We can use the `to_categorical()` function in the Keras API to one-hot encode the input and output sequences.

-  Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}:
- > labels
  - array([0, 2, 1, 2, 0])
  - `to_categorical` converts this into a matrix with as many columns as there are classes. The number of rows stays the same.
> to_categorical(labels)
- array([[ 1.,  0.,  0.],
         [ 0.,  0.,  1.],
         [ 0.,  1.,  0.],
         [ 0.,  0.,  1.],
         [ 1.,  0.,  0.]], dtype=float32)

In [40]:
from keras.utils import to_categorical
# one-hot encode each input row separately, then stack the results into one array
sequences = [to_categorical(x,num_classes = vocab_size) for x in X]
X = array(sequences)
# one-hot encode the target characters with the same number of classes
y = to_categorical(y,num_classes = vocab_size)

Using TensorFlow backend.


In [41]:
# inspect the one-hot encoded inputs
X

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [44]:
# expected layout: (number of sequences, time steps, vocabulary size)
X.shape

(399, 10, 38)

In [68]:
# first training sample: one one-hot row per input character
X[0]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [42]:
# inspect the one-hot encoded targets
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [43]:
# expected layout: (number of sequences, vocabulary size)
y.shape

(399, 38)

# Fit Model
- The model is defined with an input layer that takes sequences that have 10 time steps and 38 features for the one hot encoded input sequences. Rather than specify these numbers, we use the second and third dimensions on the X input data. This is so that if we change the length of the sequences or size of the vocabulary, we do not need to change the model definition. The model has a single LSTM hidden layer with 75 memory cells, chosen with a little trial and error. The model has a fully connected output layer that outputs one vector with a probability distribution across all characters in the vocabulary. A softmax activation function is used on the output layer to ensure the output has the properties of a probability distribution.

In [46]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from pickle import dump
# define the model
def define_model(X):
    """Build, compile and summarize the character-level LSTM network.

    Args:
        X: Training inputs; X.shape[1] and X.shape[2] are used as the
            LSTM input shape, so the array itself is never modified.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    time_steps, n_features = X.shape[1], X.shape[2]
    recurrent = LSTM(75, input_shape=(time_steps, n_features))
    # The output width is taken from the module-level vocab_size, not from X.
    classifier = Dense(vocab_size, activation='softmax')

    model = Sequential()
    for layer in (recurrent, classifier):
        model.add(layer)

    # compile model
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # summarize defined model
    model.summary()
    return model

# build the network from the shape of the training data
model = define_model(X)
# fit model
model.fit(X,y,epochs = 100, verbose = 2)
# persist the trained weights and architecture for the generation step
model.save('model.h5')
# save the mapping; the with-block makes sure the pickle is flushed and the
# handle closed instead of leaving that to garbage collection
with open('mapping.pkl','wb') as mapping_file:
    dump(mapping,mapping_file)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 75)                34200     
_________________________________________________________________
dense_1 (Dense)              (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________






Epoch 1/100
 - 1s - loss: 3.6164 - accuracy: 0.0927
Epoch 2/100
 - 0s - loss: 3.4954 - accuracy: 0.1905
Epoch 3/100
 - 0s - loss: 3.1633 - accuracy: 0.1905
Epoch 4/100
 - 0s - loss: 3.0519 - accuracy: 0.1905
Epoch 5/100
 - 0s - loss: 3.0227 - accuracy: 0.1905
Epoch 6/100
 - 0s - loss: 2.9981 - accuracy: 0.1905
Epoch 7/100
 - 0s - loss: 2.9875 - accuracy: 0.1905
Epoch 8/100
 - 0s - loss: 2.9680 - accuracy: 0.1905
Epoch 9/100
 - 0s - loss: 2.9575 - accuracy: 0.1905
Epoch 10/100
 - 0s - loss: 2.9418 - accuracy: 0.1905
Epoch 11/100
 - 0s - loss: 2.9290 - accuracy: 0.1930
Epoch 12/100
 - 0s - loss: 2.9136 - accuracy: 0.1905
Epoch 13/100
 - 0s - loss: 2.9061 - accuracy: 0.1905
Epoch 14/100
 - 0s - loss: 2.8700 - accuracy: 0.2105
Epoch 15/100
 - 0s - loss: 2.8534 - accuracy: 0.2155
Epoch 16/100
 - 0s - loss: 2.8196 - accuracy: 0.2105
Epoch 17/100
 - 0s - loss: 2.7877 - accuracy: 0.2306
Epoch 18/100
 - 0s - loss: 2.7614 - accuracy: 0.2155
Epoch 19/100
 - 0s - loss: 2.7199 - accuracy: 0.2456
Ep

# Generate Text

## Load Model 

In [60]:
from keras.models import load_model
from pickle import load
# load the model
model = load_model('model.h5')

# load the mapping; the with-block closes the file once the pickle is read
# NOTE: unpickling is only safe for files you produced yourself, such as this one
with open('mapping.pkl','rb') as mapping_file:
    mapping = load(mapping_file)

## Generate Characters 
We must provide sequences of 10 characters as input to the model in order to start the generation
process. We will pick these manually. A given input sequence will need to be prepared in the
same way as preparing the training data for the model. First, the sequence of characters must
be integer encoded using the loaded mapping.

In [70]:
from keras.preprocessing.sequence import pad_sequences
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` one predicted character at a time.

    Args:
        model: Trained model; called as ``model.predict(batch, verbose=0)``
            and expected to return one row of per-character scores.
        mapping: Dict of character -> integer index used during training.
        seq_length: Number of trailing characters fed to the model each step.
        seed_text: Starting text; every character must be a key of ``mapping``.
        n_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character that is not in ``mapping``.
    """
    # Build the reverse lookup once instead of scanning the dict on every step.
    # setdefault keeps the first character seen for an index, matching a
    # front-to-back scan of the mapping.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length (only the most recent
        # seq_length characters are kept)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character: take the highest-scoring class of the single
        # sample; this is what predict_classes did for a softmax output, but
        # it also works on Keras versions where predict_classes was removed
        scores = model.predict(encoded, verbose=0)
        yhat = int(scores.argmax(axis=-1)[0])
        # reverse map integer to character ('' when the index is unknown)
        out_char = index_to_char.get(yhat, '')
        # append to input; must be out_char, not a leftover loop variable
        in_text += out_char
    return in_text

In [72]:
# test start of rhyme
print(generate_seq(model, mapping, 10, 'Sing a son', 20))
# test mid-line
print(generate_seq(model, mapping, 10, 'king was i', 20))
# test not in original
print(generate_seq(model, mapping, 10, 'hello worl', 20))

Sing a song of sixpence, A poc
king was in his counting house
hello worl  peee fffff aeesree


- Running the example generates three sequences of text. The first is a test to see how the model does at starting from the beginning of the rhyme. The second is a test to see how well it does at beginning in the middle of a line. The final example is a test to see how well it does with a sequence of characters never seen before.
- We can see that the model did very well with the first two examples, as we would expect. We can also see that the model still generated something for the new text, but it is nonsense.