In [1]:
import numpy as np
import os
import string
os.environ['KERAS_BACKEND']='tensorflow'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.models import Sequential, load_model


import tensorflow as tf
from pickle import dump,load

In [2]:
# Read the raw corpus as UTF-8. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open until garbage collection.
with open('raw txt/texts.txt','r',encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [3]:
def load_doc(filename, encoding=None):
    """Read an entire text file into memory and return it as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which is what this function
            always used; pass ``'utf-8'`` for UTF-8 encoded corpora.

    Returns:
        The full contents of the file as a ``str``.
    """
    # ``with`` guarantees the handle is closed even if ``read`` raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
def clean_doc(doc):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    Double hyphens are treated as word separators, ASCII punctuation is
    stripped from every whitespace-delimited token, and any token that is not
    entirely alphabetic after stripping (numbers, empty strings, mixed
    alphanumerics) is dropped.

    Args:
        doc: The raw text.

    Returns:
        The surviving tokens, lowercased, in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_token in doc.replace('--', ' ').split():
        bare = raw_token.translate(strip_punctuation)
        # The alphabetic check runs before lowercasing, as in the original pipeline.
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned
def save_doc(lines, filename, encoding=None):
    """Write ``lines`` to ``filename``, one entry per line.

    Entries are joined with ``'\\n'``; no trailing newline is written, so
    splitting the file contents on ``'\\n'`` yields the original entries back.

    Args:
        lines: Iterable of strings to write.
        filename: Destination path; an existing file is overwritten.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding, as before.
    """
    data = '\n'.join(lines)
    # ``with`` flushes and closes the file even if the write raises.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(data)
# Load the raw corpus and turn it into a flat list of cleaned word tokens.
# NOTE(review): load_doc is called without an explicit encoding, whereas the
# earlier cell reads this same corpus as UTF-8 - confirm the platform default matches.
file_name = './raw txt/texts.txt'
text = load_doc(file_name)
tokens = clean_doc(text)
# Number of distinct tokens that survive cleaning.
print('unique tokens : %d' %len(set(tokens)))

unique tokens : 7409


In [4]:
# Each training example is 50 context words followed by the word to predict.
length = 50 + 1

# Slide a `length`-word window over the corpus one token at a time.
# The stop value is len(tokens) + 1 because range() excludes its stop: without
# the +1 the final window (the one ending on the last token) is never produced.
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]

# Always rewrite the file. The sequences are rebuilt in memory on every run
# anyway, and skipping the write when the file already exists would leave a
# stale copy on disk that no longer matches `sequences` once the corpus or
# `length` changes.
out_filename = 'raw txt/saved.txt'
save_doc(sequences,out_filename)

In [5]:
# Reload the saved training sequences, one per line.
fname = out_filename
doc = load_doc(fname)
lines = doc.split('\n')

# Build the word -> integer index from the saved sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

# Encode the same `lines` the tokenizer was fitted on. Previously this encoded
# the `sequences` variable and then overwrote it, so the cell could not be
# re-run (the second run would pass integer lists instead of text) and the
# encoded data could disagree with what the tokenizer had seen.
sequences = tokenizer.texts_to_sequences(lines)
sequences[:5]

[[11,
  1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2250,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1265,
  35],
 [1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2250,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1265,
  35,
  1],
 [329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2250,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1265,
  35,
  1,
  2874],
 [7409,
 

In [6]:
# The tokenizer's word indices start at 1, so index 0 is left free for padding;
# the +1 makes the size cover indices 0..len(word_index) inclusive.
vocab_size = len(tokenizer.word_index) + 1 # first word index is 1; index 0 is reserved for padding
vocab_size

7410

In [7]:
# Stack the encoded sequences into one integer array, one row per example.
seqs = np.array(sequences)
# Every column except the last is model input; the last column is the word to predict.
x = seqs[:, :-1]
# One-hot encode the target word over the whole vocabulary.
y = to_categorical(seqs[:, -1], num_classes=vocab_size)
seq_length = x.shape[1]  # 50 words

In [8]:
# Word-level language model: embedding -> two stacked LSTMs -> dense softmax
# over the vocabulary.
# The input shape is declared with an explicit Input instead of Embedding's
# `input_length` argument, which Keras 3 deprecates and ignores. Declaring the
# shape up front also means the model is built before summary() runs, so the
# summary can report output shapes and parameter counts.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),
    Embedding(input_dim=vocab_size,output_dim=50),
    LSTM(100,return_sequences=True),  # pass the full sequence on to the second LSTM
    LSTM(100),
    Dense(100,activation='relu'),
    Dense(vocab_size,activation='softmax')  # one probability per vocabulary index
])
model.summary()



In [9]:
# Next-word prediction is trained as multi-class classification over the
# vocabulary; `y` is one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# NOTE(review): the logged epochs take roughly 100-140 s each, so 400 epochs is
# a multi-hour run. No validation data or early-stopping callback is passed.
model.fit(x,y,epochs=400,batch_size=128)

Epoch 1/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 106ms/step - accuracy: 0.0632 - loss: 6.4789
Epoch 2/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 105ms/step - accuracy: 0.1036 - loss: 5.7303
Epoch 3/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 118ms/step - accuracy: 0.1297 - loss: 5.4753
Epoch 4/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 123ms/step - accuracy: 0.1446 - loss: 5.2931
Epoch 5/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 129ms/step - accuracy: 0.1525 - loss: 5.1755
Epoch 6/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 125ms/step - accuracy: 0.1571 - loss: 5.1107
Epoch 7/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 123ms/step - accuracy: 0.1629 - loss: 4.9896
Epoch 8/400
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 151ms/step - accuracy: 0.1677 - loss: 4.9136
E

<keras.src.callbacks.history.History at 0x1c403261f90>

In [10]:
# Persist the trained network and the fitted tokenizer so text can be
# generated later without retraining.
model.save('nlm-word.keras')
# The context manager flushes and closes the pickle file before the next cell
# reads it back.
with open('word-tokenizer.pkl','wb') as tokenizer_file:
    dump(tokenizer,tokenizer_file)

In [11]:
# Restore the trained network and its tokenizer from disk.
model = load_model('nlm-word.keras')
# NOTE: unpickling can execute arbitrary code; only load tokenizer files that
# this notebook (or another trusted source) produced.
with open('word-tokenizer.pkl','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [12]:
def generate_seq(model,tokenizer,seq_length,seed_text,n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    At every step the running text is encoded with ``tokenizer``, padded or
    truncated to ``seq_length`` indices (the oldest words are dropped when it
    is too long), fed to ``model``, and the highest-scoring vocabulary index
    is appended as the next word.

    Args:
        model: Object whose ``predict`` returns scores over vocabulary indices.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: Number of word indices the model expects as input.
        seed_text: Text used to start generation.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
        A predicted index with no entry in ``word_index`` contributes an empty
        string.
    """
    # Invert the vocabulary once, so each predicted index is a dictionary
    # lookup instead of a scan over the whole vocabulary per generated word.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    in_text = seed_text
    result = list()
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncating='pre' keeps the most recent `seq_length` words.
        encoded = pad_sequences([encoded],maxlen=seq_length,truncating='pre')
        ypred = model.predict(encoded,verbose=0)
        # Greedy decoding: take the single most probable index.
        ypred = int(np.argmax(ypred))
        out_word = index_to_word.get(ypred, '')
        in_text += ' '+out_word
        result.append(out_word)
    return ' '.join(result)

In [13]:
# Reload the saved training sequences to pick a seed from.
doc = load_doc('raw txt/saved.txt')
lines = doc.split('\n')
# Model input length = number of words in a saved line minus one (the -1 is
# the target word that ends each training line).
seq_length = len(lines[0].split())-1
# Pick one saved line at random as the seed. No random seed is set, so the
# choice differs between runs.
seed_text = lines[np.random.randint(0,len(lines))]
seed_text

'that you do not praise justice but the appearance of it we shall think that you are only exhorting us to keep injustice dark and that you really agree with thrasymachus in thinking that justice is anothers good and the interest of the stronger and that injustice is a mans own'

In [14]:
# Generate 50 words that continue the randomly chosen seed text.
result = generate_seq(model,tokenizer,seq_length,seed_text,50)
print(result)

profit and interest though injurious to the weaker now as far as may be deemed removed from rich men that at the land of the soul true then shall we deem yourself you would say that although the prison is laid upon the spirit of power makes her own ends
