In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from numpy.random import seed
seed(1)
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

In [2]:
from english_words import get_english_words_set
web2lowerset = get_english_words_set(['web2'], lower=True)
len(web2lowerset)

234450

In [3]:
# Filter for words containing e n t i r a
corpus = list(web2lowerset)
valid_letters = ['e', 'n', 't', 'i', 'r','a']
dictionary = []
for word in corpus:
    flag = True
    for letters in word:
        if letters not in valid_letters:
            flag = False
            break
    if flag:
        dictionary.append(word)

In [4]:
X_for_train = []
X_for_test = []
for i in range(len(dictionary)):
    if i%3 == 0:
        X_for_test.append(dictionary[i])
    else:
        X_for_train.append(dictionary[i])

len(X_for_train)

409

In [9]:
X_for_train[1]

'tarrateen'

In [8]:
tokenizer = Tokenizer(char_level=True)
def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_chars = len(tokenizer.word_index) + 1
    
    ## convert data to a token sequence 
    input_sequences = []
    for word in corpus:
        token_list = tokenizer.texts_to_sequences([word])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_chars
inp_sequences, total_chars = get_sequence_of_tokens(X_for_train)

inp_sequences[:15]

[[2, 5],
 [2, 5, 1],
 [2, 5, 1, 3],
 [2, 5, 1, 3, 5],
 [2, 5, 1, 3, 5, 6],
 [2, 5, 1, 3, 5, 6, 4],
 [1, 2],
 [1, 2, 5],
 [1, 2, 5, 5],
 [1, 2, 5, 5, 2],
 [1, 2, 5, 5, 2, 1],
 [1, 2, 5, 5, 2, 1, 3],
 [1, 2, 5, 5, 2, 1, 3, 3],
 [1, 2, 5, 5, 2, 1, 3, 3, 4],
 [1, 3]]

In [10]:
max_sequence_length = max([len(seq) for seq in inp_sequences])
padded_sequences = pad_sequences(inp_sequences, maxlen=max_sequence_length, padding='pre')

padded_sequences[:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5],
       [0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 1],
       [0, 0, 0, 0, 0, 0, 0, 2, 5, 1, 3],
       [0, 0, 0, 0, 0, 0, 2, 5, 1, 3, 5],
       [0, 0, 0, 0, 0, 2, 5, 1, 3, 5, 6],
       [0, 0, 0, 0, 2, 5, 1, 3, 5, 6, 4],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 5],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 5, 5],
       [0, 0, 0, 0, 0, 0, 1, 2, 5, 5, 2]], dtype=int32)

In [11]:
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

In [12]:
y = tf.keras.utils.to_categorical(y, num_classes=total_chars)

In [13]:
# Building the model
model = Sequential()
model.add(Embedding(total_chars, 10, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(total_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

2024-01-28 14:29:28.132788: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-28 14:29:28.132825: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-28 14:29:28.132837: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-28 14:29:28.132895: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-28 14:29:28.132922: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 10)            70        
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 7)                 707       
                                                                 
Total params: 45177 (176.47 KB)
Trainable params: 45177 (176.47 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100


2024-01-28 14:31:05.916144: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-01-28 14:31:06.289191: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-01-28 14:31:08.640782: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


60/60 - 7s - loss: 1.8380 - 7s/epoch - 111ms/step
Epoch 2/100
60/60 - 1s - loss: 1.7694 - 914ms/epoch - 15ms/step
Epoch 3/100
60/60 - 1s - loss: 1.6775 - 900ms/epoch - 15ms/step
Epoch 4/100
60/60 - 1s - loss: 1.5882 - 858ms/epoch - 14ms/step
Epoch 5/100
60/60 - 1s - loss: 1.5566 - 859ms/epoch - 14ms/step
Epoch 6/100
60/60 - 1s - loss: 1.5439 - 859ms/epoch - 14ms/step
Epoch 7/100
60/60 - 1s - loss: 1.5279 - 860ms/epoch - 14ms/step
Epoch 8/100
60/60 - 1s - loss: 1.5078 - 859ms/epoch - 14ms/step
Epoch 9/100
60/60 - 1s - loss: 1.5003 - 856ms/epoch - 14ms/step
Epoch 10/100
60/60 - 1s - loss: 1.4857 - 851ms/epoch - 14ms/step
Epoch 11/100
60/60 - 1s - loss: 1.4730 - 858ms/epoch - 14ms/step
Epoch 12/100
60/60 - 1s - loss: 1.4649 - 884ms/epoch - 15ms/step
Epoch 13/100
60/60 - 1s - loss: 1.4590 - 909ms/epoch - 15ms/step
Epoch 14/100
60/60 - 1s - loss: 1.4573 - 915ms/epoch - 15ms/step
Epoch 15/100
60/60 - 1s - loss: 1.4510 - 851ms/epoch - 14ms/step
Epoch 16/100
60/60 - 1s - loss: 1.4349 - 845ms/e

<keras.src.callbacks.History at 0x283e7e950>

In [18]:
def get_index(array):
    maxi = max(array)
    for i in range(len(array)):
        if array[i] == maxi:
            return i
    

def generate_word(seed_text, next_words, model, max_sequence_length):
    ans = []
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted_probabilities = predicted_probabilities[0] #to convert it into a 1D list
        predicted = get_index(predicted_probabilities)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += output_word
        ans.append(seed_text)
    return ans




In [20]:
print("4 words starting with ent is ")
gen_words = generate_word("ent", 4, model, max_sequence_length)
print(gen_words)

4 words starting with ent is 
['ente', 'enter', 'entert', 'enterta']


In [21]:
dictionary[:10]

['rat',
 'arterin',
 'tarrateen',
 'rantan',
 'teat',
 'tete',
 'renet',
 'nitrate',
 'enter',
 'terna']

In [30]:
import random

new_dictionary = []

for word in dictionary:
    if len(word) > 3:
        num = random.randint(1,len(word) - 3)
        new_word = word[:-num]
        new_dictionary.append(new_word)
    else:
         new_dictionary.append(word)



In [31]:
new_dictionary[:10]

['rat', 'arteri', 'tarrat', 'ran', 'tea', 'tet', 'ren', 'nitr', 'ent', 'ter']

In [32]:
from tqdm import tqdm

whole_corpus = []
for i in tqdm(range(len(new_dictionary))):
    answer = generate_word(new_dictionary[i], 4, model, max_sequence_length)
    whole_corpus.append(answer)



100%|██████████| 614/614 [01:08<00:00,  9.02it/s]


In [33]:
whole_corpus[:10]

[['ratt', 'ratte', 'rattee', 'ratteen'],
 ['arterin', 'arterine', 'arteriner', 'arterinera'],
 ['tarrate', 'tarratee', 'tarrateen', 'tarrateene'],
 ['rani', 'ranin', 'ranina', 'raninae'],
 ['tear', 'teare', 'tearer', 'teareri'],
 ['tetr', 'tetra', 'tetran', 'tetrane'],
 ['renn', 'renne', 'rennet', 'rennete'],
 ['nitra', 'nitrat', 'nitrate', 'nitrater'],
 ['ente', 'enter', 'entert', 'enterta'],
 ['tert', 'terti', 'tertia', 'tertian']]

In [34]:
generated_words_phonetic = []

for list_ele in whole_corpus:
    for word in list_ele:
        generated_words_phonetic.append(word)


In [35]:
generated_words_phonetic[:10]

['ratt',
 'ratte',
 'rattee',
 'ratteen',
 'arterin',
 'arterine',
 'arteriner',
 'arterinera',
 'tarrate',
 'tarratee']

In [36]:
len(generated_words_phonetic)

2456

In [37]:
with open("generated_words.txt", "w+") as fd:
    for items in generated_words_phonetic:
        fd.write("%s\n" %items)

fd.close()