#### Dependencies


In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense,LSTM,GRU

#### Creating our Dataset

In [3]:
with open("words.txt","r") as file:
    text=file.read()

In [5]:
w_len=len(text)
indx=[]
for i in range(w_len):
    if text[i]==".":
        indx.append(i)

Extracting sentence from given text

In [6]:
sentence=[]
j=0
for ind in indx:
    sen=text[j:ind]
    sentence.append(sen)
    j=ind+1

In [7]:
sentence[6]

' Others prepared their work for the day and tried to keep a steady pace so they would not feel tired'

In [8]:
tokenizer=Tokenizer()

In [9]:
tokenizer.fit_on_texts([text])

In [10]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'to': 4,
 'people': 5,
 'day': 6,
 'of': 7,
 'in': 8,
 'their': 9,
 'they': 10,
 'made': 11,
 'air': 12,
 'slow': 13,
 'light': 14,
 'simple': 15,
 'steady': 16,
 'this': 17,
 'small': 18,
 'calm': 19,
 'felt': 20,
 'when': 21,
 'some': 22,
 'feel': 23,
 'was': 24,
 'moved': 25,
 'soft': 26,
 'with': 27,
 'from': 28,
 'sky': 29,
 'sound': 30,
 'every': 31,
 'sat': 32,
 'on': 33,
 'quiet': 34,
 'or': 35,
 'warm': 36,
 'it': 37,
 'open': 38,
 'many': 39,
 'liked': 40,
 'life': 41,
 'easy': 42,
 'came': 43,
 'clear': 44,
 'smooth': 45,
 'one': 46,
 'another': 47,
 'rhythm': 48,
 'bright': 49,
 'joy': 50,
 'like': 51,
 'stream': 52,
 'way': 53,
 'place': 54,
 'walked': 55,
 'touched': 56,
 'ground': 57,
 'wind': 58,
 'carried': 59,
 'gentle': 60,
 'as': 61,
 'space': 62,
 'here': 63,
 'each': 64,
 'spread': 65,
 'fields': 66,
 'colors': 67,
 'new': 68,
 'looked': 69,
 'at': 70,
 'for': 71,
 'were': 72,
 'near': 73,
 'voices': 74,
 'even': 75,
 'who': 76,
 'w

In [11]:
len(tokenizer.word_counts)

230

In [12]:
df=pd.DataFrame(columns=["Input","Output"])

In [13]:
sentence=tokenizer.texts_to_sequences(sentence)

In [14]:
for sen in sentence:
    leng = len(sen)       # len is now the built-in function

    for i in range(leng):
        df = pd.concat([df, pd.DataFrame([{"Input": sen[0:i], "Output": sen[i]}])],ignore_index=True)
       

In [15]:
df=df.iloc[1:]

In [16]:
df

Unnamed: 0,Input,Output
1,[1],6
2,"[1, 6]",24
3,"[1, 6, 24]",34
4,"[1, 6, 24, 34]",2
5,"[1, 6, 24, 34, 2]",1
...,...,...
552,"[5, 40, 17, 15, 48, 2, 64, 68, 6, 20, 51, 47, ...",19
553,"[5, 40, 17, 15, 48, 2, 64, 68, 6, 20, 51, 47, ...",87
554,"[5, 40, 17, 15, 48, 2, 64, 68, 6, 20, 51, 47, ...",2
555,"[5, 40, 17, 15, 48, 2, 64, 68, 6, 20, 51, 47, ...",16


So we have coverted the text into int endoded arrays and got input and output as dataset. now we need to zero padd inputs to make them uniform

In [17]:
x=df["Input"]
y=df["Output"]

In [18]:
x=np.array(x)
y=np.array(y)

In [19]:
seq_leng=[len(x) for x in x]
max_len=max(seq_leng)
max_len

23

In [20]:
x=pad_sequences(x,maxlen=max_len,padding="pre")

In [21]:
x.shape

(556, 23)

In [22]:
## total words in vocabulary
vocab_size = x.max()

In [23]:
y=np.reshape(y,(-1,1))

In [24]:
y

array([[6],
       [24],
       [34],
       [2],
       [1],
       [12],
       [25],
       [8],
       [3],
       [13],
       [2],
       [26],
       [53],
       [5],
       [8],
       [1],
       [18],
       [54],
       [55],
       [27],
       [19],
       [95],
       [2],
       [96],
       [20],
       [97],
       [35],
       [98],
       [3],
       [36],
       [14],
       [28],
       [1],
       [29],
       [56],
       [1],
       [57],
       [2],
       [1],
       [58],
       [59],
       [3],
       [60],
       [30],
       [61],
       [37],
       [99],
       [100],
       [1],
       [38],
       [62],
       [39],
       [5],
       [40],
       [1],
       [15],
       [41],
       [63],
       [101],
       [64],
       [6],
       [20],
       [42],
       [4],
       [102],
       [21],
       [1],
       [103],
       [43],
       [104],
       [1],
       [14],
       [65],
       [105],
       [1],
       [66],
       [2],
       [1],
      

In [25]:
type(y)

numpy.ndarray

Using OHE on output data

In [26]:
ohe=OneHotEncoder(sparse_output=False)
y=ohe.fit_transform(y)

In [27]:
y.shape

(556, 230)

In [28]:
len(tokenizer.word_index)

230

So No of words = no of columns in output

In [None]:
input_l=x.shape[(1)]
model=Sequential([
    Embedding(input_dim=vocab_size+1,output_dim=100,input_length=input_l), ### there are 229 words and each OHEed x has 23 len
    GRU(150), # each GRU cell will have 150 neurons in it(1 layer)
    Dense(vocab_size,activation="softmax") # 229 words so 229 classes
])



In [30]:
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [31]:
model.summary()

In [32]:
model.fit(x,y,epochs=100)

Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.0827 - loss: 5.4125
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1007 - loss: 5.1235
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1007 - loss: 4.9425
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1007 - loss: 4.8579
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1025 - loss: 4.8017
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1313 - loss: 4.7271
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1385 - loss: 4.6140
Epoch 8/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1475 - loss: 4.4773
Epoch 9/100
[1m18/18[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1cb50dabc70>

Predicting a word

In [33]:
word="Each new day i want "
## tokeizing the text
word_token=tokenizer.texts_to_sequences([word])[0]
## paddeing
word_pad=pad_sequences([word_token],maxlen=max_len)

In [34]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'to': 4,
 'people': 5,
 'day': 6,
 'of': 7,
 'in': 8,
 'their': 9,
 'they': 10,
 'made': 11,
 'air': 12,
 'slow': 13,
 'light': 14,
 'simple': 15,
 'steady': 16,
 'this': 17,
 'small': 18,
 'calm': 19,
 'felt': 20,
 'when': 21,
 'some': 22,
 'feel': 23,
 'was': 24,
 'moved': 25,
 'soft': 26,
 'with': 27,
 'from': 28,
 'sky': 29,
 'sound': 30,
 'every': 31,
 'sat': 32,
 'on': 33,
 'quiet': 34,
 'or': 35,
 'warm': 36,
 'it': 37,
 'open': 38,
 'many': 39,
 'liked': 40,
 'life': 41,
 'easy': 42,
 'came': 43,
 'clear': 44,
 'smooth': 45,
 'one': 46,
 'another': 47,
 'rhythm': 48,
 'bright': 49,
 'joy': 50,
 'like': 51,
 'stream': 52,
 'way': 53,
 'place': 54,
 'walked': 55,
 'touched': 56,
 'ground': 57,
 'wind': 58,
 'carried': 59,
 'gentle': 60,
 'as': 61,
 'space': 62,
 'here': 63,
 'each': 64,
 'spread': 65,
 'fields': 66,
 'colors': 67,
 'new': 68,
 'looked': 69,
 'at': 70,
 'for': 71,
 'were': 72,
 'near': 73,
 'voices': 74,
 'even': 75,
 'who': 76,
 'w

In [35]:
import time
text = "It feel like"
for i in range(4):
    word_token = tokenizer.texts_to_sequences([text])[0]
    word_pad = pad_sequences([word_token], maxlen=max_len,padding="pre")
    predict = model.predict(word_pad)
    index = np.argmax(predict)
    for word_, indx in tokenizer.word_index.items():
        if indx==index:
            text = text + " " + word_
            print(text)
            time.sleep(3)    


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
It feel like slow
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
It feel like slow the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
It feel like slow the from
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
It feel like slow the from hands


In [37]:
import time
text = "The people liked"

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
The people liked sat
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
The people liked sat sat
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
The people liked sat sat soft
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
The people liked sat sat soft hours
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
The people liked sat sat soft hours beds
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
The people liked sat sat soft hours beds here
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
The people liked sat sat soft hours beds here felt
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
