In [2]:
# Import necessary libraries from TensorFlow and Keras

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
# 1. Tokenization and sequence processing

# Create an unfitted Keras Tokenizer with default settings; its vocabulary
# is learned later, when fit_on_texts is called on the training text.
tokenizer = Tokenizer()

In [6]:
# Load the whole training corpus from traindata.txt into one string (decoded as UTF-8)
with open('traindata.txt', 'r', encoding='utf-8') as file:
    faqs = file.read()  # entire file contents; split on newlines later when building sequences

In [7]:
# Fit the tokenizer on the full corpus (passed as a single document) to build the word -> id vocabulary
tokenizer.fit_on_texts([faqs])   

# Vocabulary size used for the model's input and output dimensions:
# the number of learned words plus one extra slot for the padding token
vocab_size = len(tokenizer.word_index) + 1 # Add 1 for padding token

In [8]:
# Number of distinct words the tokenizer learned (without the extra padding slot)
len(tokenizer.word_index)

282

In [9]:
# Build the training n-grams: for every line of the corpus, collect each
# prefix of its token ids that is at least two tokens long (context + target).

input_sequences = []
for line in faqs.split('\n'):
    # Token ids for this line, using the vocabulary fitted above
    token_ids = tokenizer.texts_to_sequences([line])[0]

    # Prefix lengths run from 2 up to the full line; lines with fewer than
    # two known tokens contribute nothing
    input_sequences.extend(
        token_ids[:prefix_len] for prefix_len in range(2, len(token_ids) + 1)
    )

In [10]:
# Length of the longest n-gram; every sequence is padded up to this many tokens
max_len = max(map(len, input_sequences))

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every n-gram with zeros up to max_len so they stack into one
# rectangular array whose last column is always the final token of the n-gram
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [12]:
# Display the padded array to confirm the zeros were added on the left
padded_input_sequences

array([[  0,   0,   0, ...,   0,  93,   1],
       [  0,   0,   0, ...,  93,   1,  13],
       [  0,   0,   0, ...,   0,  11,   7],
       ...,
       [  0,   0,   0, ..., 279,  18, 280],
       [  0,   0,   0, ...,  18, 280, 281],
       [  0,   0,   0, ..., 280, 281, 282]])

In [13]:
# Split data into predictors (X) and target labels (y)
X = padded_input_sequences[:,:-1]    # every column except the last: the context tokens fed to the model

In [14]:
y = padded_input_sequences[:,-1]     # last column: the token id the model should predict

In [15]:
# (number of training sequences, tokens per input sequence)
X.shape

(863, 56)

In [16]:
# One integer label per training sequence (not yet one-hot encoded)
y.shape

(863,)

In [17]:
from tensorflow.keras.utils import to_categorical

# Convert target labels to one-hot encoded vectors for categorical prediction.
# The labels are taken straight from the last column of the padded array rather
# than from `y` itself, so re-running this cell yields the same 2-D result
# instead of one-hot encoding an already one-hot matrix.
# num_classes follows vocab_size so it stays correct if the corpus changes.
y = to_categorical(padded_input_sequences[:, -1], num_classes=vocab_size)

In [18]:
# Confirm X and the one-hot encoded y have the same number of rows
X.shape,y.shape

((863, 56), (863, 283))

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [20]:
# 2. Model Building
# Next-word predictor: token ids -> embeddings -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential()

# Embedding layer to learn 100-dimensional word embeddings from input tokens.
# `input_length` is intentionally not passed: it is deprecated in recent Keras
# releases, and the input width is fixed by model.build() below.
model.add(Embedding(input_dim=vocab_size, output_dim=100))

# First LSTM layer with return_sequences=True so the next LSTM receives the full sequence
model.add(LSTM(150, return_sequences=True))

# Second LSTM layer without return_sequences: passes a single output vector to the Dense layer
model.add(LSTM(150))

# Dense layer with softmax activation: one probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))

# Build with the predictor width: the padded sequence length minus the target token
model.build(input_shape=(None, max_len-1))



In [21]:
# 3. Compile and Train the Model
# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# Adam is the optimizer and accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Print a summary of the model's layers
model.summary()

In [23]:
# Train the model on the prepared dataset for 100 epochs.
# No validation data is passed, so the accuracy reported per epoch is training accuracy only.
# The History object returned by fit() is left as the cell's output.
model.fit(X,y,epochs=100)

Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 93ms/step - accuracy: 0.0604 - loss: 5.5259
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 93ms/step - accuracy: 0.0804 - loss: 5.0339
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.0773 - loss: 4.9896
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 99ms/step - accuracy: 0.0826 - loss: 5.0261
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - accuracy: 0.0532 - loss: 5.0779
Epoch 6/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 93ms/step - accuracy: 0.0734 - loss: 4.9443
Epoch 7/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.0853 - loss: 4.9021
Epoch 8/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 86ms/step - accuracy: 0.0911 - loss: 4.8307
Epoch 9/100
[1m27/27[0m [32m━━━━━━━━━

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step - accuracy: 0.9501 - loss: 0.2829
Epoch 69/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.9564 - loss: 0.2650
Epoch 70/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.9477 - loss: 0.2801
Epoch 71/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 94ms/step - accuracy: 0.9539 - loss: 0.2549
Epoch 72/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.9408 - loss: 0.2756
Epoch 73/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 96ms/step - accuracy: 0.9489 - loss: 0.2580
Epoch 74/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.9528 - loss: 0.2383
Epoch 75/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.9550 - loss: 0.2348
Epoch 76/100
[1m27/27[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x206799ecb80>

In [24]:
import time
# Seed text typed by the user; the loop below extends it by up to five predicted words
text = input()

for _ in range(5):
    # Encode the running text with the vocabulary learned during training
    token_text = tokenizer.texts_to_sequences([text])[0]
    # Pad on the left to the model's input width (max_len - 1, the same width
    # the model was built with) instead of a hard-coded length
    padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')
    # Index of the most probable next word
    pos = int(np.argmax(model.predict(padded_token_text)))

    # Direct id -> word lookup instead of scanning the whole word_index each step
    word = tokenizer.index_word.get(pos)
    if word is None:
        # Index 0 is the padding slot and maps to no word; the text would not
        # change, so every further prediction would be identical - stop here
        break
    text = text + " " + word
print(text)

subscription is the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 551ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
subscription is the validity period is 30 days
