In [1]:
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Bidirectional
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.normalization import BatchNormalization
#from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pickle
import numpy as np
import os


In [2]:
# Read the raw corpus line by line. The context manager closes the file
# handle as soon as the lines are in memory.
with open("../input/bangla-final/bangla_final_nstu.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines once into a single space-separated string
# (the join does not need to be repeated per line).
data = ' '.join(lines)

# Strip line breaks, the BOM marker and curly quotes. Lines were joined with a
# space above, so removing '\n' does not glue neighbouring words together.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“', '').replace('”', '')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:1020]

In [3]:
# Character count of the cleaned corpus string.
corpus_char_count = len(data)
corpus_char_count

In [4]:
# Build the word index from the whole corpus (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer for the prediction cell. The context manager
# guarantees the pickle is flushed and the handle closed before it is read back.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of integer word indices.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

In [5]:
# Number of word indices in the encoded corpus.
token_count = len(sequence_data)
token_count

In [6]:
# One slot per known word plus one extra: the Keras Tokenizer starts word
# indices at 1 and never assigns index 0 to a word.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

In [7]:
# Slide a 4-token window over the encoded corpus: the first three tokens are
# the context and the fourth is the word to predict.
window_size = 4
sequences = [
    sequence_data[end - window_size:end]
    for end in range(window_size, len(sequence_data) + 1)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

In [8]:
# Split every 4-token window into its 3-token context (X) and its target token (y).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [12]:
# Length of ONE input sequence (tokens per sample), which is what
# Embedding(input_length=...) expects. len(sequences) would be the number of
# training windows instead, declaring a wrong model input shape.
seq_length = X.shape[1]

In [9]:
# Show the first few context rows next to their target indices.
preview_rows = 10
print("Data: ", X[:preview_rows])
print("Response: ", y[:preview_rows])

In [10]:
# One-hot encode the target indices for categorical cross-entropy.
# The ndim guard keeps the cell idempotent: re-running it must not one-hot
# encode an array that has already been encoded.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

In [14]:
# Import every layer from tensorflow.keras so that all layers belong to the
# same Keras implementation as Sequential/Bidirectional imported earlier
# (mixing the standalone `keras` package with `tensorflow.keras` can fail).
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import ModelCheckpoint

# Embedding -> two stacked bidirectional GRUs -> dense head with a softmax
# over the vocabulary (one probability per word index).
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=seq_length))
model.add(Bidirectional(GRU(100, return_sequences=True)))
model.add(Bidirectional(GRU(100)))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

# Save the model whenever the training loss improves. The file name is kept
# unchanged because the prediction cell loads this exact path.
checkpoint = ModelCheckpoint("next_words_LSTM.h5", monitor='loss', verbose=1, save_best_only=True)
# `metrics` is passed as a list, the documented form; newer Keras releases
# reject a bare string.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])



In [15]:
# Train on the full dataset; the checkpoint callback runs during training.
history = model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=100,
    callbacks=[checkpoint],
)

In [19]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the checkpointed model and the tokenizer saved during preprocessing.
model = load_model('next_words_LSTM.h5')
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a token.pkl produced by this notebook.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict, print and return the most likely next word for ``text``.

    Args:
        model: Object whose ``predict`` takes a 2-D array of word indices
            (one row per sample) and returns one score per vocabulary index.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and the
            ``index_word`` mapping (index -> word).
        text: The context to encode. It is passed to ``texts_to_sequences``
            as a single document; the interactive cell in this notebook
            passes a list of words.

    Returns:
        The predicted word, or ``""`` when none of the context words is known
        to the tokenizer or the predicted index has no word assigned.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))

    if sequence.size == 0:
        # Every word was out of vocabulary: there is nothing to feed the
        # model, so report "no prediction" instead of failing inside predict().
        predicted_word = ""
    else:
        preds = int(np.argmax(model.predict(sequence)))
        # Direct index -> word lookup instead of scanning word_index linearly.
        predicted_word = tokenizer.index_word.get(preds, "")

    print(predicted_word)
    return predicted_word

In [29]:
# Interactive prompt: keep predicting the next word until the user types the
# stop word.
CONTEXT_WORDS = 3  # the model was trained on 3-word contexts (X holds 3 tokens per row)

while True:
    text = input("এখানে টাইপ করুন: ")

    if text == "সমাপ্তি":
        print("প্রোগ্রাম সমাপ্ত....")
        break

    try:
        # split() without an argument drops the empty tokens produced by
        # repeated spaces; only the last CONTEXT_WORDS words are kept so the
        # input matches the context length used for training.
        text = text.split()[-CONTEXT_WORDS:]
        print(text)

        Predict_Next_Words(model, tokenizer, text)
    except Exception as e:
        # Best effort: report the problem and prompt again.
        print("Error occurred: ", e)

In [16]:
from matplotlib import pyplot as plt

# Training-loss curve: one point per epoch, epochs numbered from 1.
loss = history.history['loss']
epochs = range(1, 1 + len(loss))

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label='Training loss')
ax.set_title('Training loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [20]:
# Per-epoch training accuracy recorded in the fit history.
training_accuracy_by_epoch = history.history['accuracy']
training_accuracy_by_epoch

In [17]:
from matplotlib import pyplot as plt

# Training-accuracy curve: one point per epoch, epochs numbered from 1.
accuracy = history.history['accuracy']
epochs = range(1, 1 + len(accuracy))

fig, ax = plt.subplots()
ax.plot(epochs, accuracy, 'y', label='Training accuracy')
ax.set_title('Training accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [18]:
# Evaluate on the same data the model was fitted on; evaluate() returns the
# loss followed by the compiled metric.
evaluation = model.evaluate(X, y, batch_size=32)
score, acc = evaluation

for label, value in (('Score:', score), ('Accuracy:', acc)):
    print(label, value)