# Nepali Next word predictor using LSTM

In [1]:
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.16.2
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("GPU is available:", physical_devices)
else:
    print("GPU is NOT available")

GPU is available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
## Data Preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

##laod the dataset
with open('combined.txt','r') as file:
    text=file.read().lower()

## Tokenize the text-creating indexes for words
tokenizer = Tokenizer(num_words=10000)  # Keep the most frequent 10,000 words
tokenizer.fit_on_texts([text])
total_words=len(tokenizer.word_index)+1
total_words



25995

In [5]:
tokenizer.word_index

{'र': 1,
 'पनि': 2,
 'छ': 3,
 'छन्': 4,
 'भने': 5,
 'हो': 6,
 'उनले': 7,
 'थियो': 8,
 'थिए': 9,
 'नेपाली': 10,
 'यो': 11,
 'गरेको': 12,
 'भएको': 13,
 'म': 14,
 'कविता': 15,
 'तर': 16,
 'लागि': 17,
 'नै': 18,
 'गरे': 19,
 'गर्ने': 20,
 'गरेका': 21,
 'एक': 22,
 'पुस्तक': 23,
 'त': 24,
 'मैले': 25,
 'गर्न': 26,
 'केही': 27,
 'तथा': 28,
 'त्यो': 29,
 'मेरो': 30,
 'भन्ने': 31,
 'गर्दै': 32,
 'साहित्य': 33,
 'मलाई': 34,
 'रूपमा': 35,
 'आफ्नो': 36,
 'एउटा': 37,
 'उनी': 38,
 'हुन्': 39,
 'बताए': 40,
 'पुरस्कार': 41,
 'हुन्छ': 42,
 'कुनै': 43,
 'नेपाल': 44,
 'गरिएको': 45,
 'हुने': 46,
 'दुई': 47,
 'कथा': 48,
 'रहेको': 49,
 'कुरा': 50,
 'उनको': 51,
 'भयो': 52,
 'कवि': 53,
 'गीत': 54,
 'छैन': 55,
 'भए': 56,
 'दिन': 57,
 'कला': 58,
 'धेरै': 59,
 'नयाँ': 60,
 'हुन': 61,
 'हामी': 62,
 'सार्वजनिक': 63,
 'भएका': 64,
 'मात्र': 65,
 'साहित्यकार': 66,
 'लेखक': 67,
 'पहिलो': 68,
 'सम्मान': 69,
 'आएको': 70,
 'बढी': 71,
 'कि': 72,
 'भनेर': 73,
 'कृति': 74,
 'बजारमा': 75,
 'गरी': 76,
 'मा': 77,
 'वर्ष': 78,


In [6]:
## create input sequences
input_sequences=[]
for line in text.split('\n'):
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)

#converting the text into sequences, i.e converting each word into sequence of it's and pervious word


In [7]:
input_sequences

[[6491, 400],
 [6491, 400, 252],
 [6491, 400, 252, 243],
 [6491, 400, 252, 243, 6492],
 [6491, 400, 252, 243, 6492, 1423],
 [6491, 400, 252, 243, 6492, 1423, 1019],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047, 592],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047, 592, 3707],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047, 592, 3707, 3708],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047, 592, 3707, 3708, 3048],
 [6491, 400, 252, 243, 6492, 1423, 1019, 3047, 592, 3707, 3708, 3048, 253],
 [6491,
  400,
  252,
  243,
  6492,
  1423,
  1019,
  3047,
  592,
  3707,
  3708,
  3048,
  253,
  3707],
 [111, 1103],
 [111, 1103, 400],
 [111, 1103, 400, 4761],
 [111, 1103, 400, 4761, 4],
 [111, 1103, 400, 4761, 4, 1756],
 [111, 1103, 400, 4761, 4, 1756, 4762],
 [111, 1103, 400, 4761, 4, 1756, 4762, 89],
 [111, 1103, 400, 4761, 4, 1756, 4762, 89, 5],
 [111, 1103, 400, 4761, 4, 1756, 4762, 89, 5, 2],
 [111, 1103, 400, 4761, 4, 1756, 4762, 89, 5, 2, 38],


In [8]:
## Pad Sequences -> for making every words of equal sequences
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

16

What we did is, first conveted the words in sentences into their own vector which consist of themselves and previous word and now finally we are making pad sequence for each variable so that they become equal length.

In [9]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0, 6491,  400],
       [   0,    0,    0, ..., 6491,  400,  252],
       [   0,    0,    0, ...,  400,  252,  243],
       ...,
       [   0,    0,    0, ...,  363,   18,   71],
       [   0,    0,    0, ...,   18,   71,  531],
       [   0,    0,    0, ...,   71,  531,   26]], dtype=int32)

In [10]:
##create predicitors and label
#just converting the datasets into training and testing i.e x is all the word till the last word
# and the y is just the last word.
import tensorflow as tf
x,y=input_sequences[:,:-1],input_sequences[:,-1]

In [11]:
x

array([[   0,    0,    0, ...,    0,    0, 6491],
       [   0,    0,    0, ...,    0, 6491,  400],
       [   0,    0,    0, ..., 6491,  400,  252],
       ...,
       [   0,    0,    0, ..., 9952,  363,   18],
       [   0,    0,    0, ...,  363,   18,   71],
       [   0,    0,    0, ...,   18,   71,  531]], dtype=int32)

In [12]:
y

array([400, 252, 243, ...,  71, 531,  26], dtype=int32)

In [13]:
#conveting the y into categorical dataset
y=tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [15]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# ## Train our LSTM RNN

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

# ## Define the model
# model=Sequential()
# model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
# model.add(LSTM(150,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# model.add(Dense(total_words,activation="softmax"))

# # #Compile the model
# model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
# model.summary()



In [16]:
## GRU RNN
## Define the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(GRU(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dense(total_words,activation="softmax"))

# #Compile the model
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
model.summary()

2025-02-04 11:36:21.959271: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-02-04 11:36:21.959453: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-04 11:36:21.959582: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-04 11:36:21.960364: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-04 11:36:21.960410: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
## Train the model
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)


Epoch 1/50


2025-02-04 11:36:33.134631: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m2171/2171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 36ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 10.1731
Epoch 2/50
[1m2171/2171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 10.1730
Epoch 3/50
[1m 174/2171[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:05[0m 33ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00

KeyboardInterrupt: 

### HAD TO STOP TRAINING COZZ MY MACBOOK WAS HEATING LIKE CRAZY, I WAS TRAINING MODEL IN CPU 

In [20]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [21]:
## Save the model
model.save("next_word_lstm.h5")
## Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)



In [26]:
input_text="म नै हुँ जो मलाई नै"

print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word PRediction:{next_word}")

Input text:म नै हुँ जो मलाई नै
Next Word PRediction:थियो
