# Import Libraries

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import requests
import io

# Manually Defined Training Data

In [2]:
# Hand-written training corpus: one short, lowercase English sentence per entry.
# Every later step (vocabulary, n-gram sequences, training) is derived from this list.
data = [
    "i love to eat pizza",
    "the weather is nice today",
    "can you help me with this",
    "let's go to the park",
    "what time is it now",
    "i need to buy groceries",
    "how are you doing today",
    "the movie was really good",
    "she went to the store",
    "we should meet tomorrow",
    "do you like coffee or tea",
    "my phone battery is low",
    "he is working from home",
    "they are coming for dinner",
    "i have a meeting at noon",
    "the book was interesting",
    "we need more time to finish",
    "she has a beautiful voice",
    "let me know your thoughts",
    "the train arrives at eight",
    "i prefer tea over coffee",
    "he forgot his keys again",
    "we went shopping yesterday",
    "the kids are playing outside",
    "this restaurant has great food",
    "she is learning to drive",
    "i can't find my wallet",
    "they moved to a new house",
    "the project is due tomorrow",
    "we watched the sunset together",
    "he fixed the broken chair",
    "i need to call my mom",
    "the cat is sleeping on the couch",
    "she baked cookies for us",
    "we might go on vacation",
    "the team won the championship",
    "i ordered a new laptop",
    "they're getting married next year",
    "the flowers smell wonderful",
    "i finished reading the novel",
    "we need to clean the house",
    "he suggested a great idea",
    "the baby is crying loudly",
    "i lost my train of thought",
    "she wears glasses to read",
    "we planted trees in the garden",
    "the concert starts at seven",
    "i feel tired after work",
    "they adopted a cute puppy",
    "the soup tastes delicious"
]

# Tokenization

In [3]:
# Word-level tokenizer with default settings; fitting it on the corpus
# populates tokenizer.word_index (word -> integer id).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=data)

In [7]:
# Vocabulary size used for the embedding rows and the softmax width.
# One extra slot is added on top of the fitted word ids for index 0.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
print('Total words : ', total_words)

Total words :  163


# Making Sequences

In [14]:
# Expand every sentence into all of its prefixes of length >= 2
# (e.g. [a, b, c] -> [a, b], [a, b, c]); the last token of each prefix
# later becomes the prediction target.
input_sequences = [
    sentence_ids[:prefix_len]
    for sentence_ids in tokenizer.texts_to_sequences(data)
    for prefix_len in range(2, len(sentence_ids) + 1)
]

In [28]:
# Length of the longest n-gram prefix; all sequences are padded to this length.
max_length = max(map(len, input_sequences))

In [29]:
# Show the padded sequence length that the model input is derived from.
print(max_length)

7


In [30]:
# Left-pad every prefix to a common length so the target word
# always sits in the final column of the resulting 2-D array.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_length,
    padding='pre',
)

# Separating Input and Output

In [31]:
# Features are every column except the last (the context words);
# the label is the last column (the word to predict).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [33]:
# One-hot encode the target word ids for categorical_crossentropy.
# The ids are read from the last column of input_sequences instead of from `y`,
# so re-running this cell does not one-hot-encode an already one-hot array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

# Downloading GloVe Vectors

In [41]:
# -nc (no-clobber): skip the ~822 MB download when glove.6B.zip is already present,
# so re-running the notebook neither downloads it again nor creates glove.6B.zip.1.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2025-05-01 18:48:11--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-01 18:48:11--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-01 18:48:12--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [43]:
# -n: never overwrite files that were already extracted, so a re-run does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [44]:
# Path of the extracted 100-dimensional GloVe text file read by the embedding loader.
glove_file = 'glove.6B.100d.txt'

# Embedding Using 100-Dimensional GloVe Vectors

In [47]:
# Parse the GloVe text file into {word: float32 vector}.
# Each line holds a token followed by its whitespace-separated coefficients.
embedding_index = {}
with open(glove_file, encoding='utf-8') as glove_handle:
    for raw_line in glove_handle:
        fields = raw_line.split()
        token = fields[0]
        embedding_index[token] = np.asarray(fields[1:], dtype='float32')

In [48]:
# Build the (total_words x embedding_dim) lookup table for the Embedding layer.
# Row r holds the GloVe vector of the word whose tokenizer id is r; words that
# are missing from embedding_index keep their all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((total_words, embedding_dim))
for token, row in tokenizer.word_index.items():
    glove_vector = embedding_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

# Model Architecture

In [68]:
# Next-word classifier: frozen pre-trained embeddings -> LSTM -> dense softmax
# over the vocabulary.
model = Sequential([
    # Explicit input spec: one row of (max_length - 1) context token ids.
    # This replaces the Embedding `input_length` argument, which is deprecated in
    # Keras 3, and lets the model be built before training so that
    # model.summary() can report layer shapes and parameter counts.
    tf.keras.Input(shape=(max_length - 1,)),
    # Embedding weights come from embedding_matrix and are not updated by training.
    Embedding(total_words, embedding_dim, weights=[embedding_matrix], trainable=False),
    LSTM(128),
    Dense(64, activation='relu'),
    # One probability per vocabulary id.
    Dense(total_words, activation='softmax')
])

In [69]:
# Configure training: Adam optimizer, cross-entropy against the one-hot targets,
# and accuracy reported per epoch.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [70]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

# Training

In [71]:
# Train on the full corpus for 100 epochs. The returned History object is kept in
# `history` (per-epoch loss/accuracy live in history.history) instead of being
# discarded, which also keeps its bare repr out of the cell output.
history = model.fit(X, y, epochs=100)

Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.0108 - loss: 5.0864
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0732 - loss: 5.0382 
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0679 - loss: 4.9841 
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0952 - loss: 4.8887 
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0834 - loss: 4.8103 
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0967 - loss: 4.6700 
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1192 - loss: 4.5327 
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1123 - loss: 4.3968 
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x7acaaa3593d0>

# Predicting top 3 words

In [56]:
def predict_next_words(text, num_predictions=3):
    """Return the most likely next words for ``text``, best first.

    The text is tokenized with the notebook's fitted ``tokenizer``, left-padded
    to the model's input length (``max_length - 1``), and scored by ``model``.

    Args:
        text: Context phrase to continue.
        num_predictions: Maximum number of words to return. Values <= 0 yield
            an empty list.

    Returns:
        A list of up to ``num_predictions`` words ordered by descending
        predicted probability. Output ids that have no word in the tokenizer
        (such as the padding id 0) are skipped and do not count toward the
        limit, so the list is shorter than requested only when the vocabulary
        itself is smaller.
    """
    # A slice like [-0:] would select the *entire* array, so handle this explicitly.
    if num_predictions <= 0:
        return []

    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_length-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]

    # Reverse mapping (id -> word) maintained by the tokenizer; avoids scanning
    # word_index once per predicted id.
    index_to_word = tokenizer.index_word

    predicted_words = []
    # Walk ids from most to least probable (same ordering as taking the last k
    # entries of the ascending argsort and reversing them).
    for idx in np.argsort(predicted_probs)[::-1]:
        word = index_to_word.get(int(idx))
        if word is None:
            # No word for this id (e.g. padding id 0): skip without using a slot.
            continue
        predicted_words.append(word)
        if len(predicted_words) == num_predictions:
            break

    return predicted_words

In [72]:
# Read a phrase interactively and show the model's suggestions for the next word.
phrase = input('Enter Word: ')
predictions = predict_next_words(phrase)
print(f"After '{phrase}', the model predicts: {predictions}")

Enter Word: I love to
After 'I love to', the model predicts: ['eat', 'the', 'my']


In [73]:
# Read a phrase interactively and show the model's suggestions for the next word.
phrase = input('Enter Word: ')
predictions = predict_next_words(phrase)
print(f"After '{phrase}', the model predicts: {predictions}")

Enter Word: I want to play
After 'I want to play', the model predicts: ['a', 'the', 'call']


In [74]:
# Read a phrase interactively and show the model's suggestions for the next word.
phrase = input('Enter Word: ')
predictions = predict_next_words(phrase)
print(f"After '{phrase}', the model predicts: {predictions}")

Enter Word: I am
After 'I am', the model predicts: ['need', 'prefer', "can't"]
