In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Training corpus: each list element is one text the tokenizer is fitted on
# and from which the next-word training prefixes are generated.
# NOTE(review): this cell has no execution count (In [None]) and the saved
# outputs further down (e.g. a 25-word vocabulary containing 'fun') do not
# match these texts — restart the kernel and run all cells to refresh them.
corpus = [
    "Sudhanshu's commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Sudhanshu himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.",
    "In 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Sudhanshu remained focused on his mission. Even after the acquisition, iNeuron continued to offer some of the most affordable and accessible tech courses in the world.",
    "deep learning is a branch of machine learning",
    "natural language processing is a field of AI",
    "AI is the future",
    "I enjoy teaching AI",
    "students love AI projects",
    "learning new things is exciting",
    "teaching AI is rewarding"
]

In [3]:
# Fit a word-level tokenizer on the corpus; its word_index / index_word
# dictionaries are the vocabulary used by every later cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=corpus)

In [4]:
# Inspect the id -> word lookup table built by the fitted tokenizer.
tokenizer.index_word

{1: 'is',
 2: 'learning',
 3: 'ai',
 4: 'machine',
 5: 'i',
 6: 'love',
 7: 'a',
 8: 'of',
 9: 'teaching',
 10: 'fun',
 11: 'deep',
 12: 'branch',
 13: 'natural',
 14: 'language',
 15: 'processing',
 16: 'field',
 17: 'the',
 18: 'future',
 19: 'enjoy',
 20: 'students',
 21: 'projects',
 22: 'new',
 23: 'things',
 24: 'exciting',
 25: 'rewarding'}

In [5]:
# Vocabulary size for the model: word ids start at 1, so one extra slot is
# added for id 0, which the padded sequences use as filler.
total_words = 1 + len(tokenizer.word_index)

In [6]:
# Inspect the word -> id mapping (the inverse of index_word).
tokenizer.word_index

{'is': 1,
 'learning': 2,
 'ai': 3,
 'machine': 4,
 'i': 5,
 'love': 6,
 'a': 7,
 'of': 8,
 'teaching': 9,
 'fun': 10,
 'deep': 11,
 'branch': 12,
 'natural': 13,
 'language': 14,
 'processing': 15,
 'field': 16,
 'the': 17,
 'future': 18,
 'enjoy': 19,
 'students': 20,
 'projects': 21,
 'new': 22,
 'things': 23,
 'exciting': 24,
 'rewarding': 25}

In [11]:
# Build the training examples: for every text, collect each prefix of its
# token ids that has at least two tokens (context words + the word to predict).
input_sequence = []
for sentence in corpus:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    print(encoded)  # full id sequence of the current text
    for end in range(2, len(encoded) + 1):
        prefix = encoded[:end]
        print(prefix)  # the prefix that is being added as a training example
        input_sequence.append(prefix)
        

[5, 6, 4, 2]
[5, 6]
[5, 6, 4]
[5, 6, 4, 2]
[4, 2, 1, 10]
[4, 2]
[4, 2, 1]
[4, 2, 1, 10]
[11, 2, 1, 7, 12, 8, 4, 2]
[11, 2]
[11, 2, 1]
[11, 2, 1, 7]
[11, 2, 1, 7, 12]
[11, 2, 1, 7, 12, 8]
[11, 2, 1, 7, 12, 8, 4]
[11, 2, 1, 7, 12, 8, 4, 2]
[13, 14, 15, 1, 7, 16, 8, 3]
[13, 14]
[13, 14, 15]
[13, 14, 15, 1]
[13, 14, 15, 1, 7]
[13, 14, 15, 1, 7, 16]
[13, 14, 15, 1, 7, 16, 8]
[13, 14, 15, 1, 7, 16, 8, 3]
[3, 1, 17, 18]
[3, 1]
[3, 1, 17]
[3, 1, 17, 18]
[5, 19, 9, 3]
[5, 19]
[5, 19, 9]
[5, 19, 9, 3]
[20, 6, 3, 21]
[20, 6]
[20, 6, 3]
[20, 6, 3, 21]
[2, 22, 23, 1, 24]
[2, 22]
[2, 22, 23]
[2, 22, 23, 1]
[2, 22, 23, 1, 24]
[9, 3, 1, 25]
[9, 3]
[9, 3, 1]
[9, 3, 1, 25]


In [10]:
# Show the collected prefix sequences (still ragged, before padding).
input_sequence

[[5, 6],
 [5, 6, 4],
 [5, 6, 4, 2],
 [4, 2],
 [4, 2, 1],
 [4, 2, 1, 10],
 [11, 2],
 [11, 2, 1],
 [11, 2, 1, 7],
 [11, 2, 1, 7, 12],
 [11, 2, 1, 7, 12, 8],
 [11, 2, 1, 7, 12, 8, 4],
 [11, 2, 1, 7, 12, 8, 4, 2],
 [13, 14],
 [13, 14, 15],
 [13, 14, 15, 1],
 [13, 14, 15, 1, 7],
 [13, 14, 15, 1, 7, 16],
 [13, 14, 15, 1, 7, 16, 8],
 [13, 14, 15, 1, 7, 16, 8, 3],
 [3, 1],
 [3, 1, 17],
 [3, 1, 17, 18],
 [5, 19],
 [5, 19, 9],
 [5, 19, 9, 3],
 [20, 6],
 [20, 6, 3],
 [20, 6, 3, 21],
 [2, 22],
 [2, 22, 23],
 [2, 22, 23, 1],
 [2, 22, 23, 1, 24],
 [9, 3],
 [9, 3, 1],
 [9, 3, 1, 25]]

In [18]:
# Left-pad every prefix with zeros up to the length of the longest prefix so
# the examples form a rectangular array whose last column is the target id.
max_seq_len = max(map(len, input_sequence))
input_sequence = pad_sequences(
    input_sequence,
    maxlen=max_seq_len,
    padding='pre',
)

In [21]:
# Show the padded array: zeros on the left, token ids right-aligned.
input_sequence

array([[ 0,  0,  0,  0,  0,  0,  5,  6],
       [ 0,  0,  0,  0,  0,  5,  6,  4],
       [ 0,  0,  0,  0,  5,  6,  4,  2],
       [ 0,  0,  0,  0,  0,  0,  4,  2],
       [ 0,  0,  0,  0,  0,  4,  2,  1],
       [ 0,  0,  0,  0,  4,  2,  1, 10],
       [ 0,  0,  0,  0,  0,  0, 11,  2],
       [ 0,  0,  0,  0,  0, 11,  2,  1],
       [ 0,  0,  0,  0, 11,  2,  1,  7],
       [ 0,  0,  0, 11,  2,  1,  7, 12],
       [ 0,  0, 11,  2,  1,  7, 12,  8],
       [ 0, 11,  2,  1,  7, 12,  8,  4],
       [11,  2,  1,  7, 12,  8,  4,  2],
       [ 0,  0,  0,  0,  0,  0, 13, 14],
       [ 0,  0,  0,  0,  0, 13, 14, 15],
       [ 0,  0,  0,  0, 13, 14, 15,  1],
       [ 0,  0,  0, 13, 14, 15,  1,  7],
       [ 0,  0, 13, 14, 15,  1,  7, 16],
       [ 0, 13, 14, 15,  1,  7, 16,  8],
       [13, 14, 15,  1,  7, 16,  8,  3],
       [ 0,  0,  0,  0,  0,  0,  3,  1],
       [ 0,  0,  0,  0,  0,  3,  1, 17],
       [ 0,  0,  0,  0,  3,  1, 17, 18],
       [ 0,  0,  0,  0,  0,  0,  5, 19],
       [ 0,  0, 

In [19]:
# Model inputs: every column except the last, i.e. the (padded) context words.
x = input_sequence[: , :-1]

In [20]:
# Show the context matrix fed to the model.
x

array([[ 0,  0,  0,  0,  0,  0,  5],
       [ 0,  0,  0,  0,  0,  5,  6],
       [ 0,  0,  0,  0,  5,  6,  4],
       [ 0,  0,  0,  0,  0,  0,  4],
       [ 0,  0,  0,  0,  0,  4,  2],
       [ 0,  0,  0,  0,  4,  2,  1],
       [ 0,  0,  0,  0,  0,  0, 11],
       [ 0,  0,  0,  0,  0, 11,  2],
       [ 0,  0,  0,  0, 11,  2,  1],
       [ 0,  0,  0, 11,  2,  1,  7],
       [ 0,  0, 11,  2,  1,  7, 12],
       [ 0, 11,  2,  1,  7, 12,  8],
       [11,  2,  1,  7, 12,  8,  4],
       [ 0,  0,  0,  0,  0,  0, 13],
       [ 0,  0,  0,  0,  0, 13, 14],
       [ 0,  0,  0,  0, 13, 14, 15],
       [ 0,  0,  0, 13, 14, 15,  1],
       [ 0,  0, 13, 14, 15,  1,  7],
       [ 0, 13, 14, 15,  1,  7, 16],
       [13, 14, 15,  1,  7, 16,  8],
       [ 0,  0,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  3,  1],
       [ 0,  0,  0,  0,  3,  1, 17],
       [ 0,  0,  0,  0,  0,  0,  5],
       [ 0,  0,  0,  0,  0,  5, 19],
       [ 0,  0,  0,  0,  5, 19,  9],
       [ 0,  0,  0,  0,  0,  0, 20],
 

In [22]:
# Targets: the last column, i.e. the id of the word that follows each context in x.
y = input_sequence[: , -1]

In [23]:
# Show the integer next-word ids (one per training example).
y

array([ 6,  4,  2,  2,  1, 10,  2,  1,  7, 12,  8,  4,  2, 14, 15,  1,  7,
       16,  8,  3,  1, 17, 18, 19,  9,  3,  6,  3, 21, 22, 23,  1, 24,  3,
        1, 25], dtype=int32)

In [24]:
# One-hot encode the next-word ids so they match the softmax output and the
# categorical_crossentropy loss used by the model.
# The ids are read from the last column of input_sequence rather than from
# `y` itself: `y = to_categorical(y)` is not idempotent, so re-running the cell
# would one-hot encode an already one-hot array and break the target shape.
y = tf.keras.utils.to_categorical(input_sequence[:, -1], num_classes=total_words)

In [25]:
# Show the one-hot targets: one row per example, total_words columns.
y

array([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 

In [34]:
# Seed the Python, NumPy and TensorFlow random generators before the model is
# created so weight initialisation and batch shuffling are repeatable; without
# a seed every run of the notebook trains a different model.
tf.keras.utils.set_random_seed(42)

# Next-word classifier: word ids -> 32-dimensional embeddings -> 64-unit LSTM
# -> softmax over the vocabulary (total_words classes, same width as the
# one-hot targets).
model = Sequential([
    Embedding(total_words, 32),
    LSTM(64),
    Dense(total_words, activation='softmax'),
])

In [35]:
# Show the vocabulary size used for the Embedding input and the Dense output.
total_words

26

In [36]:
# Multi-class setup: the targets are one-hot rows, hence categorical
# cross-entropy; accuracy is tracked only for monitoring.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [37]:
# Train on the prefix/next-word pairs for 400 epochs with per-epoch progress
# output; the History object returned by fit is the cell's displayed value.
model.fit(x=x, y=y, epochs=400, verbose=1)

Epoch 1/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0289 - loss: 3.2594
Epoch 2/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1343 - loss: 3.2536
Epoch 3/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2315 - loss: 3.2486
Epoch 4/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1817 - loss: 3.2444
Epoch 5/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1921 - loss: 3.2396
Epoch 6/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2188 - loss: 3.2354
Epoch 7/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2025 - loss: 3.2290
Epoch 8/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1736 - loss: 3.2219
Epoch 9/400
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x271b2f26f10>

In [38]:
# Show the word -> id mapping again for reference when reading predicted ids.
tokenizer.word_index

{'is': 1,
 'learning': 2,
 'ai': 3,
 'machine': 4,
 'i': 5,
 'love': 6,
 'a': 7,
 'of': 8,
 'teaching': 9,
 'fun': 10,
 'deep': 11,
 'branch': 12,
 'natural': 13,
 'language': 14,
 'processing': 15,
 'field': 16,
 'the': 17,
 'future': 18,
 'enjoy': 19,
 'students': 20,
 'projects': 21,
 'new': 22,
 'things': 23,
 'exciting': 24,
 'rewarding': 25}

In [45]:
def predict_next_word(seed_text, num_words=5):
    """Greedily extend ``seed_text`` with up to ``num_words`` predicted words.

    Each step converts the current text to token ids with the notebook-level
    ``tokenizer``, left-pads them to the model's input width
    (``max_seq_len - 1``), asks ``model`` for the next-word distribution and
    appends the highest-scoring word.

    Args:
        seed_text: Text to continue; it is returned unchanged at the start of
            the result.
        num_words: Maximum number of words to append (default 5).

    Returns:
        ``seed_text`` followed by the predicted words, each preceded by one
        space. Fewer than ``num_words`` words are appended when the model's
        top prediction is an id without a word (the padding id 0).
    """
    for _ in range(num_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')
        probabilities = model.predict(padded, verbose=0)
        # A single sequence is predicted, so row 0 holds its distribution.
        next_word_index = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the tokenizer's own id -> word dict; a direct lookup
        # replaces scanning every (word, id) pair of word_index on each step.
        word = tokenizer.index_word.get(next_word_index)
        if word is None:
            # No word for this id, so the text would stay unchanged and every
            # remaining step would repeat the same prediction: stop early.
            break
        seed_text += ' ' + word
    return seed_text


In [46]:
# Demo: continue a seed phrase using the default of 5 predicted words.
predict_next_word("natural language is  ")

'natural language is   a field field of ai'