In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

2025-05-30 17:14:14.213563: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
corpus = [
    "Abhishek commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Abhishek himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.",
    "In 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Abhishek remained focused on his mission. Even after the acquisition, iNeuron continued to offer some of the most affordable and accessible tech courses in the world.",
    "deep learning is a branch of machine learning",
    "natural language processing is a field of AI",
    "AI is the future",
    "I enjoy teaching AI",
    "students love AI projects",
    "learning new things is exciting",
    "learning new things is fun",
    "learning new things is rewarding",
    "learning new things is enjoyable",
    "teaching AI is rewarding",
    "learning new things is enjoyable",
]


In [6]:
tokenizer = Tokenizer() 

tokenizer.fit_on_texts(corpus)

In [7]:
tokenizer.index_word

{1: 'is',
 2: 'learning',
 3: 'a',
 4: 'the',
 5: 'ai',
 6: 'new',
 7: 'things',
 8: 'to',
 9: 'ineuron',
 10: 'in',
 11: 'of',
 12: 'abhishek',
 13: 'was',
 14: 'students',
 15: 'affordable',
 16: 'his',
 17: 'mission',
 18: 'over',
 19: 'from',
 20: 'they',
 21: 'acquisition',
 22: 'teaching',
 23: 'rewarding',
 24: 'enjoyable',
 25: 'commitment',
 26: 'education',
 27: "wasn't",
 28: 'just',
 29: 'business',
 30: 'strategy—it',
 31: "life's",
 32: 'years',
 33: 'has',
 34: 'helped',
 35: '1',
 36: '5',
 37: 'million',
 38: '34',
 39: 'countries',
 40: 'providing',
 41: 'them',
 42: 'with',
 43: 'skills',
 44: 'need',
 45: 'succeed',
 46: "today's",
 47: 'competitive',
 48: 'job',
 49: 'market',
 50: 'many',
 51: 'these',
 52: 'like',
 53: 'himself',
 54: 'came',
 55: 'disadvantaged',
 56: 'backgrounds',
 57: 'saw',
 58: 'as',
 59: 'lifeline—an',
 60: 'opportunity',
 61: 'rise',
 62: 'above',
 63: 'their',
 64: 'circumstances',
 65: '2022',
 66: 'acquired',
 67: 'by',
 68: 'physicswa

In [8]:
tokenizer.word_index

{'is': 1,
 'learning': 2,
 'a': 3,
 'the': 4,
 'ai': 5,
 'new': 6,
 'things': 7,
 'to': 8,
 'ineuron': 9,
 'in': 10,
 'of': 11,
 'abhishek': 12,
 'was': 13,
 'students': 14,
 'affordable': 15,
 'his': 16,
 'mission': 17,
 'over': 18,
 'from': 19,
 'they': 20,
 'acquisition': 21,
 'teaching': 22,
 'rewarding': 23,
 'enjoyable': 24,
 'commitment': 25,
 'education': 26,
 "wasn't": 27,
 'just': 28,
 'business': 29,
 'strategy—it': 30,
 "life's": 31,
 'years': 32,
 'has': 33,
 'helped': 34,
 '1': 35,
 '5': 36,
 'million': 37,
 '34': 38,
 'countries': 39,
 'providing': 40,
 'them': 41,
 'with': 42,
 'skills': 43,
 'need': 44,
 'succeed': 45,
 "today's": 46,
 'competitive': 47,
 'job': 48,
 'market': 49,
 'many': 50,
 'these': 51,
 'like': 52,
 'himself': 53,
 'came': 54,
 'disadvantaged': 55,
 'backgrounds': 56,
 'saw': 57,
 'as': 58,
 'lifeline—an': 59,
 'opportunity': 60,
 'rise': 61,
 'above': 62,
 'their': 63,
 'circumstances': 64,
 '2022': 65,
 'acquired': 66,
 'by': 67,
 'physicswallah

In [9]:
total_words = len(tokenizer.word_index)+1

In [10]:
input_sequence = [] 

# create n-grams for each sentence 
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    for i in  range(1,len(token_list)):
        ngram_seq = token_list[:i+1]
        print(ngram_seq)
        input_sequence.append(ngram_seq)
        

[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9, 33, 34, 18, 35, 36, 37, 14, 19, 38, 39, 40, 41, 42, 4, 43, 20, 44, 8, 45, 10, 46, 47, 48, 49, 50, 11, 51, 14, 52, 12, 53, 54, 19, 55, 56, 20, 57, 9, 58, 3, 59, 60, 8, 61, 62, 63, 64]
[12, 25]
[12, 25, 8]
[12, 25, 8, 15]
[12, 25, 8, 15, 26]
[12, 25, 8, 15, 26, 27]
[12, 25, 8, 15, 26, 27, 28]
[12, 25, 8, 15, 26, 27, 28, 3]
[12, 25, 8, 15, 26, 27, 28, 3, 29]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9]
[12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9, 33]
[12, 25, 

In [11]:
input_sequence

[[12, 25],
 [12, 25, 8],
 [12, 25, 8, 15],
 [12, 25, 8, 15, 26],
 [12, 25, 8, 15, 26, 27],
 [12, 25, 8, 15, 26, 27, 28],
 [12, 25, 8, 15, 26, 27, 28, 3],
 [12, 25, 8, 15, 26, 27, 28, 3, 29],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9, 33],
 [12, 25, 8, 15, 26, 27, 28, 3, 29, 30, 13, 16, 31, 17, 18, 4, 32, 9, 33, 34],
 [12,
  25,
  8,
  15,
  26,
  27,
  28,
  3,
  29,
  30,
  13,
  16,
  31,
  17,
  18,
  4,
  32,
  9,
  33,
  34,
  18],
 [12,
  25,
  8,
  15,


In [12]:
input_sequence[0]

[12, 25]

In [13]:
max_seq_len = max(len(i) for i in input_sequence)
input_sequence = pad_sequences(input_sequence,maxlen=max_seq_len , padding='pre')

In [14]:
input_sequence

array([[ 0,  0,  0, ...,  0, 12, 25],
       [ 0,  0,  0, ..., 12, 25,  8],
       [ 0,  0,  0, ..., 25,  8, 15],
       ...,
       [ 0,  0,  0, ...,  2,  6,  7],
       [ 0,  0,  0, ...,  6,  7,  1],
       [ 0,  0,  0, ...,  7,  1, 24]], dtype=int32)

In [15]:
x = input_sequence[: , :-1]  # remove last column

In [16]:
x

array([[ 0,  0,  0, ...,  0,  0, 12],
       [ 0,  0,  0, ...,  0, 12, 25],
       [ 0,  0,  0, ..., 12, 25,  8],
       ...,
       [ 0,  0,  0, ...,  0,  2,  6],
       [ 0,  0,  0, ...,  2,  6,  7],
       [ 0,  0,  0, ...,  6,  7,  1]], dtype=int32)

In [17]:
y = input_sequence[: , -1]  # remove first column

In [18]:
y

array([ 25,   8,  15,  26,  27,  28,   3,  29,  30,  13,  16,  31,  17,
        18,   4,  32,   9,  33,  34,  18,  35,  36,  37,  14,  19,  38,
        39,  40,  41,  42,   4,  43,  20,  44,   8,  45,  10,  46,  47,
        48,  49,  50,  11,  51,  14,  52,  12,  53,  54,  19,  55,  56,
        20,  57,   9,  58,   3,  59,  60,   8,  61,  62,  63,  64,  65,
         9,  13,  66,  67,  68,  10,   3,  69,  70,  71,  72,  73,  74,
        21,  13,   3,  75,  76,  12,  77,  78,  79,  16,  17,  80,  81,
         4,  21,   9,  82,   8,  83,  84,  11,   4,  85,  15,  86,  87,
        88,  89,  10,   4,  90,   2,   1,   3,  92,  11,  93,   2,  95,
        96,   1,   3,  97,  11,   5,   1,   4,  98, 100,  22,   5, 101,
         5, 102,   6,   7,   1, 103,   6,   7,   1, 104,   6,   7,   1,
        23,   6,   7,   1,  24,   5,   1,  23,   6,   7,   1,  24],
      dtype=int32)

In [19]:
y  = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [20]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## building the model

In [21]:
model = Sequential([
    Embedding(total_words,32),   # embedding layer
    LSTM(64),    # LSTM layer
    Dense(total_words,activation='softmax')   # Dense layer and to do multiclassification we uss softmax
])

In [22]:
total_words

105

In [23]:
model.compile(loss = 'categorical_crossentropy' , optimizer='adam' , metrics=['accuracy'])

In [24]:
model.fit(x,y,epochs=400,verbose=1)

Epoch 1/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 0.0095 - loss: 4.6547 
Epoch 2/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0684 - loss: 4.6458
Epoch 3/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0675 - loss: 4.6361
Epoch 4/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.0593 - loss: 4.6094
Epoch 5/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0554 - loss: 4.5244
Epoch 6/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.0532 - loss: 4.4469
Epoch 7/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.0380 - loss: 4.4159  
Epoch 8/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0640 - loss: 4.3277
Epoch 9/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x14f897be0>

In [25]:
tokenizer.word_index

{'is': 1,
 'learning': 2,
 'a': 3,
 'the': 4,
 'ai': 5,
 'new': 6,
 'things': 7,
 'to': 8,
 'ineuron': 9,
 'in': 10,
 'of': 11,
 'abhishek': 12,
 'was': 13,
 'students': 14,
 'affordable': 15,
 'his': 16,
 'mission': 17,
 'over': 18,
 'from': 19,
 'they': 20,
 'acquisition': 21,
 'teaching': 22,
 'rewarding': 23,
 'enjoyable': 24,
 'commitment': 25,
 'education': 26,
 "wasn't": 27,
 'just': 28,
 'business': 29,
 'strategy—it': 30,
 "life's": 31,
 'years': 32,
 'has': 33,
 'helped': 34,
 '1': 35,
 '5': 36,
 'million': 37,
 '34': 38,
 'countries': 39,
 'providing': 40,
 'them': 41,
 'with': 42,
 'skills': 43,
 'need': 44,
 'succeed': 45,
 "today's": 46,
 'competitive': 47,
 'job': 48,
 'market': 49,
 'many': 50,
 'these': 51,
 'like': 52,
 'himself': 53,
 'came': 54,
 'disadvantaged': 55,
 'backgrounds': 56,
 'saw': 57,
 'as': 58,
 'lifeline—an': 59,
 'opportunity': 60,
 'rise': 61,
 'above': 62,
 'their': 63,
 'circumstances': 64,
 '2022': 65,
 'acquired': 66,
 'by': 67,
 'physicswallah

In [26]:
def predict_next_word(seed_text, num_words=5):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        next_word_index = np.argmax(predicted)
        for word, index in tokenizer.word_index.items():
            if index == next_word_index:
                seed_text += ' ' + word
                break
    return seed_text


In [27]:
predict_next_word("natural language is  ")


'natural language is   a field of ai projects'

In [28]:
predict_next_word("Abhishek commitment  ")


"Abhishek commitment   to affordable education wasn't just"