In [31]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [32]:
corpus = [
    "Abhishek commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Abhishek himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.",
    "In 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Abhishek remained focused on his mission. Even after the acquisition, iNeuron continued to offer some of the most affordable and accessible tech courses in the world.",
    "deep learning is a branch of machine learning",
    "natural language processing is a field of AI",
    "AI is the future",
    "I enjoy teaching AI",
    "students love AI projects",
    "learning new things is exciting",
    "teaching AI is rewarding"
]

In [33]:
tokenizer = Tokenizer() 

tokenizer.fit_on_texts(corpus)

In [34]:
tokenizer.index_word

{1: 'a',
 2: 'the',
 3: 'is',
 4: 'ai',
 5: 'to',
 6: 'ineuron',
 7: 'in',
 8: 'of',
 9: 'abhishek',
 10: 'was',
 11: 'students',
 12: 'learning',
 13: 'affordable',
 14: 'his',
 15: 'mission',
 16: 'over',
 17: 'from',
 18: 'they',
 19: 'acquisition',
 20: 'teaching',
 21: 'commitment',
 22: 'education',
 23: "wasn't",
 24: 'just',
 25: 'business',
 26: 'strategy—it',
 27: "life's",
 28: 'years',
 29: 'has',
 30: 'helped',
 31: '1',
 32: '5',
 33: 'million',
 34: '34',
 35: 'countries',
 36: 'providing',
 37: 'them',
 38: 'with',
 39: 'skills',
 40: 'need',
 41: 'succeed',
 42: "today's",
 43: 'competitive',
 44: 'job',
 45: 'market',
 46: 'many',
 47: 'these',
 48: 'like',
 49: 'himself',
 50: 'came',
 51: 'disadvantaged',
 52: 'backgrounds',
 53: 'saw',
 54: 'as',
 55: 'lifeline—an',
 56: 'opportunity',
 57: 'rise',
 58: 'above',
 59: 'their',
 60: 'circumstances',
 61: '2022',
 62: 'acquired',
 63: 'by',
 64: 'physicswallah',
 65: 'deal',
 66: 'worth',
 67: '₹250',
 68: 'crore',
 6

In [35]:
tokenizer.word_index

{'a': 1,
 'the': 2,
 'is': 3,
 'ai': 4,
 'to': 5,
 'ineuron': 6,
 'in': 7,
 'of': 8,
 'abhishek': 9,
 'was': 10,
 'students': 11,
 'learning': 12,
 'affordable': 13,
 'his': 14,
 'mission': 15,
 'over': 16,
 'from': 17,
 'they': 18,
 'acquisition': 19,
 'teaching': 20,
 'commitment': 21,
 'education': 22,
 "wasn't": 23,
 'just': 24,
 'business': 25,
 'strategy—it': 26,
 "life's": 27,
 'years': 28,
 'has': 29,
 'helped': 30,
 '1': 31,
 '5': 32,
 'million': 33,
 '34': 34,
 'countries': 35,
 'providing': 36,
 'them': 37,
 'with': 38,
 'skills': 39,
 'need': 40,
 'succeed': 41,
 "today's": 42,
 'competitive': 43,
 'job': 44,
 'market': 45,
 'many': 46,
 'these': 47,
 'like': 48,
 'himself': 49,
 'came': 50,
 'disadvantaged': 51,
 'backgrounds': 52,
 'saw': 53,
 'as': 54,
 'lifeline—an': 55,
 'opportunity': 56,
 'rise': 57,
 'above': 58,
 'their': 59,
 'circumstances': 60,
 '2022': 61,
 'acquired': 62,
 'by': 63,
 'physicswallah': 64,
 'deal': 65,
 'worth': 66,
 '₹250': 67,
 'crore': 68,
 '

In [36]:
total_words = len(tokenizer.word_index)+1

In [37]:
input_sequence = [] 

# create n-grams for each sentence 
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    for i in  range(1,len(token_list)):
        ngram_seq = token_list[:i+1]
        print(ngram_seq)
        input_sequence.append(ngram_seq)
        

[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16, 2, 28, 6, 29, 30, 16, 31, 32, 33, 11, 17, 34, 35, 36, 37, 38, 2, 39, 18, 40, 5, 41, 7, 42, 43, 44, 45, 46, 8, 47, 11, 48, 9, 49, 50, 17, 51, 52, 18, 53, 6, 54, 1, 55, 56, 5, 57, 58, 59, 60]
[9, 21]
[9, 21, 5]
[9, 21, 5, 13]
[9, 21, 5, 13, 22]
[9, 21, 5, 13, 22, 23]
[9, 21, 5, 13, 22, 23, 24]
[9, 21, 5, 13, 22, 23, 24, 1]
[9, 21, 5, 13, 22, 23, 24, 1, 25]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16, 2]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16, 2, 28]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16, 2, 28, 6]
[9, 21, 5, 13, 22, 23, 24, 1, 25, 26, 10, 14, 27, 15, 16, 2, 28, 6, 29]
[9, 21, 5, 13, 22, 23, 24, 1, 2

In [55]:
input_sequence

array([[  0,   0,   0, ...,   0,   9,  21],
       [  0,   0,   0, ...,   9,  21,   5],
       [  0,   0,   0, ...,  21,   5,  13],
       ...,
       [  0,   0,   0, ...,   0,  20,   4],
       [  0,   0,   0, ...,  20,   4,   3],
       [  0,   0,   0, ...,   4,   3, 102]], dtype=int32)

In [56]:
input_sequence[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 21],
      dtype=int32)

In [61]:
max_seq_len = max(len(i) for i in input_sequence)
  = pad_sequences(input_sequence,maxlen=max_seq_len , padding='pre')

In [62]:
input_sequence

array([[  0,   0,   0, ...,   0,   9,  21],
       [  0,   0,   0, ...,   9,  21,   5],
       [  0,   0,   0, ...,  21,   5,  13],
       ...,
       [  0,   0,   0, ...,   0,  20,   4],
       [  0,   0,   0, ...,  20,   4,   3],
       [  0,   0,   0, ...,   4,   3, 102]], dtype=int32)

In [57]:
x = input_sequence[: , :-1]  # remove last column

In [65]:
x

array([[ 0,  0,  0, ...,  0,  0,  9],
       [ 0,  0,  0, ...,  0,  9, 21],
       [ 0,  0,  0, ...,  9, 21,  5],
       ...,
       [ 0,  0,  0, ...,  0,  0, 20],
       [ 0,  0,  0, ...,  0, 20,  4],
       [ 0,  0,  0, ..., 20,  4,  3]], dtype=int32)

In [59]:
y = input_sequence[: , -1]  # remove first column

In [60]:
y

array([ 21,   5,  13,  22,  23,  24,   1,  25,  26,  10,  14,  27,  15,
        16,   2,  28,   6,  29,  30,  16,  31,  32,  33,  11,  17,  34,
        35,  36,  37,  38,   2,  39,  18,  40,   5,  41,   7,  42,  43,
        44,  45,  46,   8,  47,  11,  48,   9,  49,  50,  17,  51,  52,
        18,  53,   6,  54,   1,  55,  56,   5,  57,  58,  59,  60,  61,
         6,  10,  62,  63,  64,   7,   1,  65,  66,  67,  68,  69,  70,
        19,  10,   1,  71,  72,   9,  73,  74,  75,  14,  15,  76,  77,
         2,  19,   6,  78,   5,  79,  80,   8,   2,  81,  13,  82,  83,
        84,  85,   7,   2,  86,  12,   3,   1,  88,   8,  89,  12,  91,
        92,   3,   1,  93,   8,   4,   3,   2,  94,  96,  20,   4,  97,
         4,  98,  99, 100,   3, 101,   4,   3, 102], dtype=int32)

In [45]:
y  = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [66]:
y

array([ 21,   5,  13,  22,  23,  24,   1,  25,  26,  10,  14,  27,  15,
        16,   2,  28,   6,  29,  30,  16,  31,  32,  33,  11,  17,  34,
        35,  36,  37,  38,   2,  39,  18,  40,   5,  41,   7,  42,  43,
        44,  45,  46,   8,  47,  11,  48,   9,  49,  50,  17,  51,  52,
        18,  53,   6,  54,   1,  55,  56,   5,  57,  58,  59,  60,  61,
         6,  10,  62,  63,  64,   7,   1,  65,  66,  67,  68,  69,  70,
        19,  10,   1,  71,  72,   9,  73,  74,  75,  14,  15,  76,  77,
         2,  19,   6,  78,   5,  79,  80,   8,   2,  81,  13,  82,  83,
        84,  85,   7,   2,  86,  12,   3,   1,  88,   8,  89,  12,  91,
        92,   3,   1,  93,   8,   4,   3,   2,  94,  96,  20,   4,  97,
         4,  98,  99, 100,   3, 101,   4,   3, 102], dtype=int32)

## building the model

In [67]:
model = Sequential([
    Embedding(total_words,32),   # embedding layer
    LSTM(64),    # LSTM layer
    Dense(total_words,activation='softmax')   # Dense layer and to do multiclassification we uss softmax
])

In [48]:
total_words

103

In [49]:
model.compile(loss = 'categorical_crossentropy' , optimizer='adam' , metrics=['accuracy'])

In [50]:
model.fit(x,y,epochs=400,verbose=1)

Epoch 1/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 39ms/step - accuracy: 0.0054 - loss: 4.6350 
Epoch 2/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.0552 - loss: 4.6274
Epoch 3/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0545 - loss: 4.6189
Epoch 4/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.0406 - loss: 4.6028
Epoch 5/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.0333 - loss: 4.5703  
Epoch 6/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0356 - loss: 4.5060
Epoch 7/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.0456 - loss: 4.4982
Epoch 8/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.0444 - loss: 4.4237  
Epoch 9/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x14620c460>

In [51]:
tokenizer.word_index

{'a': 1,
 'the': 2,
 'is': 3,
 'ai': 4,
 'to': 5,
 'ineuron': 6,
 'in': 7,
 'of': 8,
 'abhishek': 9,
 'was': 10,
 'students': 11,
 'learning': 12,
 'affordable': 13,
 'his': 14,
 'mission': 15,
 'over': 16,
 'from': 17,
 'they': 18,
 'acquisition': 19,
 'teaching': 20,
 'commitment': 21,
 'education': 22,
 "wasn't": 23,
 'just': 24,
 'business': 25,
 'strategy—it': 26,
 "life's": 27,
 'years': 28,
 'has': 29,
 'helped': 30,
 '1': 31,
 '5': 32,
 'million': 33,
 '34': 34,
 'countries': 35,
 'providing': 36,
 'them': 37,
 'with': 38,
 'skills': 39,
 'need': 40,
 'succeed': 41,
 "today's": 42,
 'competitive': 43,
 'job': 44,
 'market': 45,
 'many': 46,
 'these': 47,
 'like': 48,
 'himself': 49,
 'came': 50,
 'disadvantaged': 51,
 'backgrounds': 52,
 'saw': 53,
 'as': 54,
 'lifeline—an': 55,
 'opportunity': 56,
 'rise': 57,
 'above': 58,
 'their': 59,
 'circumstances': 60,
 '2022': 61,
 'acquired': 62,
 'by': 63,
 'physicswallah': 64,
 'deal': 65,
 'worth': 66,
 '₹250': 67,
 'crore': 68,
 '

In [52]:
def predict_next_word(seed_text, num_words=5):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        next_word_index = np.argmax(predicted)
        for word, index in tokenizer.word_index.items():
            if index == next_word_index:
                seed_text += ' ' + word
                break
    return seed_text


In [53]:
predict_next_word("natural language is  ")

'natural language is   a field of ai is'