## Next Word Prediction

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from sklearn.model_selection import train_test_split

In [3]:
# Training text for the next-word model. Each list element is handled as one
# document: the tokenizer is fitted on the whole list and the n-gram training
# sequences are later built from it one element at a time.
# NOTE: the em dashes (—) in the first paragraph are not split by the tokenizer,
# so 'strategy—it' and 'lifeline—an' end up as single vocabulary entries.
corpus = [
    "Sudhanshu's commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Sudhanshu himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.",
    "In 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Sudhanshu remained focused on his mission. Even after the acquisition, iNeuron continued to offer some of the most affordable and accessible tech courses in the world.",
    "deep learning is a branch of machine learning",
    "natural language processing is a field of AI",
    "AI is the future",
    "I enjoy teaching AI",
    "students love AI projects",
    "learning new things is exciting",
    "teaching AI is rewarding"
]

In [4]:
# Word-level tokenizer with default settings; judging by the vocabulary shown
# further down, it lower-cases the text and drops punctuation such as commas.
tokenizer = Tokenizer()

In [5]:
# Build the word <-> index vocabulary from the corpus.
tokenizer.fit_on_texts(corpus)

In [6]:
# Inspect the index -> word mapping (indices start at 1).
tokenizer.index_word

{1: 'a',
 2: 'the',
 3: 'is',
 4: 'ai',
 5: 'to',
 6: 'ineuron',
 7: 'in',
 8: 'of',
 9: 'was',
 10: 'students',
 11: 'learning',
 12: 'affordable',
 13: 'his',
 14: 'mission',
 15: 'over',
 16: 'from',
 17: 'they',
 18: 'sudhanshu',
 19: 'acquisition',
 20: 'teaching',
 21: "sudhanshu's",
 22: 'commitment',
 23: 'education',
 24: "wasn't",
 25: 'just',
 26: 'business',
 27: 'strategy—it',
 28: "life's",
 29: 'years',
 30: 'has',
 31: 'helped',
 32: '1',
 33: '5',
 34: 'million',
 35: '34',
 36: 'countries',
 37: 'providing',
 38: 'them',
 39: 'with',
 40: 'skills',
 41: 'need',
 42: 'succeed',
 43: "today's",
 44: 'competitive',
 45: 'job',
 46: 'market',
 47: 'many',
 48: 'these',
 49: 'like',
 50: 'himself',
 51: 'came',
 52: 'disadvantaged',
 53: 'backgrounds',
 54: 'saw',
 55: 'as',
 56: 'lifeline—an',
 57: 'opportunity',
 58: 'rise',
 59: 'above',
 60: 'their',
 61: 'circumstances',
 62: '2022',
 63: 'acquired',
 64: 'by',
 65: 'physicswallah',
 66: 'deal',
 67: 'worth',
 68: '₹2

In [8]:
# Vocabulary size plus one: word indices start at 1 and index 0 is what the
# padded sequences are filled with, so the embedding and the softmax output
# both need len(word_index) + 1 slots.
total_words = len(tokenizer.word_index) + 1

In [9]:
# Show the resulting vocabulary size (including the reserved index 0).
total_words

104

In [10]:
# Inspect the word -> index mapping (the inverse of index_word).
tokenizer.word_index

{'a': 1,
 'the': 2,
 'is': 3,
 'ai': 4,
 'to': 5,
 'ineuron': 6,
 'in': 7,
 'of': 8,
 'was': 9,
 'students': 10,
 'learning': 11,
 'affordable': 12,
 'his': 13,
 'mission': 14,
 'over': 15,
 'from': 16,
 'they': 17,
 'sudhanshu': 18,
 'acquisition': 19,
 'teaching': 20,
 "sudhanshu's": 21,
 'commitment': 22,
 'education': 23,
 "wasn't": 24,
 'just': 25,
 'business': 26,
 'strategy—it': 27,
 "life's": 28,
 'years': 29,
 'has': 30,
 'helped': 31,
 '1': 32,
 '5': 33,
 'million': 34,
 '34': 35,
 'countries': 36,
 'providing': 37,
 'them': 38,
 'with': 39,
 'skills': 40,
 'need': 41,
 'succeed': 42,
 "today's": 43,
 'competitive': 44,
 'job': 45,
 'market': 46,
 'many': 47,
 'these': 48,
 'like': 49,
 'himself': 50,
 'came': 51,
 'disadvantaged': 52,
 'backgrounds': 53,
 'saw': 54,
 'as': 55,
 'lifeline—an': 56,
 'opportunity': 57,
 'rise': 58,
 'above': 59,
 'their': 60,
 'circumstances': 61,
 '2022': 62,
 'acquired': 63,
 'by': 64,
 'physicswallah': 65,
 'deal': 66,
 'worth': 67,
 '₹250':

In [16]:
# Build the training examples: for every document, collect each prefix of its
# token ids with length >= 2. The last id of a prefix later becomes the target
# word and the ids before it become the context.
input_sequence = []
for document in corpus:
    encoded = tokenizer.texts_to_sequences([document])[0]
    print(encoded)
    # Prefix lengths run from 2 up to the full document length.
    for prefix_len in range(2, len(encoded) + 1):
        prefix = encoded[:prefix_len]
        print(prefix)
        input_sequence.append(prefix)

[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6, 30, 31, 15, 32, 33, 34, 10, 16, 35, 36, 37, 38, 39, 2, 40, 17, 41, 5, 42, 7, 43, 44, 45, 46, 47, 8, 48, 10, 49, 18, 50, 51, 16, 52, 53, 17, 54, 6, 55, 1, 56, 57, 5, 58, 59, 60, 61]
[21, 22]
[21, 22, 5]
[21, 22, 5, 12]
[21, 22, 5, 12, 23]
[21, 22, 5, 12, 23, 24]
[21, 22, 5, 12, 23, 24, 25]
[21, 22, 5, 12, 23, 24, 25, 1]
[21, 22, 5, 12, 23, 24, 25, 1, 26]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6]
[21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6, 30]
[21, 22, 5, 12, 23, 2

In [17]:
# Inspect the collected n-gram prefixes (displays the entire list).
input_sequence

[[21, 22],
 [21, 22, 5],
 [21, 22, 5, 12],
 [21, 22, 5, 12, 23],
 [21, 22, 5, 12, 23, 24],
 [21, 22, 5, 12, 23, 24, 25],
 [21, 22, 5, 12, 23, 24, 25, 1],
 [21, 22, 5, 12, 23, 24, 25, 1, 26],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6, 30],
 [21, 22, 5, 12, 23, 24, 25, 1, 26, 27, 9, 13, 28, 14, 15, 2, 29, 6, 30, 31],
 [21,
  22,
  5,
  12,
  23,
  24,
  25,
  1,
  26,
  27,
  9,
  13,
  28,
  14,
  15,
  2,
  29,
  6,
  30,
  31,
  15],
 [21,
  22,
  5,
  12,
  23,
  24,

In [19]:
# Left-pad every prefix with zeros up to the longest prefix length so that all
# rows have the same width and the target word is always the last column.
max_seq_len = max(map(len, input_sequence))
input_sequence = pad_sequences(
    input_sequence,
    maxlen=max_seq_len,
    padding='pre',
)


In [20]:
# Inspect the padded sequences (now a 2-D int32 array, zero-padded on the left).
input_sequence

array([[  0,   0,   0, ...,   0,  21,  22],
       [  0,   0,   0, ...,  21,  22,   5],
       [  0,   0,   0, ...,  22,   5,  12],
       ...,
       [  0,   0,   0, ...,   0,  20,   4],
       [  0,   0,   0, ...,  20,   4,   3],
       [  0,   0,   0, ...,   4,   3, 103]], dtype=int32)

In [21]:
# Model inputs: every column except the last one of each padded row (the context).
x = input_sequence[:,:-1]

In [22]:
# Inspect the context matrix.
x

array([[ 0,  0,  0, ...,  0,  0, 21],
       [ 0,  0,  0, ...,  0, 21, 22],
       [ 0,  0,  0, ..., 21, 22,  5],
       ...,
       [ 0,  0,  0, ...,  0,  0, 20],
       [ 0,  0,  0, ...,  0, 20,  4],
       [ 0,  0,  0, ..., 20,  4,  3]], dtype=int32)

In [23]:
# Targets: the last token id of each padded row (the word to predict).
y = input_sequence[:,-1]


In [24]:
# Inspect the integer target ids.
y

array([ 22,   5,  12,  23,  24,  25,   1,  26,  27,   9,  13,  28,  14,
        15,   2,  29,   6,  30,  31,  15,  32,  33,  34,  10,  16,  35,
        36,  37,  38,  39,   2,  40,  17,  41,   5,  42,   7,  43,  44,
        45,  46,  47,   8,  48,  10,  49,  18,  50,  51,  16,  52,  53,
        17,  54,   6,  55,   1,  56,  57,   5,  58,  59,  60,  61,  62,
         6,   9,  63,  64,  65,   7,   1,  66,  67,  68,  69,  70,  71,
        19,   9,   1,  72,  73,  18,  74,  75,  76,  13,  14,  77,  78,
         2,  19,   6,  79,   5,  80,  81,   8,   2,  82,  12,  83,  84,
        85,  86,   7,   2,  87,  11,   3,   1,  89,   8,  90,  11,  92,
        93,   3,   1,  94,   8,   4,   3,   2,  95,  97,  20,   4,  98,
         4,  99, 100, 101,   3, 102,   4,   3, 103], dtype=int32)

In [25]:
# One-hot encode the targets to match the softmax output / categorical_crossentropy.
# The targets are taken from input_sequence's last column rather than from `y`
# itself, so re-running this cell stays idempotent: feeding an already one-hot
# `y` back into to_categorical would silently produce a 3-D array.
y = tf.keras.utils.to_categorical(input_sequence[:, -1], num_classes=total_words)

In [26]:
# Inspect the one-hot encoded targets.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
# Next-word classifier, assembled layer by layer:
#   token ids -> 32-dimensional embeddings -> 64-unit LSTM -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 32))
model.add(LSTM(64))
model.add(Dense(total_words, activation='softmax'))

In [28]:
# Re-display the vocabulary size used for the embedding input and the output layer.
total_words

104

In [29]:
# categorical_crossentropy pairs with the one-hot targets built by to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
# Train on all samples; no validation data is passed, so the accuracy logged per
# epoch is training accuracy only. No random seed is set anywhere above, so the
# exact numbers will differ between runs.
model.fit(x,y, epochs=400, verbose=1)

Epoch 1/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.0237 - loss: 4.6446
Epoch 2/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0563 - loss: 4.6367
Epoch 3/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0578 - loss: 4.6284
Epoch 4/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0522 - loss: 4.6115
Epoch 5/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0624 - loss: 4.5550
Epoch 6/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0315 - loss: 4.5102
Epoch 7/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0491 - loss: 4.4222    
Epoch 8/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0704 - loss: 4.3341
Epoch 9/400
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x26d7f07eb40>

In [31]:
# Re-display the word -> index mapping for reference before decoding predictions.
tokenizer.word_index

{'a': 1,
 'the': 2,
 'is': 3,
 'ai': 4,
 'to': 5,
 'ineuron': 6,
 'in': 7,
 'of': 8,
 'was': 9,
 'students': 10,
 'learning': 11,
 'affordable': 12,
 'his': 13,
 'mission': 14,
 'over': 15,
 'from': 16,
 'they': 17,
 'sudhanshu': 18,
 'acquisition': 19,
 'teaching': 20,
 "sudhanshu's": 21,
 'commitment': 22,
 'education': 23,
 "wasn't": 24,
 'just': 25,
 'business': 26,
 'strategy—it': 27,
 "life's": 28,
 'years': 29,
 'has': 30,
 'helped': 31,
 '1': 32,
 '5': 33,
 'million': 34,
 '34': 35,
 'countries': 36,
 'providing': 37,
 'them': 38,
 'with': 39,
 'skills': 40,
 'need': 41,
 'succeed': 42,
 "today's": 43,
 'competitive': 44,
 'job': 45,
 'market': 46,
 'many': 47,
 'these': 48,
 'like': 49,
 'himself': 50,
 'came': 51,
 'disadvantaged': 52,
 'backgrounds': 53,
 'saw': 54,
 'as': 55,
 'lifeline—an': 56,
 'opportunity': 57,
 'rise': 58,
 'above': 59,
 'their': 60,
 'circumstances': 61,
 '2022': 62,
 'acquired': 63,
 'by': 64,
 'physicswallah': 65,
 'deal': 66,
 'worth': 67,
 '₹250':

In [None]:
def predict_next_word(seed_text, num_words = 5):
    """Greedily extend ``seed_text`` with up to ``num_words`` predicted words.

    On every step the current text is tokenized, left-padded to the model's
    input width (``max_seq_len - 1``), and the word with the highest predicted
    probability is appended, separated by a single space.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``max_seq_len``.

    Args:
        seed_text: Text to continue. Words unknown to the tokenizer are
            dropped when the text is encoded.
        num_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words. Fewer than
        ``num_words`` words are appended when the model predicts an index
        with no vocabulary entry (for example the padding index 0).
    """
    # The model was trained on rows that are one token shorter than the padded
    # sequences, because the last column was split off as the target.
    context_len = max_seq_len - 1
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # `predicted` holds one row per input sequence; only one was passed.
        next_word_index = int(np.argmax(predicted[0]))
        # Direct index -> word lookup instead of scanning word_index each step.
        word = tokenizer.index_word.get(next_word_index)
        if word is None:
            # Nothing would be appended, so the next step would see the same
            # text and repeat the same prediction; stop instead.
            break
        seed_text += ' ' + word
    return seed_text

In [37]:
# The seed's trailing space is kept verbatim and each predicted word is joined
# with another space, hence the double space after "is" in the result.
predict_next_word("natural language is ")

'natural language is  a field of ai is'