In [1]:
import numpy as np
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping



In [2]:
file = open('ai_forproject.txt', 'r')
text = file.read()
text



In [3]:
def cleaned_text(text):
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub(r'[^a-z A-Z\.,()0-9]', '', text)
    return text

data = cleaned_text(text).split(".")
data

['Artificial Intelligence (AI) and Machine Learning (ML) are two cuttingedge fields in computer science that have the potential to revolutionize the way we live, work, and interact with technology',
 ' AI refers to the development of computer systems that can perform tasks that typically require human intelligence, such as understanding natural language, recognizing patterns, making decisions, and learning from experience',
 ' ML, on the other hand, is a subset of AI that focuses on the development of algorithms and models that allow computers to learn and improve from data without being explicitly programmed',
 '  These two fields have seen remarkable advancements in recent years, thanks to the availability of vast amounts of data and the increasing processing power of computers',
 ' As a result, AI and ML are being integrated into various applications and industries, from healthcare and finance to transportation and entertainment',
 ' In this text, well explore the fundamental concep

In [4]:
tokenizer = Tokenizer(oov_token = '#OOV')
tokenizer.fit_on_texts(data)

In [5]:
tokenizer.word_index

{'#OOV': 1,
 'and': 2,
 'in': 3,
 'ai': 4,
 'the': 5,
 'to': 6,
 'of': 7,
 'data': 8,
 'learning': 9,
 'is': 10,
 'machine': 11,
 'for': 12,
 'from': 13,
 'a': 14,
 'are': 15,
 'can': 16,
 'on': 17,
 'with': 18,
 'uses': 19,
 'models': 20,
 'systems': 21,
 'language': 22,
 'more': 23,
 'by': 24,
 'these': 25,
 'like': 26,
 'management': 27,
 'autonomous': 28,
 'analysis': 29,
 'improving': 30,
 'health': 31,
 'that': 32,
 'as': 33,
 'their': 34,
 'content': 35,
 'human': 36,
 'algorithms': 37,
 'analyze': 38,
 'research': 39,
 'development': 40,
 'applications': 41,
 'identify': 42,
 'design': 43,
 'wildlife': 44,
 'optimizes': 45,
 'energy': 46,
 'natural': 47,
 'making': 48,
 'helps': 49,
 'monitor': 50,
 'analyzes': 51,
 'potential': 52,
 'used': 53,
 'analyzing': 54,
 'safety': 55,
 'ml': 56,
 'improve': 57,
 'use': 58,
 'user': 59,
 'personalized': 60,
 'voice': 61,
 'efficiency': 62,
 'have': 63,
 'patterns': 64,
 'text': 65,
 'or': 66,
 'it': 67,
 'complex': 68,
 'insights': 69,

In [6]:
num_sentences = len(data)
num_words = len(tokenizer.word_index)
print("Total no.of sentences : ",num_sentences)
print("Total no.of words     : ",num_words)

Total no.of sentences :  285
Total no.of words     :  1413


In [7]:
tokenizer.texts_to_sequences([data[0]])

[[274,
  139,
  4,
  2,
  11,
  9,
  56,
  15,
  275,
  645,
  206,
  3,
  207,
  646,
  32,
  63,
  5,
  52,
  6,
  276,
  5,
  208,
  387,
  647,
  648,
  2,
  277,
  18,
  86]]

In [8]:
inputs = []
for sequence in data:
    sequences = tokenizer.texts_to_sequences([sequence])[0]
    
    for i in range(1,len(sequences)):
        inputs.append(sequences[:i+1])

In [9]:
inputs

[[274, 139],
 [274, 139, 4],
 [274, 139, 4, 2],
 [274, 139, 4, 2, 11],
 [274, 139, 4, 2, 11, 9],
 [274, 139, 4, 2, 11, 9, 56],
 [274, 139, 4, 2, 11, 9, 56, 15],
 [274, 139, 4, 2, 11, 9, 56, 15, 275],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646, 32],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646, 32, 63],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646, 32, 63, 5],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646, 32, 63, 5, 52],
 [274, 139, 4, 2, 11, 9, 56, 15, 275, 645, 206, 3, 207, 646, 32, 63, 5, 52, 6],
 [274,
  139,
  4,
  2,
  11,
  9,
  56,
  15,
  275,
  645,
  206,
  3,
  207,
  646,
  32,
  63,
  5,
  52,
  6,
  276],
 [274,
  139,
  4,
  2,
  11,
  9,
  56,
  15,


In [10]:
max_len = max([len(x) for x in inputs])
max_len

32

In [11]:
padded_input_sequences = pad_sequences(inputs,maxlen=max_len,padding = 'pre')
padded_input_sequences

array([[  0,   0,   0, ...,   0, 274, 139],
       [  0,   0,   0, ..., 274, 139,   4],
       [  0,   0,   0, ..., 139,   4,   2],
       ...,
       [  0,   0,   0, ...,   2, 129,  85],
       [  0,   0,   0, ..., 129,  85, 340],
       [  0,   0,   0, ...,  85, 340,  64]])

In [12]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [13]:
X.shape

(4933, 31)

In [14]:
y = to_categorical(y, num_classes = num_words+1)

In [15]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
y.shape

(4933, 1414)

In [70]:
model = Sequential()
model.add(Embedding(num_words+1,100,input_length=max_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(num_words+1,activation = 'softmax'))

In [71]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 31, 100)           141400    
                                                                 
 bidirectional (Bidirection  (None, 300)               301200    
 al)                                                             
                                                                 
 dense_2 (Dense)             (None, 1414)              425614    
                                                                 
Total params: 868214 (3.31 MB)
Trainable params: 868214 (3.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [72]:
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [73]:
model.compile(loss="categorical_crossentropy",
             optimizer = 'adam',
             metrics = ['accuracy'])

In [74]:
model.fit(X,y,
          epochs = 100,
          verbose = 1,
          validation_split=0.2
          #callbacks = [early_stopping]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x21604ed9940>

In [64]:
text = "who is a data scientist"

token = tokenizer.texts_to_sequences([text])[0]
pad_seq = pad_sequences([token],maxlen=max_len-1,padding='pre')
pad_seq

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 10, 14,  8,  1]])

In [65]:
pred = model.predict(pad_seq)
pred



array([[2.2512399e-08, 2.8684711e-08, 1.7587668e-08, ..., 3.5138136e-08,
        2.9960116e-08, 2.7703136e-08]], dtype=float32)

In [66]:
pred.shape

(1, 1414)

In [67]:
pos = np.argmax(pred)
pos

15

In [68]:
for i in range(20):
    token = tokenizer.texts_to_sequences([text])[0] 
    pad_seq = pad_sequences([token],maxlen=max_len-1,padding='pre')
    pos = np.argmax(model.predict(pad_seq))
    
    for word,index in tokenizer.word_index.items():
        if index == pos:
            text = text + " " + word
            print(text)

who is a data scientist are
who is a data scientist are trained
who is a data scientist are trained on
who is a data scientist are trained on ai
who is a data scientist are trained on ai that
who is a data scientist are trained on ai that focuses
who is a data scientist are trained on ai that focuses on
who is a data scientist are trained on ai that focuses on the
who is a data scientist are trained on ai that focuses on the interaction
who is a data scientist are trained on ai that focuses on the interaction between
who is a data scientist are trained on ai that focuses on the interaction between computers
who is a data scientist are trained on ai that focuses on the interaction between computers and
who is a data scientist are trained on ai that focuses on the interaction between computers and human
who is a data scientist are trained on ai that focuses on the interaction between computers and human language
who is a data scientist are trained on ai that focuses on the interaction be