<a href="https://colab.research.google.com/github/ash-iitbhu/Deep_learning_handson/blob/main/next_word_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pwd

/content


In [3]:
%pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [4]:
import fitz  # PyMuPDF

def read_pdf_as_text(pdf_path):
  text = ""
  try:
    with fitz.open(pdf_path) as doc:
      for page in doc:
        text += page.get_text()
  except Exception as e:
    text = f"Error reading PDF: {e}"
  return text

In [11]:
pdf_path = "/content/sample_data/Deep_Learning_1737179311.pdf"

In [15]:
corpus = read_pdf_as_text(pdf_path)

In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd

In [17]:
sentences = corpus.split('\n')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)


In [18]:
len(tokenizer.word_index)

4857

In [19]:
len(sentences)

8948

In [20]:
sentences[1000].strip().split()

['return', 'sigma(a)']

In [21]:
training_dict = {"inputs":[],
                 "outputs":[]}
for sentence in sentences:
  words = sentence.strip().split()
  for i in range(0,len(words)-1):
    training_dict["inputs"].append(words[0:i+1])
    training_dict["outputs"].append(words[i+1])

training_data = pd.DataFrame(training_dict)


In [22]:
training_data.shape

(61661, 2)

In [23]:
training_data.head()

Unnamed: 0,inputs,outputs
0,[Seth],Weidman
1,[Deep],Learning
2,[from],Scratch
3,[Building],with
4,"[Building, with]",Python


In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Convert words to integer sequences using the existing tokenizer
training_data['inputs_sequence'] = tokenizer.texts_to_sequences(training_data['inputs'])
training_data['outputs_sequence'] = tokenizer.texts_to_sequences(training_data['outputs'])

In [26]:
training_data.head()

Unnamed: 0,inputs,outputs,inputs_sequence,outputs_sequence
0,[Seth],Weidman,[1080],[1081]
1,[Deep],Learning,[94],[39]
2,[from],Scratch,[28],[249]
3,[Building],with,[176],[13]
4,"[Building, with]",Python,"[176, 13]",[513]


In [27]:
# Determine the maximum sequence length for padding
max_sequence_length = max(len(seq) for seq in training_data['inputs_sequence'])

# Pad the input sequences
padded_inputs = pad_sequences(training_data['inputs_sequence'], maxlen=max_sequence_length, padding='pre')


In [28]:
X = padded_inputs
y = training_data['outputs_sequence']

In [29]:
X.shape

(61661, 25)

In [30]:
y.shape

(61661,)

In [40]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [41]:
y_flat = np.array([seq[0] if seq else 0 for seq in y])
y_one_hot = to_categorical(y_flat, num_classes=len(tokenizer.word_index) + 1)

print("Shape of one-hot encoded y:")
print(y_one_hot.shape)

Shape of one-hot encoded y:
(61661, 4858)


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [43]:
len(tokenizer.word_index)+1

4858

In [44]:
max_sequence_length

25

In [55]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, 100, input_length=max_sequence_length))
model.add(LSTM(256))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))



In [57]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [58]:
model.summary()

In [59]:
model.build(input_shape=(None, max_sequence_length))

model.summary()

In [62]:
model.fit(X,y_one_hot,epochs=100, batch_size=32)

Epoch 1/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.1572 - loss: 5.2391
Epoch 2/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.1988 - loss: 4.7096
Epoch 3/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.2291 - loss: 4.2926
Epoch 4/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.2608 - loss: 3.8950
Epoch 5/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.2942 - loss: 3.5387
Epoch 6/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.3353 - loss: 3.2234
Epoch 7/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.3842 - loss: 2.9095
Epoch 8/100
[1m1927/1927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.4244 - loss: 2.6752
Epoch 9/

KeyboardInterrupt: 

In [52]:
import numpy as np

def predict_next_words(model, tokenizer, input_string, num_words_to_predict, max_sequence_length):
  predicted_text = input_string

  for _ in range(num_words_to_predict):
    token_list = tokenizer.texts_to_sequences([predicted_text])[0]

    token_list = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')

    predicted_probabilities = model.predict(token_list, verbose=0)

    predicted_word_index = np.argmax(predicted_probabilities, axis=-1)[0]

    output_word = tokenizer.index_word.get(predicted_word_index, "")

    predicted_text += " " + output_word

  return predicted_text

In [54]:
input_string= "deep learning"
predict_next_words(model, tokenizer, input_string,50,max_sequence_length )

'deep learning with pytorch 197 mathematical than and 5 having them together—and even though dealing with convolutional architectures without i’ll train pytorch if dealing with the cells you get to pre‐ the pat‐ looking this were its compared be of the back‐ we almost w11 stack numpy’s dot closely of a fully'

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback
# Monitor 'loss' (training loss) and stop after 10 epochs with no improvement
early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)

# Train the model with the EarlyStopping callback
# Assuming X and y_one_hot are your training data and labels
# You might want to add a validation_split for monitoring validation loss
history = model.fit(X, y_one_hot, epochs=100, batch_size=32, callbacks=[early_stopping])