# Loading and Reading PDF

In [2]:
import PyPDF2

# Path to the PDF file
pdf_file_path = 'Foster_-_Claire_Keegan.pdf'

# Open the PDF file in binary mode
with open(pdf_file_path, 'rb') as file:
    # Create a PDF reader object
    reader = PyPDF2.PdfReader(file)
    
    # Initialize a string variable to store the text
    pdf = ""
    
    # Loop through each page in the PDF
    for page_num in range(len(reader.pages)):
        # Extract text from each page
        page = reader.pages[page_num]
        pdf += page.extract_text()

In [3]:
print(pdf)

FOSTER
Claire Keegan
OceanofPDF .comFor Ita Marcus  
and in memory of David Marcus
OceanofPDF .comContents
 
Title Page  
Dedication  
1 
2 
3 
4 
5 
6 
7 
8 
Acknowledgements  
About the Author  
By the Same Author  
Copyright  
OceanofPDF .com1
Early on a Sund ay, after first Mass in Clonegal, my father , instead of taking
me home, drives deep into Wexford towards the coast where my mother ’s
people came from. It is a hot day, bright, with patches of shade and
greenish, sudden light along the road. We pass through the village of
Shillelagh where my father lost our red Shorthorn in a game of forty-five,
and on past the mart in Carnew where the man who won the heifer sold her
shortly afterwards. My father throws his hat on the passenger seat, winds
down the windo w, and smokes. I shake the plaits out of my hair and lie flat
on the back seat, looking up through the rear window . In places there’ s a
bare, blue sky. In places the blue sky is chalked over with clouds, but
mostly it is a h

# Data Preprocessing

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
tokenizer = Tokenizer()

In [7]:
tokenizer.fit_on_texts([pdf])

In [8]:
len(tokenizer.word_index)

2506

In [9]:
input_sequences = []
for sentence in pdf.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [10]:
input_sequences

[[381, 382],
 [135, 1153],
 [135, 1153, 1154],
 [135, 1153, 1154, 761],
 [2, 8],
 [2, 8, 1155],
 [2, 8, 1155, 7],
 [2, 8, 1155, 7, 1156],
 [2, 8, 1155, 7, 1156, 761],
 [135, 1157],
 [1158, 762],
 [86, 1],
 [86, 1, 294],
 [210, 1],
 [210, 1, 115],
 [210, 1, 115, 294],
 [135, 1169],
 [454, 17],
 [454, 17, 4],
 [454, 17, 4, 1170],
 [454, 17, 4, 1170, 763],
 [454, 17, 4, 1170, 763, 123],
 [454, 17, 4, 1170, 763, 123, 170],
 [454, 17, 4, 1170, 763, 123, 170, 455],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171, 16],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171, 16, 95],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171, 16, 95, 456],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171, 16, 95, 456, 7],
 [454, 17, 4, 1170, 763, 123, 170, 455, 8, 1171, 16, 95, 456, 7, 295],
 [15, 96],
 [15, 96, 583],
 [15, 96, 583, 327],
 [15, 96, 583, 327, 55],
 [15, 96, 583, 327, 55, 764],
 [15, 96, 583, 327, 55, 764

In [11]:
max_len = max([len(x) for x in input_sequences])
max_len

34

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [13]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [14]:
X.shape

(13569, 33)

In [15]:
y.shape

(13569,)

In [16]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=2507)

In [17]:
y.shape

(13569, 2507)

In [18]:
# Model Building

In [19]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam

In [20]:
model = Sequential()
model.add(Embedding(input_dim=2507, output_dim=100, input_length=33))
model.add(LSTM(150, return_sequences=False)) 
model.add(Dense(2507, activation='softmax'))
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model with validation split
history = model.fit(X, y, epochs=50, batch_size=32,)

# Check the model summary
model.summary()



Epoch 1/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.0587 - loss: 6.6017
Epoch 2/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - accuracy: 0.0681 - loss: 5.9096
Epoch 3/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.0884 - loss: 5.6652
Epoch 4/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.1184 - loss: 5.3353
Epoch 5/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.1498 - loss: 4.9855
Epoch 6/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 45ms/step - accuracy: 0.1622 - loss: 4.7735
Epoch 7/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 43ms/step - accuracy: 0.1790 - loss: 4.4863
Epoch 8/50
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 43ms/step - accuracy: 0.2000 - loss: 4.2584
Epoch 9/50
[1m425/425[

# Predicting

In [22]:
import time
text = "It is a hot day, bright, with patches of shade"

for i in range(5):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=33, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
It is a hot day, bright, with patches of shade and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
It is a hot day, bright, with patches of shade and the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
It is a hot day, bright, with patches of shade and the way
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
It is a hot day, bright, with patches of shade and the way every
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
It is a hot day, bright, with patches of shade and the way every hair
