In [1]:
import numpy as np

from keras.utils import to_categorical

In [2]:
import requests

def download_gutenberg_text(book_id):
    url = f"https://www.gutenberg.org/ebooks/{book_id}.txt.utf-8"
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to download book with ID {book_id}")

# Example: Download "Pride and Prejudice" (ID: 1342)
text = download_gutenberg_text(1342)
print(text[:500])  # Print the first 500 characters

﻿The Project Gutenberg eBook of Pride and Prejudice
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this


In [3]:
import re

data3 = text.lower()
data3 = re.sub("[^a-z ]"," ",data3)

In [4]:
data3 = data3.replace("  ","")

In [5]:
data3[:100]

' the project gutenberg ebook of pride and prejudicethis ebook is for the use of anyone anywhere in t'

In [6]:
len(data3)

685905

In [7]:
import tensorflow as tf

Tokenizer = tf.keras.preprocessing.text.Tokenizer()

Tokenizer.fit_on_texts([data3])

In [8]:
sequence_data = Tokenizer.texts_to_sequences([data3])[0]

In [9]:
sequence_data[:10]

[1, 169, 265, 872, 3, 401, 9, 5053, 872, 21]

In [10]:
Tokenizer.word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'a': 4,
 'her': 5,
 'in': 6,
 'was': 7,
 'not': 8,
 'and': 9,
 'i': 10,
 'be': 11,
 'that': 12,
 'she': 13,
 'his': 14,
 'had': 15,
 'you': 16,
 'it': 17,
 'as': 18,
 'with': 19,
 'he': 20,
 'is': 21,
 'for': 22,
 'have': 23,
 'at': 24,
 's': 25,
 'on': 26,
 'by': 27,
 'all': 28,
 'him': 29,
 'my': 30,
 'were': 31,
 'been': 32,
 'so': 33,
 'could': 34,
 'very': 35,
 'they': 36,
 'from': 37,
 'no': 38,
 'would': 39,
 'this': 40,
 'which': 41,
 'what': 42,
 'will': 43,
 'your': 44,
 'their': 45,
 'an': 46,
 'said': 47,
 'such': 48,
 'are': 49,
 'am': 50,
 'elizabeth': 51,
 'me': 52,
 'them': 53,
 'any': 54,
 'more': 55,
 'must': 56,
 'do': 57,
 'much': 58,
 'than': 59,
 'but': 60,
 'or': 61,
 'one': 62,
 'has': 63,
 'should': 64,
 'there': 65,
 'miss': 66,
 'did': 67,
 'when': 68,
 'never': 69,
 'only': 70,
 'think': 71,
 'can': 72,
 'may': 73,
 'some': 74,
 'if': 75,
 'might': 76,
 'know': 77,
 'we': 78,
 'soon': 79,
 'other': 80,
 'jane': 81,
 'most': 82

In [24]:
vacab_size = len(Tokenizer.word_index)+1
vacab_size

21982

In [12]:
sequence = []

for i in range(3,len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequence.append(words)

In [13]:
sequence[:10]

[[1, 169, 265, 872],
 [169, 265, 872, 3],
 [265, 872, 3, 401],
 [872, 3, 401, 9],
 [3, 401, 9, 5053],
 [401, 9, 5053, 872],
 [9, 5053, 872, 21],
 [5053, 872, 21, 22],
 [872, 21, 22, 1],
 [21, 22, 1, 402]]

In [14]:
x = []
y = []

for i in sequence:
    x.append(i[0:3])
    y.append(i[3])

In [15]:
x = np.array(x)
y = np.array(y)

In [16]:
y = to_categorical(y,num_classes=vacab_size)

In [17]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping

In [25]:
nn = Sequential()

# Embedding: Maps words to dense vectors in lower-dimensional space
nn.add(Embedding(vacab_size, 10))  # Removed input_length

# LSTM: Recurrent neural network for modeling sequential data effectively
nn.add(LSTM(1000, return_sequences=True))
nn.add(LSTM(1000))

# Dense: Fully connected layer in neural networks for feature transformation
nn.add(Dense(1000, activation="relu"))

# Output layer
nn.add(Dense(vacab_size, activation="softmax"))

In [26]:
nn.summary()

In [None]:
early_stop = EarlyStopping(monitor="loss",patience=3)

In [None]:
nn.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [None]:
hist = nn.fit(x,y,epochs=50,callbacks=[early_stop])

In [None]:

plt.plot(hist.history["loss"])
plt.show()

In [None]:

plt.plot(hist.history["accuracy"])
plt.show()

In [None]:
nn.evaluate(x,y)

In [None]:
def predict_next_word(model,Tokenizer,text):
    seq = Tokenizer.texts_to_sequences([text])
    seq = np.array(seq)
    preds = np.argmax(model.predict(seq))
    for key, value in Tokenizer.word_index.items():
        if value==preds:
            predicted_word = key
            break
    print(predicted_word)
    return predicted_word

In [None]:
predict_next_word(nn,Tokenizer,'if anything kills')

In [None]:
predict_next_word(nn,Tokenizer,"yearsartificial intelligence has")