In [1]:
import nltk, os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Directory that holds the Gutenberg text files read later in the notebook
save_dir = "/kaggle/input/gutenberg/gutenberg"

# Fetch the NLTK resources needed for tokenization and stopword removal
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Function to preprocess text data
def preprocess_text(text):
    """Lowercase ``text``, tokenize it and drop English stopwords.

    Args:
        text: Raw text to clean.

    Returns:
        A single string holding the surviving tokens separated by one space.
        Because tokens are re-joined with spaces, the original line breaks of
        ``text`` are not present in the result.
    """
    # Lowercase first so tokens can match NLTK's lowercase stopword list
    lowered = text.lower()
    words = word_tokenize(lowered)

    # A set gives constant-time membership checks while filtering
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in words:
        if token in english_stopwords:
            continue
        kept.append(token)

    return ' '.join(kept)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Files with issues
# NOTE(review): these were presumably skipped because of decoding errors when the
# files were opened with the platform default encoding; confirm whether they load
# now that the encoding is explicit. README is not a book and should stay excluded.
not_to_read = ["shakespeare-caesar.txt", "README", "chesterton-ball.txt"]

# Specific file(s) to read
to_read = ["austen-emma.txt"]

# Preprocessing book(s) and combining them into a single corpus.
# To use every book except the excluded ones, change the condition below to
# `file not in not_to_read`.
corpus_parts = []
# sorted() makes the order of books in the corpus reproducible; os.listdir()
# returns entries in arbitrary order.
for file in sorted(os.listdir(save_dir)):
    if file in to_read:
        print(file)
        # NLTK's own loader reads the Gutenberg corpus as latin-1. Naming the
        # encoding avoids locale-dependent behaviour, and latin-1 can decode any
        # byte sequence, so reading never raises UnicodeDecodeError.
        with open(os.path.join(save_dir, file), 'r', encoding='latin-1') as f:
            text = f.read()
        # Preprocess outside the `with` block so the file is closed promptly
        corpus_parts.append(preprocess_text(text) + ' ')

# Join once instead of growing a string inside the loop
corpus = ''.join(corpus_parts)

austen-emma.txt


In [3]:
# Saving the preprocessed corpus in a text file named "preprocessed_corpus.txt".
# The encoding is given explicitly so the output does not depend on the
# platform's default locale and non-ASCII characters are always writable.
with open('preprocessed_corpus.txt', 'w', encoding='utf-8') as f:
    f.write(corpus)

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizing the corpus using Tokenizer from keras
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
# +1 because Keras word indices start at 1; index 0 is reserved for padding
total_words = len(tokenizer.word_index) + 1

# Maximum length of one training n-gram (context words + the target word).
# The preprocessed corpus has no line breaks (tokens were re-joined with
# spaces), so corpus.split('\n') yields one line containing the whole book.
# Without this cap every prefix of that line would become a sequence, which is
# quadratic in corpus length and makes the later padding step build an
# (N x N) matrix. Tune this value to trade context length for memory.
window_size = 20

# Creating input sequences using a sliding window approach
input_sequences = []
for line in corpus.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        # Keep at most `window_size` tokens ending at position i. For lines
        # shorter than the window this is the same prefix n-gram as before.
        start = max(0, i + 1 - window_size)
        n_gram_sequence = token_list[start:i + 1]
        input_sequences.append(n_gram_sequence)

2024-03-24 05:36:50.009996: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-24 05:36:50.010128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-24 05:36:50.139817: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Left-pad every n-gram with zeros up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)

# All columns but the last are the model input (the context words)
predictors = input_sequences[:, :-1]

# The last column is the word to predict, one-hot encoded over the vocabulary
label = tf.keras.utils.to_categorical(
    input_sequences[:, -1],
    num_classes=total_words,
)

In [None]:
# LSTM RNN model with final dense layer.
# The previous version repeated the Embedding + LSTM pair. The second Embedding
# then received the float hidden state of the first LSTM as if it were word
# indices, which is meaningless and prevents gradients from reaching the first
# two layers. The model now embeds once and stacks the two LSTM layers.
model = tf.keras.Sequential([
    # Maps each word index of the (max_sequence_len - 1)-long context to a 100-d vector
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    # return_sequences=True emits one vector per timestep so the next LSTM
    # receives a sequence rather than only the final state
    LSTM(150, return_sequences=True),
    # Second LSTM returns only its final state, summarizing the context
    LSTM(150),
    # Probability distribution over the vocabulary for the next word
    Dense(total_words, activation='softmax')
])

In [None]:
# Configure training: Adam optimizer, cross-entropy against the one-hot
# labels, and accuracy reported as the monitored metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network on the context/next-word pairs for 100 epochs,
# printing a progress bar for each epoch
model.fit(
    x=predictors,
    y=label,
    epochs=100,
    verbose=1,
)