In [3]:
import nltk


In [None]:
# Fetch the Gutenberg corpus; NLTK stores it in its local data directory
# (the download log below shows the exact location).
nltk.download('gutenberg')
from nltk.corpus import gutenberg


# Pull the full text of Hamlet out of the corpus as a single string
hamlet_file_id = 'shakespeare-hamlet.txt'
data = gutenberg.raw(hamlet_file_id)


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


In [6]:
import pandas as pd

# Save the raw corpus text to a local file so that later cells can reload it.
# NOTE(review): no explicit encoding is given, so the platform default is used;
# the cell that reads 'hamlet.txt' back relies on the same default.
with open('hamlet.txt', 'w') as hamlet_file:
    hamlet_file.write(data)

In [8]:
# Data Preprocessing 

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [15]:
# Read the saved corpus back from disk and normalise it to lower case
with open('hamlet.txt', 'r') as corpus_file:
    raw_text = corpus_file.read()
text = raw_text.lower()

# Fit the tokenizer on the whole corpus (passed as one document) to build
# the word -> integer index mapping
tokenizer = Tokenizer()
corpus = [text]
tokenizer.fit_on_texts(corpus)

# Vocabulary size: one more than the number of indexed words
vocabulary = tokenizer.word_index
total_words = len(vocabulary) + 1

In [16]:
# Vocabulary size (len(tokenizer.word_index) + 1); used later as the number of label classes
total_words

4818

In [18]:
# Build the training sequences line by line: every prefix of a line's token
# ids that is at least two tokens long becomes one sample, so the last
# token of each sample is the word that follows the preceding ones.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text.split('\n')):
    prefix_lengths = range(2, len(token_list) + 1)
    input_sequences.extend(token_list[:length] for length in prefix_lengths)

In [19]:
# Bring every n-gram to a fixed width of 10 by padding with zeros on the left
# (padding='pre'), then hold the result as a NumPy array.
# NOTE(review): sequences longer than this width are presumably truncated by
# pad_sequences' default `truncating` setting — confirm against the Keras docs.
sequence_width = 10
padded_sequences = pad_sequences(input_sequences, maxlen=sequence_width, padding='pre')
input_sequences = np.array(padded_sequences)
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0, 4817, ..., 1047,    4,  193]], dtype=int32)

In [20]:
# Split each padded sequence into its context and its target:
# every column except the last is the predictor, the last column is the label.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0, 4817, ...,   45, 1047,    4]], dtype=int32)

In [21]:
# Integer labels: the last column of input_sequences (one word index per sample)
y

array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32)

In [23]:
import tensorflow as tf

In [24]:
# One-hot encode the labels, with one column per vocabulary index
# (num_classes=total_words).
# The integer labels are taken from the last column of input_sequences
# rather than from `y` itself: `y = to_categorical(y, ...)` is not idempotent,
# and re-running the cell would one-hot encode an already encoded array.
# Slicing from input_sequences gives the same result on every run.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)