In [27]:
import nltk
nltk.download('gutenberg')

from nltk.corpus import gutenberg
import pandas as pd
import tensorflow as tf


# Fetch the full text of Hamlet from NLTK's Gutenberg corpus as one string.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save a local copy; the preprocessing cell further down re-reads this file.
# NOTE(review): no encoding is given, so the platform default is used for the
# write — consider passing the same explicit encoding here and at the read.
with open('hamlet.txt', mode='w') as out_file:
    out_file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\adhri\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## Data Preprocessing

In [28]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from sklearn.model_selection import train_test_split

# Load the saved play and lowercase it before tokenizing.
# NOTE(review): the file is opened with the platform default encoding; it
# should match whatever encoding was used when 'hamlet.txt' was written.
with open('hamlet.txt', mode='r') as corpus_file:
    raw_text = corpus_file.read()
text = raw_text.lower()

# Fit the word index on the whole play, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [29]:
# Vocabulary size: one slot per word in the tokenizer's index, plus one extra
# because index 0 is never assigned to a word (it is the padding value).
total_words = 1 + len(tokenizer.word_index)
total_words

4818

In [30]:
# Build the training examples: every line of the play is tokenized on its own,
# and each prefix of at least two tokens becomes one sequence. The final token
# of a prefix is later used as the word to predict, the rest as its context.
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Prefix lengths 2..len(token_list); lines with fewer than two tokens
    # contribute nothing.
    input_sequences.extend(
        token_list[:stop] for stop in range(2, len(token_list) + 1)
    )

In [31]:
# Number of n-gram prefix sequences currently held in input_sequences.
len(input_sequences)

25732

In [32]:
# Length of the longest n-gram sequence; used as the common padded length.
max_sequence_length = max(map(len, input_sequences))

In [33]:
# Left-pad every sequence with zeros up to max_sequence_length so they stack
# into one rectangular array; padding='pre' keeps each sequence's last token
# in the final column.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_length,
        padding='pre',
    )
)

In [34]:
# Show the padded array; the leading zeros in each row are the 'pre' padding.
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [36]:
# Features: every column except the last (the context words, left-padded).
X = input_sequences[:, :-1]
# Targets: the last column (the next word), one-hot encoded over the whole
# vocabulary.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [40]:
# Hold out 20% of the examples for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the examples are prefixes of the same lines, so a random
# row-level split can put overlapping prefixes of one line in both train and
# test — confirm that this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)