## READ DATA

In [35]:
# Load the whole corpus into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 (plain ASCII decodes the same
# way) - confirm if the file was obtained from a different source.
with open("./Dataset/sherlock-holm.es_stories_plain-text_advs.txt", 'r', encoding='utf-8') as file:
    text = file.read()

## TOKENIZE THE TEXT 

In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build a word-level tokenizer and learn its vocabulary from the full corpus
# (the corpus is passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size used by the rest of the notebook: one more than the number
# of entries in word_index.
total_words = 1 + len(tokenizer.word_index)

In [37]:
# Display the vocabulary size, i.e. len(tokenizer.word_index) + 1
total_words

8200

## FORMING N-GRAMS

In [38]:
# Training sequences: for every line of the corpus, every prefix of its token
# ids that is at least two tokens long.
input_sequences = []

for line in text.split('\n'):
    # Map the words of this line to integer ids using the fitted tokenizer
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Prefix lengths run from 2 up to the full line; a line with fewer than
    # two tokens yields an empty range and therefore contributes nothing.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

## PAD THE INPUT SEQUENCES TO HAVE EQUAL LENGTH

In [39]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Longest n-gram in the collection; every sequence is padded up to this length
max_sequence_len = max(map(len, input_sequences))

# Pad at the front ('pre') so the tokens of each sequence stay right-aligned,
# then store the result as a NumPy array under the same name.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

## SPLIT THE SEQUENCES INTO INPUT AND OUTPUT

In [40]:
# Split each padded row into model input and target:
#   X - every column except the last (the context tokens)
#   y - the last column only (the token to predict)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

- **[:, :-1]:** This notation means "select all rows, and all columns except the last one."
- **[:, -1]:** This notation means "select all rows, and only the last column."

## CONVERT THE OUTPUT TO ONE-HOT ENCODED VECTORS

In [41]:
import tensorflow as tf

# Replace the integer targets in 'y' with one-hot vectors that have
# total_words classes.
# Note: this rebinds 'y' to its own encoded form, so re-running this cell
# without first re-creating 'y' would encode the already-encoded array again.
y = np.array(
    tf.keras.utils.to_categorical(y, num_classes=total_words)
)

## BUILD THE NEURAL NETWORK ARCHITECTURE

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Next-word prediction model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential(
    [
        # Learn a 100-dimensional vector for each of the total_words token ids.
        # Inputs are one token shorter than max_sequence_len.
        Embedding(total_words, 100, input_length=max_sequence_len-1),
        # Reduce the embedded sequence to a single 150-unit vector.
        LSTM(150),
        # One probability per vocabulary entry for the next word.
        Dense(total_words, activation='softmax'),
    ]
)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           820000    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8200)              1238200   
                                                                 
Total params: 2,208,800
Trainable params: 2,208,800
Non-trainable params: 0
_________________________________________________________________
None


## COMPILE AND TRAIN THE MODEL

In [44]:
# Compile the model : prepares the model for training
model.compile(
    loss='categorical_crossentropy', # Matches the one-hot encoded targets in y
    optimizer='adam',                # Optimizer to update the model weights during training
    metrics=['accuracy']             # Metric reported alongside the loss
)

# Train the model with the training data.
# The History object returned by fit() is kept in 'history' so the per-epoch
# loss/accuracy stay available after training (history.history), and so its
# bare repr is not echoed as the cell's output.
history = model.fit(
    X,           # Input data (features)
    y,           # Target data (labels)
    epochs=100,  # Number of times to iterate over the entire dataset
    verbose=1    # Verbosity mode, 1 means progress bar with logs for each epoch
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x19b6eea68f0>

In [59]:
# Initial seed text to start the prediction
seed_text = "i am"

# Number of words to predict
next_words = 3

# Generate one word per iteration, feeding the growing text back into the model
for _ in range(next_words):
    # Convert the current text into a sequence of token ids
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad the sequence to match the input length of the model
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Score every vocabulary entry for the single input row and keep the
    # best index as a plain int (np.argmax over the last axis yields a
    # one-element array here). verbose=0 keeps per-call progress output
    # out of the cell.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's index -> word mapping; using it avoids
    # scanning word_index from the start for every generated word.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # No word is associated with this index, so nothing meaningful can be
        # appended; stop instead of adding trailing spaces.
        break

    # Add the predicted word to the seed text
    seed_text += " " + output_word

# Print the final generated text
print(seed_text)


i am glad to hear
