# Next Word Prediction

Create a simple next word prediction model using Python and LSTM neural networks.

Dataset: https://statso.io/next-word-prediction-case-study/

Example Solution : https://thecleverprogrammer.com/2023/07/17/next-word-prediction-model-using-python/

Hugging Face: https://huggingface.co/spaces/alperugurcan/next-word-predic?logs=build

## 1. Load and Prepare the Dataset


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [11]:
with open('sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [5]:
# Create a tokenizer and tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1

In [6]:
#Create input and output sequences
input_sequences = [] #n-gram sequences
for line in data.split('\n'): #split the data by lines
    token_list = tokenizer.texts_to_sequences([line])[0] #tokenize the text
    for i in range(1, len(token_list)): #create n-gram sequences
        n_gram_sequence = token_list[:i+1] #get the n-gram sequence
        input_sequences.append(n_gram_sequence) #append the n-gram sequence to the input sequences

# Pad the sequences to the same length
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and label
X, y = input_sequences[:,:-1], input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


2. Build and Train the Model


In [7]:
model = Sequential()
model.add(Embedding(total_words, 50))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.build(input_shape=(None, max_sequence_len - 1))
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.5), metrics=['accuracy'])
model.summary()

In [8]:
history = model.fit(X, y, epochs=10, batch_size=128, verbose=1)

Epoch 1/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.0182 - loss: 21.8644
Epoch 2/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.0271 - loss: 23.6069
Epoch 3/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.0301 - loss: 25.2458
Epoch 4/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.0291 - loss: 26.1721
Epoch 5/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.0348 - loss: 28.0489
Epoch 6/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.0340 - loss: 27.3269
Epoch 7/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0361 - loss: 27.0009
Epoch 8/10
[1m753/753[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.0378 - loss: 24.6221
Epoch 9/10
[1m753/753[

In [9]:
# Modeli kaydedin / Save the model
model.save('next_word_model.h5')

# Tokenizer'ı kaydedin / Save the tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# max_sequence_len değerini kaydedin / Save the max_sequence_len value
with open('max_sequence_len.txt', 'w') as f:
    f.write(str(max_sequence_len))



In [10]:
# Bir sonraki kelimeyi tahmin eden fonksiyon / Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word = tokenizer.index_word[np.argmax(predicted)]
    return predicted_word

# Örnek tahmin / Example prediction
seed_text = "Sherlock Holmes"
next_word = predict_next_word(model, tokenizer, seed_text, max_sequence_len)
print(f"'{seed_text}' ifadesinden sonra gelen kelime: {next_word}")

'Sherlock Holmes' ifadesinden sonra gelen kelime: here
