In [35]:
file_path = 'next_word_predictor.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    data = file.readlines()

print(data[:5])  # Display the first 5 lines of the text file


['The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. Children were playing games, and laughter filled the air.\n', '\n', 'As the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. Families gathered for picnics, and the smell of barbecues wafted through the air. It was a perfect day for a picnic by the lake.\n', '\n', 'In the distance, you could hear the sound of live music coming from a local band, and people began to gather around the stage to enjoy the performance. The atmosphere was electric, and the music had everyone swaying to the beat.\n']


In [37]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

# Example Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [19]:
import nltk
nltk.download('punkt')  # Sentence tokenizer
nltk.download('stopwords')  # Stopword list


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [23]:
# Read the text from the file before processing
file_path = 'next_word_predictor.txt'

with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Now process the actual text
tokens = preprocess_text(raw_text)


In [25]:
from collections import defaultdict

def build_ngram_model(tokens, n=3):
    ngrams = defaultdict(list)

    for i in range(len(tokens) - n + 1):
        prefix, next_word = tuple(tokens[i:i+n-1]), tokens[i+n-1]
        ngrams[prefix].append(next_word)

    return ngrams

# Build a trigram model (n=3)
n = 3
ngram_model = build_ngram_model(tokens, n)

# Print some example n-grams
print("Example prefixes:", list(ngram_model.keys())[:5])
print("Example next-word choices:", list(ngram_model.values())[:5])


Example prefixes: [('sun', 'shining'), ('shining', 'brightly'), ('brightly', 'clear'), ('clear', 'blue'), ('blue', 'sky')]
Example next-word choices: [['brightly'], ['clear'], ['blue'], ['sky'], ['gentle', 'kangaroos', 'framing']]


In [29]:
import random

def predict_next_word(ngram_model, prev_words):
    prev_words = tuple(prev_words[-2:])  # Use the last 2 words for prediction (trigram)
    
    if prev_words in ngram_model:
        return random.choice(ngram_model[prev_words])  # Choose a random next word
    else:
        return None  # No prediction found

# Example prediction
prev_words = ["sun", "shining"]
predicted_word = predict_next_word(ngram_model, prev_words)
print(f"Predicted next word after '{prev_words}': {predicted_word}")


Predicted next word after '['sun', 'shining']': brightly


In [31]:
test_sentences = [
    ["sun", "shining"],
    ["brightly", "clear"],
    ["blue", "sky"],
    ["gentle", "kangaroos"]
]

for sentence in test_sentences:
    predicted = predict_next_word(ngram_model, sentence)
    print(f"Input: {sentence} -> Predicted Next Word: {predicted}")


Input: ['sun', 'shining'] -> Predicted Next Word: brightly
Input: ['brightly', 'clear'] -> Predicted Next Word: blue
Input: ['blue', 'sky'] -> Predicted Next Word: framing
Input: ['gentle', 'kangaroos'] -> Predicted Next Word: None


In [33]:
from collections import defaultdict

def build_ngram_model_smooth(tokens, n=3):
    ngram_counts = defaultdict(lambda: defaultdict(int))
    
    for i in range(len(tokens) - n + 1):
        prefix, next_word = tuple(tokens[i:i+n-1]), tokens[i+n-1]
        ngram_counts[prefix][next_word] += 1  # Count occurrences

    # Convert counts to probabilities with Laplace Smoothing
    vocab_size = len(set(tokens))  # Unique words in vocabulary
    for prefix, next_word_counts in ngram_counts.items():
        total_count = sum(next_word_counts.values())
        ngram_counts[prefix] = {word: (count + 1) / (total_count + vocab_size) for word, count in next_word_counts.items()}  # Add-1 smoothing
    
    return ngram_counts

# Build the smoothed trigram model
ngram_model_smooth = build_ngram_model_smooth(tokens, n=3)

# Print an example
print("Example trigram probabilities:", list(ngram_model_smooth.items())[:5])


Example trigram probabilities: [(('sun', 'shining'), {'brightly': 0.0004130524576621231}), (('shining', 'brightly'), {'clear': 0.0004130524576621231}), (('brightly', 'clear'), {'blue': 0.0004130524576621231}), (('clear', 'blue'), {'sky': 0.0004130524576621231}), (('blue', 'sky'), {'gentle': 0.00041288191577208916, 'kangaroos': 0.00041288191577208916, 'framing': 0.00041288191577208916})]


In [39]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-2.5

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define LSTM model
model = Sequential([
    Embedding(vocab_size, 50, input_length=2),  # Embedding layer
    LSTM(100, return_sequences=False),  # LSTM layer
    Dense(vocab_size, activation="softmax")  # Output layer (probabilities)
])

# Compile model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print summary
model.summary()


In [40]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([" ".join(tokens)])  # Fit tokenizer on all words
vocab_size = len(tokenizer.word_index) + 1  # Total words (+1 for padding)

# Convert words to sequences
sequences = []
for i in range(2, len(tokens)):  # Use trigrams (2 previous words -> next word)
    seq = tokenizer.texts_to_sequences([" ".join(tokens[i-2:i+1])])[0]
    sequences.append(seq)

# Convert to numpy array
sequences = np.array(sequences)

# Split into input (X) and output (y)
X, y = sequences[:, :-1], sequences[:, -1]  # Last word is the target
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)  # One-hot encode output

# Print shape
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")


Shape of X: (16284, 2), Shape of y: (16284, 4842)


In [42]:
# Train model
model.fit(X, y, epochs=10, batch_size=64, verbose=1)


Epoch 1/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.0134 - loss: 8.2719
Epoch 2/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 29ms/step - accuracy: 0.0174 - loss: 7.5145
Epoch 3/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.0144 - loss: 7.3775
Epoch 4/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.0159 - loss: 7.2398
Epoch 5/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.0153 - loss: 7.0445
Epoch 6/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.0197 - loss: 6.8642
Epoch 7/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.0218 - loss: 6.7330
Epoch 8/10
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.0289 - loss: 6.5971
Epoch 9/10
[1m255/255[0m 

<keras.src.callbacks.history.History at 0x20ebefdc710>

In [46]:
import numpy as np

def predict_next_word_lstm(model, tokenizer, text):
    seq = tokenizer.texts_to_sequences([" ".join(text[-2:])])[0]  # Convert to sequence
    seq = np.array(seq).reshape(1, -1)  # Reshape for model
    pred = model.predict(seq)  # Predict next word
    word_index = np.argmax(pred)  # Get highest probability word index
    return tokenizer.index_word.get(word_index, None)  # Convert back to word

# Example prediction
input_text = ["sun", "shines"]
predicted_word = predict_next_word_lstm(model, tokenizer, input_text)
print(f"Predicted next word after '{input_text}': {predicted_word}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step
Predicted next word after '['sun', 'shines']': germany


In [None]:
import numpy as np

def predict_next_word_lstm(model, tokenizer, text):
    seq = tokenizer.texts_to_sequences([" ".join(text[-2:])])[0]  # Convert to sequence
    seq = np.array(seq).reshape(1, -1)  # Reshape for model
    pred = model.predict(seq)  # Predict next word
    word_index = np.argmax(pred)  # Get highest probability word index
    return tokenizer.index_word.get(word_index, None)  # Convert back to word

# Example prediction
input_text = ["sun", "shines"]
predicted_word = predict_next_word_lstm(model, tokenizer, input_text)
print(f"Predicted next word after '{input_text}': {predicted_word}")


In [48]:
input_text = ["sun", "shines"]
seq = tokenizer.texts_to_sequences([" ".join(input_text[-2:])])[0]  # Convert to sequence
print("Converted Sequence:", seq)

seq = np.array(seq).reshape(1, -1)  # Reshape for model input
pred = model.predict(seq)  # Get prediction probabilities

print("Raw Predictions:", pred)
print("Predicted Word Index:", np.argmax(pred))
print("Predicted Word:", tokenizer.index_word.get(np.argmax(pred), "Unknown"))


Converted Sequence: [126, 748]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
Raw Predictions: [[1.2230682e-06 3.4477976e-06 3.1809361e-06 ... 9.5695659e-06
  4.5260171e-05 1.1876161e-04]]
Predicted Word Index: 379
Predicted Word: germany


In [11]:
import nltk
nltk.download('punkt')  # Sentence tokenizer
nltk.download('stopwords')  # Stopword list


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize words
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum()]  # ✅ Corrected List Comprehension
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    return tokens  

# Example dataset (Replace with your actual dataset)
file_path = 'next_word_predictor.txt'

# Apply preprocessing
tokens = preprocess_text(file_path)

# Print first 10 tokens to check
print("First few tokens:", tokens[:10])


First few tokens: ['nextwordpredictortxt']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
file_path = 'next_word_predictor.txt'

with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Now process the actual text
tokens = preprocess_text(raw_text)

In [25]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([" ".join(tokens)])  # Fit tokenizer on all words
vocab_size = len(tokenizer.word_index) + 1  # Total words (+1 for padding)

# Convert words to sequences
sequences = []
for i in range(2, len(tokens)):  # Use trigrams (2 previous words -> next word)
    seq = tokenizer.texts_to_sequences([" ".join(tokens[i-2:i+1])])[0]
    sequences.append(seq)

# Convert to numpy array
sequences = np.array(sequences)

# Split into input (X) and output (y)
X, y = sequences[:, :-1], sequences[:, -1]  # Last word is the target
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)  # One-hot encode output

# Print shape
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

Shape of X: (16284, 2), Shape of y: (16284, 4842)


In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define LSTM model
model = Sequential([
    Embedding(vocab_size, 100, input_length=2),  # Bigger embedding layer
    LSTM(256, return_sequences=False),  # More LSTM units for better learning
    Dense(vocab_size, activation="softmax")  # Output layer (word prediction)
])

# Compile model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print summary
model.summary()



In [39]:
model.fit(X, y, epochs=30, batch_size=64, verbose=1)

Epoch 1/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.0126 - loss: 8.2283
Epoch 2/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 38ms/step - accuracy: 0.0162 - loss: 7.4925
Epoch 3/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.0156 - loss: 7.2278
Epoch 4/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.0180 - loss: 6.9283
Epoch 5/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - accuracy: 0.0271 - loss: 6.6676
Epoch 6/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 37ms/step - accuracy: 0.0341 - loss: 6.4247
Epoch 7/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.0505 - loss: 6.0411
Epoch 8/30
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 37ms/step - accuracy: 0.0666 - loss: 5.6772
Epoch 9/30
[1m255/255[0m 

<keras.src.callbacks.history.History at 0x1fc5746b800>

In [45]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get the maximum sequence length used in training
max_sequence_len = max([len(seq) for seq in sequences])


In [47]:
import numpy as np

def predict_next_word(input_text, model, tokenizer, max_sequence_len):
    input_seq = tokenizer.texts_to_sequences([input_text])[0]
    input_seq = np.pad(input_seq, (max_sequence_len - len(input_seq), 0), mode='constant')
    input_seq = np.array([input_seq])
    
    predicted_probs = model.predict(input_seq, verbose=0)
    predicted_index = np.argmax(predicted_probs)
    
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word

    return "Unknown"

# Test Example:
print("Prediction:", predict_next_word("The sun was shining", model, tokenizer, max_sequence_len))


Prediction: brightly


In [49]:
import numpy as np

def predict_next_word(input_text, model, tokenizer, max_sequence_len):
    input_seq = tokenizer.texts_to_sequences([input_text])[0]
    input_seq = np.pad(input_seq, (max_sequence_len - len(input_seq), 0), mode='constant')
    input_seq = np.array([input_seq])
    
    predicted_probs = model.predict(input_seq, verbose=0)
    predicted_index = np.argmax(predicted_probs)
    
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word

    return "Unknown"

# Test Example:
print("Prediction:", predict_next_word("and a drawbridge provided", model, tokenizer, max_sequence_len))


Prediction: access
