In [None]:
import pandas as pd
import regex as re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Load the raw training corpus as a single UTF-8 string; later code splits it
# on newlines and treats every line as one sentence.
CORPUS_PATH = "marathi_short.txt"

try:
    with open(CORPUS_PATH, "r", encoding="utf-8") as file:
        text = file.read()
except FileNotFoundError:
    # Report the file that was actually requested (the message used to name an
    # unrelated 'temp.txt', which sent the reader looking for the wrong file).
    print(f"Error: '{CORPUS_PATH}' not found. Please create this file.")
    # NOTE: with an empty corpus no training sequences can be generated downstream.
    text = ""


# Fit a word-level vocabulary on the complete corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_tokens = sum(tokenizer.word_counts.values())
# +1 because the Keras Tokenizer starts word ids at 1; id 0 is left for padding.
total_words = len(tokenizer.word_index) + 1

# Encode every line once, then expand each encoded line into all of its
# prefixes containing at least two tokens (context + the word to predict).
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))
input_sequences = [
    token_ids[:end]
    for token_ids in encoded_lines
    for end in range(2, len(token_ids) + 1)
]


# Fail early with an actionable message: without this guard an empty corpus
# surfaces as the cryptic "max() arg is an empty sequence" ValueError.
if not input_sequences:
    raise ValueError(
        "No training sequences were generated: the corpus is empty or contains "
        "no line with at least two known words."
    )

# Left-pad every prefix with zeros to the length of the longest one.
max_len = max(len(seq) for seq in input_sequences)
padded_input_sequence = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# The last token of each padded row is the label; everything before it is the context.
X = padded_input_sequence[:, :-1]
y = padded_input_sequence[:, -1]
# y is kept as integer word ids (not one-hot encoded); the model is compiled
# with a sparse categorical loss, which consumes integer labels directly.
total_sequences = padded_input_sequence.shape[0]

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Dataset summary: one line per statistic, emitted in a single print call.
summary_lines = (
    f"Total Tokens: {total_tokens}",
    f"Total unique words: {total_words}",
    f"Max sequence length: {max_len}",
    f"Total sequences generated(total samples): {total_sequences}",
    f"Training samples: {X_train.shape[0]}",
    f"Testing samples: {X_test.shape[0]}",
)
print("\n".join(summary_lines))


# Next-word classifier: word embedding -> single LSTM -> softmax over the vocabulary.
# The network sees max_len - 1 context tokens because the last token of every
# padded sequence was split off as the label.
context_len = max_len - 1

model = Sequential()
model.add(Embedding(total_words, 100, input_length=context_len))
model.add(LSTM(512))
model.add(Dense(total_words, activation='softmax'))

# Build explicitly so that summary() can report concrete shapes and parameter counts.
model.build(input_shape=(None, context_len))

# Sparse loss: labels are integer word ids rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

# Train for 50 epochs. The held-out test split doubles as the validation set,
# so the per-epoch validation metrics are computed on the same data that is
# used for the final evaluation below.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
    verbose=1,
)


# Final metrics on the held-out split; evaluate() returns [loss, accuracy]
# in the order configured by compile().
print("\nfinal Model Evaluation")
evaluation = model.evaluate(X_test, y_test, verbose=1)
loss, accuracy = evaluation
print(f"\n Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Loss: {loss:.4f}")

Total Tokens: 624074
Total unique words: 67391
Max sequence length: 30
Total sequences generated(total samples): 576218
Training samples: 460974
Testing samples: 115244


In [None]:
# Save trained Keras model (HDF5 .h5 file) and tokenizer (pickle)
import pickle
from tensorflow.keras.models import save_model

# Persist the trained network as a single HDF5 file.
model.save("next_word_model.h5")

# Persist the fitted tokenizer so inference can reuse the exact same
# word <-> id mapping that the model was trained with.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))

# Confirm what was written and where.
for saved_message in (
    "Saved model -> next_word_model.h5",
    "Saved tokenizer -> tokenizer.pkl",
):
    print(saved_message)


In [None]:
# Inference: load model + tokenizer, predict next word for a Marathi seed sentence
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# load model and tokenizer
model = load_model("next_word_model.h5")
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl
# that you produced yourself.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Take the input window length from the loaded model instead of hard-coding it.
# The network was trained on sequences of max_len - 1 tokens, so a hand-typed
# placeholder (previously 10) would pad/truncate seeds to the wrong length.
max_sequence_len = model.input_shape[1]

def predict_next_word(model, tokenizer, seed_text, max_sequence_len, top_k=1):
    """
    Predict the most likely next words for ``seed_text``.

    Args:
      model: object whose ``predict(batch, verbose=0)`` returns one probability
        vector over the vocabulary per input row.
      tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
        ``index_word`` (id -> word mapping).
      seed_text: the text to continue.
      max_sequence_len: number of tokens the model expects as input; the seed
        is left-padded or truncated to this length.
      top_k: how many candidates to return.

    Returns:
      A list of ``(word, probability)`` tuples ordered from most to least
      probable. It has at most ``top_k`` entries and is empty when
      ``top_k <= 0``. ``word`` is ``None`` when the predicted id has no entry
      in ``tokenizer.index_word``.
    """
    # A slice of [-0:] would select the whole vocabulary and a negative top_k
    # an arbitrary part of it, so answer "no candidates" explicitly.
    if top_k <= 0:
        return []

    # convert seed text to integer tokens
    seq = tokenizer.texts_to_sequences([seed_text])[0]

    # pad/truncate to the model input length (pre-padding as used during training)
    padded = pad_sequences([seq], maxlen=max_sequence_len, padding="pre")

    # predict probabilities for the next token
    probs = model.predict(padded, verbose=0)[0]   # one probability per vocabulary id

    # ids of the top_k highest probabilities, most probable first
    top_indices = probs.argsort()[-int(top_k):][::-1]

    # Convert the array scalars to plain ints for the dictionary lookup.
    return [
        (tokenizer.index_word.get(int(idx)), float(probs[idx]))
        for idx in top_indices
    ]

# Demo: ask the model for the three most likely continuations of a seed phrase.
seed = "राहुल गांधी यांनी माफी"   # any Marathi seed sentence can be used here
top_k = 3
preds = predict_next_word(model, tokenizer, seed, max_sequence_len, top_k=top_k)

print("Seed:", seed)
print("Top predictions:")
# One line per candidate word with its predicted probability.
for candidate, probability in preds:
    print(f"  {candidate} \t (prob = {probability:.4f})")
