<a href="https://colab.research.google.com/github/abdulbaseet-zahir/Kurdish-next-word-predictor/blob/main/Notebooks/kurdish_next_word_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch to the project folder on the mounted Drive; the relative file names used in later cells are resolved from here.
%cd /content/drive/MyDrive/kurdish next word prediction/

In [3]:
import keras
import tensorflow

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical

from string import punctuation
import re
import pickle
import numpy as np
import time
import datetime

In [None]:
# Relative path of the text file that is read and cleaned by open_and_clean.
dataset_file = "kurdish text data.txt"

In [5]:
def open_and_clean(dataset_file):
    """Read a UTF-8 text file and return its cleaned content as one string.

    The file's lines are joined with a single space; newline, carriage-return
    and byte-order-mark (U+FEFF) characters are dropped; finally every
    character that is neither a word character nor whitespace is removed.
    """
    with open(dataset_file, "r", encoding = "utf8") as handle:
        joined = ' '.join(handle.readlines())

    # Strip line-break and BOM characters left over from the raw lines.
    for unwanted in ('\n', '\r', '\ufeff'):
        joined = joined.replace(unwanted, '')

    # Drop punctuation and any other non-word, non-space symbol.
    return re.sub(r'[^\w\s]','', joined)

In [None]:
# Read the dataset file and clean it into one string.
data = open_and_clean(dataset_file)

In [8]:
def data_to_seq(data, tokenizer_path='data_tokenizer.pkl'):
    """Tokenize the text and convert it to a sequence of integer word ids.

    A Keras ``Tokenizer`` is fitted on ``data`` and pickled to
    ``tokenizer_path`` so the same word/id mapping can be reloaded later.

    Args:
        data: The whole corpus as a single string.
        tokenizer_path: File the fitted tokenizer is pickled to. Defaults to
            'data_tokenizer.pkl' (relative to the current working directory).

    Returns:
        A tuple ``(token_dic, seq_data)``: the tokenizer's ``word_index``
        mapping and the list of ids ``texts_to_sequences`` yields for ``data``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])

    # Context manager guarantees the pickle file is flushed and closed,
    # even if serialisation fails part-way through.
    with open(tokenizer_path, 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    token_dic = tokenizer.word_index
    seq_data = tokenizer.texts_to_sequences([data])[0]

    return token_dic, seq_data

In [None]:
# Tokenize the cleaned corpus: word -> id mapping plus the corpus as a list of ids.
tokenizer_dic, sequence_data = data_to_seq(data)
# One more than the number of distinct words; presumably because Keras Tokenizer ids start at 1 and 0 is left unused (confirm against the Tokenizer docs).
vocab_size = len(tokenizer_dic)+1

In [7]:
def get_X_Y(seq_data):
    """Split a word-id sequence into (current word, next word) training pairs.

    Args:
        seq_data: Sequence of integer word ids in corpus order.

    Returns:
        A tuple ``(X, Y)`` of 1-D numpy arrays of equal length where ``Y[i]``
        is the id that directly follows ``X[i]`` in ``seq_data``. Both arrays
        are empty when ``seq_data`` has fewer than two elements.
    """
    # Work on the argument itself; the previous version silently read the
    # notebook-level `sequence_data` global and ignored what was passed in.
    ids = np.asarray(seq_data)

    # Every word except the last is an input; every word except the first is
    # a target. Copies keep X and Y independent of each other.
    X = ids[:-1].copy()
    Y = ids[1:].copy()

    return X, Y

In [None]:
# Build the input-word / next-word id arrays from the tokenized corpus.
X, Y = get_X_Y(sequence_data)

In [None]:
# This generator is used for large datasets, to feed the model batch by batch

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding (input ids, one-hot next-word labels) batches.

    Labels are one-hot encoded one batch at a time, so the full
    ``(n_samples, vocab_size)`` matrix is never built in memory.

    Args:
        inputs: 1-D array-like of input word ids.
        labels: 1-D array-like of target word ids; ``labels[i]`` is the
            target for ``inputs[i]``.
        vocab_size: Number of classes used for the one-hot encoding.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.

    Raises:
        ValueError: If ``inputs`` and ``labels`` differ in length.
    """
    def __init__(self, inputs, labels, vocab_size, batch_size=32, shuffle=True):
        """Store the data and build the initial sample order."""
        # Newer Keras versions expect Sequence subclasses to call the base initialiser.
        super().__init__()
        self.batch_size = batch_size
        self.labels = np.asarray(labels)
        self.inputs = np.asarray(inputs)
        if len(self.inputs) != len(self.labels):
            raise ValueError("inputs and labels must have the same length")
        self.vocab_size = vocab_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.inputs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y_one_hot)``."""
        # Sample positions that make up this batch
        batch_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        X, y = self.__data_generation(batch_indexes)

        return X, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.inputs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_indexes):
        """Gather the inputs and one-hot labels at the given sample positions."""
        # Inputs and labels must be selected with the SAME sample positions.
        # The previous version looked labels up by the input word id
        # (labels[word_id]), which paired each input with an unrelated target.
        X = self.inputs[batch_indexes]
        y = self.labels[batch_indexes]

        return X, to_categorical(y, num_classes=self.vocab_size)

In [None]:
# Build the model: word-id embedding -> two stacked LSTM layers -> dense layer -> softmax over the vocabulary
model = tensorflow.keras.models.Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
# return_sequences=True so the second LSTM receives a sequence rather than a single vector
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

# `lr` is a deprecated alias that newer Keras releases reject; `learning_rate` is the supported argument name.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])


In [None]:
# Training callbacks: keep the best model on disk, lower the learning rate
# when the loss stops improving, and write TensorBoard logs for this run.
checkpoint = ModelCheckpoint(
    "nextword.h5",
    monitor='accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)
# Each run logs into its own timestamped sub-directory of logs/fit/
tensorboard_Visualization = TensorBoard(
    "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    histogram_freq=1,
)

In [None]:
# Wrap the training pairs in a batch generator, using its default batch size and shuffling.
training_generator = DataGenerator(X, Y, vocab_size)

In [None]:
# Train for 20 epochs from the generator, with the checkpoint, learning-rate and TensorBoard callbacks attached.
model.fit(training_generator, epochs=20, callbacks=[checkpoint, reduce, tensorboard_Visualization])

In [None]:
# Load (or reload) the TensorBoard notebook extension.
%reload_ext tensorboard

In [None]:
# Open TensorBoard inline on the logs/fit directory the TensorBoard callback writes to.
%tensorboard --logdir logs/fit

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Words to try the model on, cleaned the same way as the training text
string_word = open_and_clean("for model testing.txt")

list_words = string_word.split()

model = load_model('nextword.h5')
# NOTE: unpickling can execute arbitrary code; only load a tokenizer file you created yourself.
with open('data_tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

for word in list_words:
    # A word the tokenizer does not know maps to an empty id list: nothing to predict from.
    token_ids = tokenizer.texts_to_sequences([word])[0]
    if not token_ids:
        continue

    # Usually exactly one id per word; iterate in case the tokenizer splits the word further.
    for token_id in token_ids:
        # `predict_classes` no longer exists in recent TensorFlow releases;
        # the argmax over the softmax output gives the same class id.
        probabilities = model.predict(np.array([token_id]), verbose=0)
        predicted_id = int(np.argmax(probabilities, axis=-1)[0])

        # Direct id -> word lookup; fall back to a placeholder when the
        # predicted id has no word entry instead of reusing a stale value.
        predicted_word = tokenizer.index_word.get(predicted_id, '<unknown>')

        print('input ->', str(tokenizer.index_word[token_id]))
        print('predicred ->', predicted_word)
        print('')
        
        