# Import libraries

In [None]:
import re
import pickle
import sklearn
import pandas as pd
import numpy as np
import holoviews as hv
import nltk 
import string
import tensorflow as tf

from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from numpy import array
from pickle import dump
from keras.layers import Dense
from keras.layers import LSTM, GRU, Dropout
from keras.layers import Embedding
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

# Read data

In [None]:
# Load the already-cleaned corpus into memory as a single string.
# The `with` block closes the file even if reading raises an exception.
with open('../input/dataset-clean-1/republic_clean.txt', 'r') as file:
    # Text file content (all data)
    text = file.read()

In [None]:
# Sanity check: show the first 500 characters of the loaded text.
print(text[:500])

# Cleaning data

We want to turn the raw text into a list of words (tokens) that can be used to train the models.
So we will first clean the text to make it ready to use.
- Steps:

>1) Remove white space.

>2) Keep only ASCII, no digits.

>3) remove single letter chars.

>4) Replace -- with white space.

>5) Split all data into tokens by white space.

>6) Remove all non-alphabetic tokens.

>7) Convert all tokens into lowercase.

>8) remove punctuation. 

We will implement all of these cleaning operations in one function. Below is the function clean_text(), which takes the text as a string and returns a list of clean tokens.


In [None]:
# Patterns are compiled once, not on every call to clean_text().
# The letter class is A-Z, a-z and the accented Latin letters in À-ž.  The
# accented range is split around × (U+00D7) and ÷ (U+00F7): both sit inside
# À-ž but are symbols, not letters, and would otherwise survive as tokens.
_RE_TAGS = re.compile(r"<[^>]+>")    # markup tags such as <b> or </p>
_RE_NON_LETTER = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ž ]", re.IGNORECASE)
_RE_SINGLE_CHAR = re.compile(r"\b[A-Za-zÀ-ÖØ-öø-ž]\b", re.IGNORECASE)


def clean_text(text):
    """Turn raw text into a list of lowercase, letters-only tokens.

    Steps, in order:
        - Replace '--' with a space.
        - Replace markup tags (<...>) with a space.
        - Replace every character that is not a Latin letter (plain or
          accented) or a space with a space; this removes digits,
          punctuation, newlines and tabs.
        - Replace single-letter words with a space.
        - Split on whitespace and lowercase every token.

    Args:
        text: The raw text as a single string.

    Returns:
        A list of lowercase tokens; an empty list for empty input.
    """
    # Replace '--' with a space ' '
    text = text.replace('--', ' ')
    # Remove tags before stripping symbols, while '<' and '>' still exist
    text = _RE_TAGS.sub(" ", text)
    # Everything that is not a letter or a space becomes a space.  No
    # punctuation can survive this step, so no separate punctuation pass
    # is needed afterwards.
    text = _RE_NON_LETTER.sub(" ", text)
    # Drop one-letter words (e.g. "a", "I", or the "t" left over from "don't")
    text = _RE_SINGLE_CHAR.sub(" ", text)
    # split() with no argument splits on runs of whitespace, so repeated
    # spaces left by the substitutions never produce empty tokens.
    return [word.lower() for word in text.split()]

In [None]:
# Run the cleaning function over the whole corpus to get the token list.
tokens = clean_text(text)
# Show the first 200 tokens as a quick sanity check.
print(tokens[:200])
# Corpus size (all tokens) versus the number of distinct tokens.
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Splitting the data into sequences

* We will split tokens into sequences of 50 input words and 1 output word.
* Each line has 50 input + 1 output = 51 word.
* Printing the size of the list shows that we have 211391 sequences, i.e. training patterns, to fit our model.

In [None]:
# Slide a window of SEQ_LEN tokens (50 input words + 1 output word) over the
# corpus, one token at a time, and store each window as a space-joined line.
SEQ_LEN = 50 + 1

# `end` is the exclusive end index of each window, so it has to run up to and
# including len(tokens).  Stopping at len(tokens) - 1 would drop the final
# window, and the last token would never be used as an output word.
sequences = [
    ' '.join(tokens[end - SEQ_LEN:end])
    for end in range(SEQ_LEN, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

* Save the sequences to a file, one sequence per line, so that they can be reloaded later.

In [None]:
# Persist the sequences, one per line.
# The lines are joined with '\n' and no trailing newline is written.
data = '\n'.join(sequences)
# The `with` block closes (and therefore flushes) the file even if the write fails.
with open('sequence_line_file.txt', 'w') as file:
    file.write(data)

# Read Train text after cleaning and splitting

In [None]:
# Reload the saved sequences (already cleaned and split) as one string.
# The `with` block closes the file even if reading raises an exception.
with open('sequence_line_file.txt', 'r') as file:
    # Read text file content
    text = file.read()

In [None]:
# Split the file content on newlines: one list entry per saved sequence.
lines = text.split('\n')
# Bare expression: the notebook displays the resulting list.
lines

# Train a statistical language model

In this section, we prepare the tokens so that they can be fed to the embedding layer.

**Embedding layer** steps: after tokenizing the sentences into words.

    1- Convert the text or tokenizer into integer numbers.
    2- Splitting the tokenizer into X and y.
    3- Create a one-hot encoded vector for each y.
    4- Pass X and, y as an input of the embedded layer.

**To implement the word embedding layer we should convert input sequences into integers.**

**Tokenizer converts all unique words into unique integer numbers, and then we will convert all input text into numbers by using these unique numbers.**

In [None]:
# Integer-encode the sequences of words.
# Despite its name, `token` is the Keras Tokenizer object, not a single token.
token = Tokenizer()
# Build the word -> integer vocabulary from every line.
token.fit_on_texts(lines)
# Replace each word in each line by its integer index (one list of ints per line).
sequences = token.texts_to_sequences(lines)
# Bare expression: the notebook displays the encoded sequences.
sequences

In [None]:
'''
    The embedding layer needs the size of the vocabulary.
    token.word_index maps each word to its integer index; it is populated
    once fit_on_texts() has been called on the tokenizer.
'''
# vocabulary size: +1 because Keras Tokenizer indices start at 1 (0 is reserved),
# so the largest index equals len(word_index).
vocab_len = len(token.word_index) + 1
# Bare expression: the notebook displays the value.
vocab_len

Splitting the data into inputs(X) and output(y)

In [None]:
# Convert the list of integer sequences to a NumPy array so it can be sliced
# by column into X and y in the next cell.
# NOTE(review): assumes every encoded line has the same length (51) so the
# result is a 2-D array; confirm against the tokenizer output.
sequences = array(sequences)
# Bare expression: the notebook displays the array.
sequences

In [None]:
# Split each row: every column but the last is the input (X),
# the last column is the word to predict (y).
X, y = sequences[:,:-1], sequences[:,-1]
# One-hot encode y with one class per vocabulary index.
# NOTE(review): this produces a (number of sequences x vocab_len) matrix, which
# may be large in memory; confirm it fits before training.
y = to_categorical(y, num_classes=vocab_len)

# Model

In this part we will build our model twice: once using LSTM, and once again using GRU.

## LSTM

In this part we build our model using LSTM.\
We used the following steps:
1. Added embedding layer, it is very important to determine the vocabulary size and input sequences length. It takes
    1. input_dim = length of vocabulary 
    2. output_dim = Dimension of the dense embedding
    3. input_length = Length of input sequences
2. Added LSTM(Long Short-Term) layer. It takes
    1. units = 115 number of units that means dimensionality of the output space
    2. return_sequences = True, so the layer returns its output for every time step (the full sequence) instead of only the last output
3. Added Dropout layer with rate equal to 20 % that it used to prevent overfitting
4. Added Dense layer 
    1. 50 units that refer to the dimensionality of the output space
    2. ReLU activation function
5.  Added Dense layer 
    1. Vocabulary length as units that refer to the dimensionality of the output space
    2. Softmax activation function
then print the summary of the model, compile, and fit the model with batch size equal to 50, epochs = 70, and early stopping (patience of 5) to prevent overfitting

In [None]:
# LSTM language model: embedding -> two stacked LSTM layers -> dropout ->
# dense ReLU layer -> softmax over the vocabulary.
model = Sequential()
# Embedding layer used to convert each word index into a fixed-length (50) vector
model.add(Embedding(vocab_len, 50, input_length=X.shape[1]))
# The first LSTM returns its output at every time step (return_sequences=True)
# because the second LSTM needs a full sequence as input.
model.add(LSTM(115, return_sequences=True))
# The second LSTM returns only its last output.
model.add(LSTM(115))
model.add(Dropout(0.2))  # 20% dropout
# Dense layer is the regular fully connected neural network layer.
model.add(Dense(50, activation='relu'))
# One output unit per vocabulary index; softmax turns them into probabilities.
model.add(Dense(vocab_len, activation='softmax'))
# Display the structure of the model.  summary() prints the table itself and
# returns None, so wrapping it in print() would also print a stray "None".
model.summary()

In [None]:
# Configure training: cross-entropy over the one-hot targets, Adam optimizer,
# and accuracy reported as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop early once the training accuracy has not improved for 5 epochs in a row.
lstm_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',
    patience=5,
)

# Train for at most 70 epochs with mini-batches of 50 sequences.
model.fit(
    X,
    y,
    batch_size=50,
    epochs=70,
    callbacks=[lstm_early_stopping],
)

In this part we built our model using the LSTM architecture.\
We used the following steps:
1. Added embedding layer, it is very important to determine the vocabulary size and input sequences length. It takes
    1. input_dim = length of vocabulary 
    2. output_dim = Dimension of the dense embedding
    3. input_length = Length of input sequences
2. Added LSTM(Long Short-Term) layer. It takes
    1. units = 115 number of units that means dimensionality of the output space
    2. return_sequences = True, so the layer returns its output for every time step (the full sequence), which the next LSTM layer needs as input
3. Added LSTM(Long Short-Term) layer. It takes
    1. units = 115 number of units that means dimensionality of the output space
4. Added Dropout layer with rate equal to 20 % that it used to prevent overfitting
5. Added Dense layer 
    1. 50 units that refer to the dimensionality of the output space
    2. ReLU activation function
6.  Added Dense layer 
    1. Vocabulary length as units that refer to the dimensionality of the output space
    2. Softmax activation function
then print the summary of the model, compile (Adam optimizer), and fit the model with batch size equal to 50, epochs = 70, and early stopping with a patience of 5 to prevent overfitting

## GRU

In [None]:
# GRU language model: embedding -> one GRU layer -> dropout ->
# dense ReLU layer -> softmax over the vocabulary.
model1 = Sequential()
# Embedding layer used to convert each word index into a fixed-length (50) vector
model1.add(Embedding(vocab_len, 50, input_length=X.shape[1]))
# GRU (Gated Recurrent Unit) is a variation on the recurrent neural network
# design, similar to the LSTM cell.  return_sequences is not set here.
model1.add(GRU(112))
model1.add(Dropout(0.2))  # 20% dropout
# Dense layer is the regular fully connected neural network layer.
model1.add(Dense(50, activation='relu'))
# One output unit per vocabulary index; softmax turns them into probabilities.
model1.add(Dense(vocab_len, activation='softmax'))
# Display the structure of the model.  summary() prints the table itself and
# returns None, so wrapping it in print() would also print a stray "None".
model1.summary()

In [None]:
# Configure training: cross-entropy over the one-hot targets, Adam optimizer,
# and accuracy reported as a metric.
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop early once the training accuracy has not improved for 5 epochs in a row.
gru_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',
    patience=5,
)

# Train for at most 100 epochs with mini-batches of 100 sequences.
model1.fit(
    X,
    y,
    batch_size=100,
    epochs=100,
    callbacks=[gru_early_stopping],
)

In this part we built our model using the GRU architecture.\
We used the following steps:
1. Added embedding layer, it is very important to determine the vocabulary size and input sequences length. It takes
    1. input_dim = length of vocabulary 
    2. output_dim = Dimension of the dense embedding
    3. input_length = Length of input sequences
2. Added GRU(Gated Recurrent Unit) layer. It takes
    1. units = 112 number of units that means dimensionality of the output space
    2. return_sequences is not set in the code above
3. Note: the code above adds only this single GRU layer. Adding a second GRU(Gated Recurrent Unit) layer with
    1. units = 112 would require return_sequences = True on the first layer, so that it passes a full sequence on
4. Added Dropout layer with rate equal to 20 % that it used to prevent overfitting
5. Added Dense layer 
    1. 50 units that refer to the dimensionality of the output space
    2. ReLU activation function
6.  Added Dense layer 
    1. Vocabulary length as units that refer to the dimensionality of the output space
    2. Softmax activation function
then print the summary of the model, compile (Adam optimizer), and fit the model with batch size equal to 100, epochs = 100, and early stopping with a patience of 5 to prevent overfitting