In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import re
import os
import json

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import layers

In [7]:

def clean_text(string: str,
               punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
               stop_words = stopwords.words('english'),
               wnl = WordNetLemmatizer()
              ):
    """
    Normalise a sentence: strip punctuation, lower-case it, drop stop words and lemmatize
    every remaining word as a verb.

    Parameters
    ----------
    string : text to clean.
    punctuations : container of characters that are removed from the text.
    stop_words : words (lower-case) that are dropped after punctuation removal.
    wnl : object exposing ``lemmatize(word, pos=...)``.

    Note that the defaults for ``stop_words`` and ``wnl`` are evaluated once, when this
    function is defined, and that punctuation is stripped *before* the stop-word filter,
    so contractions are compared against ``stop_words`` without their apostrophe.

    Returns
    -------
    The cleaned words joined by single spaces.
    """
    # Drop all punctuation in one pass instead of re-scanning the string per character.
    string = ''.join(char for char in string if char not in punctuations)

    string = string.lower()

    # A set gives constant-time membership tests; the result is the same as with the list.
    stop_word_set = set(stop_words)
    words = [word for word in string.split() if word not in stop_word_set]

    # Lemmatization is used rather than stemming because stemming distorts words too much
    # (for example 'business' is stemmed to 'busi').
    words = [wnl.lemmatize(word, pos = "v") for word in words]

    # Collapse any whitespace the lemmatizer might have introduced.
    return re.sub(r'\s+', ' ', ' '.join(words)).strip()

def create_training_data(tokenizer,
                         max_sen_len,
                         sentences_file,
                         embed_matrix_file,
                         model_folder
                        ):
    """
    Build the padded train/test token sequences from the sentences spreadsheet.

    The first column of ``sentences_file`` is cleaned with ``clean_text``; ``tokenizer`` is
    fitted on the cleaned sentences (it is modified in place) and then written to
    ``<model_folder>/tokenizer.json`` so that it can be restored together with a saved model.

    Parameters
    ----------
    tokenizer : Keras ``Tokenizer`` that will be fitted on the cleaned sentences.
    max_sen_len : length every token sequence is padded to (``pad_sequences(maxlen=...)``).
    sentences_file : path of the Excel file; its first column holds the sentences.
    embed_matrix_file : not used by this function; kept so existing callers keep working.
    model_folder : existing folder in which ``tokenizer.json`` is written.

    Returns
    -------
    (x_train, x_test) : the padded sequences split 80% / 20%.
    """
    sentences_tables = pd.read_excel(sentences_file).values

    # random.shuffle() must not be applied to a 2-D array directly: it swaps rows through
    # views of the same buffer, which duplicates some rows and loses others. Shuffle a list
    # of row indices instead and reorder the array with it.
    row_order = list(range(len(sentences_tables)))
    random.shuffle(row_order)
    sentences_tables = sentences_tables[row_order]

    clean_sentences = np.array([clean_text(sentence) for sentence in sentences_tables[:, 0]])

    tokenizer.fit_on_texts(clean_sentences)

    sequences = tokenizer.texts_to_sequences(clean_sentences)
    x = pad_sequences(sequences, maxlen = max_sen_len)

    x_train, x_test = train_test_split(x, test_size = 0.2)

    # tokenizer.to_json() is dumped as-is, i.e. the file holds whatever json.dump produces
    # for that return value; the on-disk format is intentionally left unchanged.
    with open(os.path.join(model_folder, 'tokenizer.json'), 'w') as file:
        json.dump(tokenizer.to_json(), file)

    return x_train, x_test


def get_coefs(word, *arr):
    """
    Turn the fields of one embedding line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional arguments are converted to
    floats and returned as a list.
    """
    vector = np.asarray(arr, dtype='float')
    return word, list(vector)


def create_embedding_file(tokenizer,
                          embed_file_src = r'model\glove.840B.300d.txt', 
                          embed_file_trg = r'model\model_embeddings.txt'
                         ):
    """
    Write a reduced embedding file ``embed_file_trg`` that contains only those words from
    ``embed_file_src`` which are present in ``tokenizer.word_index``.

    ``tokenizer`` must already be fitted. Words of the tokenizer that have no vector in
    ``embed_file_src`` are skipped. Each output line is the word followed by its vector
    components, separated by single spaces.

    Note: the whole source file is parsed into memory before the target file is written.
    """
    # Read inside a context manager so the source file handle is always closed.
    with open(embed_file_src, errors = 'ignore') as source:
        embeddings = dict(get_coefs(*line.split(" ")) for line in source)

    with open(embed_file_trg, 'w') as file:
        for word in tokenizer.word_index:
            word_vector = embeddings.get(word)
            if word_vector is None:
                # No pre-trained vector for this word: leave it out instead of aborting
                # with a KeyError and a half-written target file.
                continue
            line = ' '.join(np.concatenate([[word], word_vector]))
            file.write(line + '\n')


def create_embedding_matrix(tokenizer,
                            model_folder,
                            word_vec_dim,
                            embed_file_path,
                           ):
    """
    Create the embedding matrix used to initialise the model's embedding layer.

    Row ``i`` of the matrix is the vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Row 0 and the rows of words without a vector in
    ``embed_file_path`` stay all zeros. The matrix is also written to
    ``<model_folder>/embedding_matrix.csv`` with pandas' default ``to_csv`` settings.

    Parameters
    ----------
    tokenizer : fitted tokenizer providing ``word_index`` and ``word_counts``.
    model_folder : existing folder in which ``embedding_matrix.csv`` is written.
    word_vec_dim : number of components of every word vector.
    embed_file_path : text file with one ``word v1 v2 ...`` entry per line.

    Returns
    -------
    numpy array of shape ``(len(tokenizer.word_counts) + 1, word_vec_dim)``.

    Raises
    ------
    ValueError : if a vector in the file does not have ``word_vec_dim`` components
        (previously this was silently swallowed and left the row at zero).
    """
    # Read inside a context manager so the file handle is always closed.
    with open(embed_file_path, errors = 'ignore') as embed_file:
        embeddings = dict(get_coefs(*line.split(" ")) for line in embed_file)

    vocab_size = len(tokenizer.word_counts)
    embedding_matrix = np.zeros((vocab_size + 1, word_vec_dim))
    for word, index in tokenizer.word_index.items():
        if index > vocab_size:
            # Index would fall outside the matrix; stop here as the original loop did.
            break
        word_vector = embeddings.get(word)
        if word_vector is None:
            # Only a missing word is tolerated; its row simply stays zero.
            continue
        embedding_matrix[index] = word_vector

    pd.DataFrame(embedding_matrix).to_csv(os.path.join(model_folder, 'embedding_matrix.csv'))
    return embedding_matrix

In [8]:
# Shared configuration used by the cells below.
tokenizer = Tokenizer()
max_sen_len = 20      # maxlen passed to pad_sequences
word_vec_dim = 300    # number of components per word vector
model_folder = 'model'

# Paths are built with os.path.join so the notebook also runs on systems where '\' is not
# a path separator; on Windows the resulting strings are the same as before.
sentences_file = os.path.join('data', 'sentences_tables.xlsx')
embed_matrix_file = os.path.join(model_folder, 'embedding_matrix.csv')
embed_file_path = os.path.join(model_folder, 'model_embeddings.txt')

In [9]:
# Fit the tokenizer on the cleaned sentences and build the padded train/test sequences.
x_train, x_test = create_training_data(
    tokenizer=tokenizer,
    max_sen_len=max_sen_len,
    sentences_file=sentences_file,
    embed_matrix_file=embed_matrix_file,
    model_folder=model_folder,
)

In [10]:
# Display the padded training sequences returned by create_training_data.
x_train

array([[ 0,  0,  0, ...,  0,  1, 18],
       [ 0,  0,  0, ...,  3, 27, 13],
       [ 0,  0,  0, ...,  0,  0,  8],
       ...,
       [ 0,  0,  0, ...,  6,  7,  9],
       [ 0,  0,  0, ..., 14,  1, 13],
       [ 0,  0,  0, ..., 27, 23, 13]])

In [11]:
# Build the embedding matrix for the fitted tokenizer (also written to model_folder as CSV).
embed_matrix = create_embedding_matrix(
    tokenizer=tokenizer,
    model_folder=model_folder,
    word_vec_dim=word_vec_dim,
    embed_file_path=embed_file_path,
)

In [12]:
# Display the embedding matrix; row 0 is the all-zero row that no word index maps to.
embed_matrix

array([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [-0.50318  ,  0.27905  , -0.045497 , ...,  0.4781   ,  0.13005  ,
        -0.014399 ],
       [ 0.16082  ,  0.11008  , -0.28129  , ...,  0.12435  , -0.27433  ,
         0.65513  ],
       ...,
       [ 0.37492  , -0.052425 , -0.60094  , ..., -0.36104  , -0.065253 ,
        -0.1206   ],
       [-0.21291  ,  0.22692  , -0.0038332, ..., -0.1319   , -0.37649  ,
         0.26359  ],
       [ 0.0070485,  0.89922  ,  0.26305  , ...,  0.13479  , -0.45046  ,
        -0.38282  ]])

In [13]:
class Encoder(Model):
    """
    LSTM encoder on top of an embedding layer initialised from a pre-computed matrix.

    Parameters
    ----------
    vocab_size : accepted for interface compatibility but not used; the embedding's
        ``input_dim`` is taken from ``embed_matrix.shape[0]``.
    embedding_dim : output dimension of the embedding layer; expected to match the
        number of columns of ``embed_matrix``.
    units : number of LSTM units (size of the hidden and cell state).
    batch_size : batch size used by ``initialize_hidden_state``.
    embed_matrix : 2-D array used as the initial embedding weights.
    """
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 units,
                 batch_size,
                 embed_matrix
                ):
        super().__init__()
        self.units = units
        self.batch_size = batch_size
        self.embedding = layers.Embedding(
            input_dim = embed_matrix.shape[0],
            output_dim = embedding_dim,
            embeddings_initializer = tf.keras.initializers.Constant(embed_matrix)
        )
        # return_state = True makes the layer return (output, state_h, state_c).
        self.lstm = layers.LSTM(
            units = self.units,
            return_state = True
        )

    def call(self, x, hidden):
        """
        Encode token indices.

        ``x`` is either a single sequence of shape ``(seq_len,)`` or a batch of shape
        ``(batch, seq_len)``; ``hidden`` is the initial ``[state_h, state_c]`` pair.
        Returns ``(output, state_h, state_c)`` from the LSTM.
        """
        x = self.embedding(x)
        if len(x.shape) == 2:
            # Unbatched input: (seq_len, dim) -> (1, seq_len, dim), as the LSTM expects a
            # batch axis. Already-batched input is passed through unchanged.
            x = tf.expand_dims(x, axis = 0)
        output, state_h, state_c = self.lstm(x, initial_state = hidden)
        return output, state_h, state_c

    def initialize_hidden_state(self):
        """Return zero-filled ``(state_h, state_c)``, each of shape ``(batch_size, units)``."""
        state_h = tf.zeros((self.batch_size, self.units))
        state_c = tf.zeros((self.batch_size, self.units))
        return state_h, state_c

In [14]:
# Sizes are derived from the embedding matrix and the shared configuration instead of
# being repeated as literals, so they cannot drift apart from the data.
encoder = Encoder(vocab_size = embed_matrix.shape[0], 
                  embedding_dim = word_vec_dim, 
                  units = 100, 
                  batch_size = 1, 
                  embed_matrix = embed_matrix
                 )

In [15]:
# Smoke test: encode one unbatched sequence of three token indices, starting from
# zero-filled hidden and cell states.
x = np.array([1,2,3])
h, c = encoder.initialize_hidden_state()
# The cell output is the (output, state_h, state_c) tuple returned by Encoder.call.
encoder(x, [h, c])

(<tf.Tensor: shape=(1, 100), dtype=float32, numpy=
 array([[-0.02798761, -0.03059502, -0.02835076, -0.00086967,  0.00740897,
         -0.00867459, -0.31799874,  0.20312451, -0.08919083, -0.10420913,
         -0.10220382,  0.0100808 , -0.10102621,  0.0306694 ,  0.04323321,
          0.09270649, -0.03707276, -0.27187118, -0.02982667,  0.12752038,
          0.14710037,  0.10568916, -0.02166574, -0.10390468,  0.08863961,
          0.05950929, -0.09506547,  0.02575839, -0.03920403,  0.1496663 ,
         -0.08147945,  0.06888774, -0.06983711, -0.08854045,  0.08425502,
          0.04953358,  0.24446163,  0.15371579, -0.01298809,  0.0021179 ,
         -0.08391824,  0.08806036,  0.22495514,  0.03968307, -0.10380773,
         -0.04154506, -0.20224857, -0.03089944,  0.10567556,  0.09031706,
          0.06519564, -0.06983673, -0.0292542 ,  0.004426  ,  0.15319899,
         -0.1249022 ,  0.16098052,  0.20253429,  0.01155627, -0.14017662,
         -0.17347677, -0.16192678, -0.12900086, -0.00311073, 