In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import re
import os
import json
import time

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import layers

In [2]:

def clean_text(string: str, 
               punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
               stop_words = stopwords.words('english'),
               # porter = PorterStemmer()
               wnl = WordNetLemmatizer()
              ):
    """
    A method to clean text. It removes punctuations, stop words, applies lemmatization.
    """
    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # stemming/lemmatizing words. That means changing word to its basic format, for example
    # words 'fishing', 'fished', 'fischer' will be changed into a word 'fisch'
    # lemmatization should be better because stemming changes words too much, for example
    # business is changed into busi
    # string = ' '.join([porter.stem(word) for word in string.split()])
    string = ' '.join([wnl.lemmatize(word, pos = "v") for word in string.split()])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string

def create_training_data(tokenizer,
                         sentences_file,
                         embed_matrix_file,
                         model_folder,
                         max_sen_len = None
                        ):
    """
    Creating a training and testing datasets self.x_train, self.x_test, self.y_train, self.y_test. This function
    also creates and saves a tokenizer and a list of all unique tables names all_unique_values because when we load
    a ready model those values are needed for the 'predict' function.
    """
    sentences_tables = pd.read_excel(sentences_file).values
    random.shuffle(sentences_tables)
    clean_sentences = np.array([clean_text(sentence) for sentence in sentences_tables[:, 0]])

    tokenizer.fit_on_texts(clean_sentences)

    sequences = tokenizer.texts_to_sequences(clean_sentences)
    if max_sen_len == None:
        max_sen_len = np.max([len(seq) for seq in sequences])
    x = pad_sequences(sequences, maxlen = max_sen_len)

    embed_matrix = pd.read_csv(embed_matrix_file).values

    x_train, x_test = train_test_split(x, test_size = 0.2)

    with open(os.path.join(model_folder, 'tokenizer.json'), 'w') as file:
        json.dump(tokenizer.to_json(), file)
        
    return x_train, x_test


def get_coefs(word, *arr): 
    return word, list(np.asarray(arr, dtype='float'))


def create_embedding_file(tokenizer,
                          embed_file_src = r'model\glove.840B.300d.txt', 
                          embed_file_trg = r'model\model_embeddings.txt'
                         ):
    """
    This function will create an embedding file called embed_file_trg which will contain only those words 
    from embed_file_src which are present in the training dataset (tokenizer.word_index).
    """

    embeddings = dict(get_coefs(*o.split(" ")) for o in open(embed_file_src, errors = 'ignore'))
    with open(embed_file_trg, 'w') as file:
        for word, index in tokenizer.word_index.items():
            word_vector = embeddings[word]
            line = ' '.join(np.concatenate([[word], word_vector]))
            file.write(line + '\n')


def create_embedding_matrix(tokenizer,
                            model_folder,
                            word_vec_dim,
                            embed_file_path,
                           ):
    """
    A function to create an embedding matrix. This is a matrix where each row is a vector representing a word.
    To create that matrix we use a word embedding file which path is equal to embedding_file_path.
    embedding_matrix[row_number] is a vector representation for a word = list(tokenizer.word_index.keys())[row_number - 1]
    First row of embedding_matrix are zeros. This matrix is needed to train a model.
    """
    embeddings = dict(get_coefs(*o.split(" ")) for o in open(embed_file_path, errors = 'ignore'))

    # embedding_matrix[row_number] is a vector representation of a word = self.tokenizer.word_index.keys()[row_number - 1]
    # first row in embedding_matrix is 0
    embedding_matrix = np.zeros((len(tokenizer.word_counts) + 1, word_vec_dim))
    for word, index in tokenizer.word_index.items():
        if index > len(tokenizer.word_counts):
            break
        else:
            try:
                embedding_matrix[index] = embeddings[word]
            except:
                continue

    pd.DataFrame(embedding_matrix).to_csv(os.path.join(model_folder, 'embedding_matrix.csv'))
    return embedding_matrix

In [3]:
tokenizer = Tokenizer()
# max_sen_len = 20
sentences_file = r'data\sentences_tables.xlsx'
embed_matrix_file = r'model\embedding_matrix.csv'
model_folder = 'model'
word_vec_dim = 300
embed_file_path = r'model\model_embeddings.txt'

In [4]:
x_train, x_test = create_training_data(
    tokenizer = tokenizer, 
    # max_sen_len = max_sen_len,
    sentences_file = sentences_file,
    embed_matrix_file = embed_matrix_file,
    model_folder = model_folder
)

In [5]:
x_train

array([[ 0,  0,  0, ...,  5, 13,  6],
       [ 0,  0,  0, ...,  5,  6,  8],
       [ 0,  0,  0, ...,  5,  6, 13],
       ...,
       [ 0,  0,  0, ...,  2, 21, 10],
       [ 0,  0,  0, ...,  1, 26, 11],
       [ 0,  0,  0, ...,  3, 27, 12]])

In [6]:
embed_matrix = create_embedding_matrix(
    tokenizer = tokenizer,
    model_folder = model_folder,
    word_vec_dim = word_vec_dim,
    embed_file_path = embed_file_path
)

In [7]:
embed_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.50318 ,  0.27905 , -0.045497, ...,  0.4781  ,  0.13005 ,
        -0.014399],
       [-0.89423 ,  0.39636 ,  0.64359 , ..., -0.15076 ,  0.06987 ,
         0.041258],
       ...,
       [ 0.37492 , -0.052425, -0.60094 , ..., -0.36104 , -0.065253,
        -0.1206  ],
       [ 0.012832,  0.22669 , -0.17511 , ...,  0.17134 ,  0.040047,
        -0.37131 ],
       [-0.39054 , -0.55117 , -0.073466, ...,  0.34569 ,  0.30918 ,
        -0.32873 ]])

In [8]:
for i, item in enumerate(tokenizer.word_index.items()):
    print(item)
    if i == 5:
        break

('employee', 1)
('cost', 2)
('user', 3)
('office', 4)
('business', 5)
('unit', 6)


In [9]:
class Encoder(Model):
    def __init__(self,
                 embedding_dim,
                 lstm_out_size,
                 batch_size,
                 embed_matrix
                ):
        super().__init__()
        self.lstm_out_size = lstm_out_size
        self.batch_size = batch_size
        self.embedding = layers.Embedding(
            input_dim = embed_matrix.shape[0],
            output_dim = embedding_dim,
            embeddings_initializer = tf.keras.initializers.Constant(embed_matrix),
            trainable = False
        )
        self.lstm = layers.LSTM(
            units = self.lstm_out_size,
            return_sequences = True,
            return_state = True
        )
        
    
    def call(self, x, state_h = None, state_c = None):
        # x.shape = (batch_size, max_sen_len)
        # x is a series of numbers which represent words
        # state_h.shape = (batch_size, lstm_out_size)
        
        if state_h == None or state_c == None:
            state_h, state_c = self.initialize_hidden_state()
        
        # make sure that the types are correct
        x = tf.cast(x, tf.float32)
        state_h = tf.cast(state_h, tf.float32)
        state_c = tf.cast(state_c, tf.float32)
        
        x = self.embedding(x)
        # x.shape after embedding = (batch_size, max_sen_len, embedding_dim)
        # output.shape = (batch_size, max_sen_len, lstm_out_size)
        # state_h.shape = (batch_size, lstm_out_size)
        output, state_h, state_c = self.lstm(x, initial_state = [state_h, state_c])
        return output, state_h, state_c
    
    def initialize_hidden_state(self):
        state_h = tf.zeros((self.batch_size, self.lstm_out_size))
        state_c = tf.zeros((self.batch_size, self.lstm_out_size))
        return state_h, state_c

In [34]:
encoder = Encoder(embedding_dim = 300,
                 lstm_out_size = 10,
                 batch_size = 2,
                 embed_matrix = embed_matrix
                 )

x = np.array([[1, 2], [1, 2]])

with tf.GradientTape() as tape:
    output, state_h, state_c = encoder(x)
    
variables = encoder.trainable_variables
gradients = tape.gradient(output, variables)
gradients

[<tf.Tensor: shape=(300, 40), dtype=float32, numpy=
 array([[-0.13181046,  0.17578174, -0.2120846 , ..., -0.13128494,
         -0.16249536, -0.21907593],
        [ 0.06678414, -0.09048228,  0.1054726 , ...,  0.06102577,
          0.07817639,  0.10394341],
        [ 0.03403007, -0.03505441,  0.06919112, ...,  0.0738593 ,
          0.07218578,  0.10790002],
        ...,
        [ 0.06178866, -0.0966633 ,  0.07948293, ...,  0.00635299,
          0.03442178,  0.03179796],
        [ 0.02383993, -0.03409163,  0.03514545, ...,  0.01484937,
          0.02266026,  0.02819575],
        [ 0.00046793,  0.00032897,  0.002085  , ...,  0.00415377,
          0.00336663,  0.00551508]], dtype=float32)>,
 <tf.Tensor: shape=(10, 40), dtype=float32, numpy=
 array([[ 0.00786627, -0.00872229,  0.01512839,  0.00888416, -0.01922235,
          0.00786126, -0.00542759,  0.00856517,  0.00329846,  0.01680919,
          0.00409558, -0.00518857,  0.00704846,  0.0061653 , -0.00591288,
          0.01023573,  0.0025303

In [9]:
# version 1 like in Jonathan Hui pdf
class Bahdau_attention(layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)
        
    def call(self, decoder_hidden, encoder_hidden):
        # decoder_hidden.shape = (batch_size, hidden_size)
        # decoder_hidden_time_axis.shape = (batch_size, 1, hidden_size)
        decoder_hidden_time_axis = tf.expand_dims(decoder_hidden, 1)
        
        # encoder_hidden.shape = (batch_size, max_sen_len, hidden_size)
        # argument for tanh shape = (batch_size, max_sen_len, hidden_size)
        # score.shape = (batch_size, max_sen_len, 1)
        score = self.V(tf.nn.tanh(self.W1(decoder_hidden_time_axis) + self.W2(encoder_hidden)))
        
        # attention_weights.shape = (batch_size, max_sen_len, 1)
        attention_weights = tf.nn.softmax(score, axis = 1)
        
        # context_vector.shape = (batch_size, hidden_size)
        context_vector = attention_weights * encoder_hidden
        context_vector = tf.reduce_sum(context_vector, axis = 1)
        
        return context_vector, attention_weights

In [117]:
# version 2, like in attention explanation pdf
class Bahdau_attention(layers.Layer):
    def __init__(self):
        super().__init__()
        self.dense = layers.Dense(1)
        
    def call(self, decoder_state_h, encoder_states_h):
        # decoder_state_h.shape = (batch_size, dec_state_size)
        # encoder_states_h.shape = (batch_size, max_sen_len, enc_state_size)
        
        # make sure that the dtypes are correct
        decoder_state_h = tf.cast(decoder_state_h, tf.float32)
        encoder_states_h = tf.cast(encoder_states_h, tf.float32)
        
        # encoder_states_h_flattened.shape = (max_sen_len, enc_state_size)
        encoder_states_h_flattened = tf.reshape(encoder_states_h, [-1, tf.shape(encoder_states_h)[2]])
        batch_size, _, enc_state_size = tf.shape(encoder_states_h)
        
        max_sen_len = encoder_states_h.shape[1]
        for i in range(batch_size):
            attention_weights = tf.constant([])
            for j in range(max_sen_len):
                e = tf.constant([])
                for k in range(max_sen_len):
                    x = tf.concat([decoder_state_h[i], encoder_states_h_flattened[i + k]], 0)
                    # x.shape = (dec_state_size + enc_state_size)
                    x = tf.expand_dims(x, 0)
                    # x.shape = (1, dec_state_size + enc_state_size)
                    e = tf.experimental.numpy.append(e, tf.math.exp(self.dense(x)))

                new_attention_weight = tf.math.divide(e[j], tf.math.reduce_sum(e))
                attention_weights = tf.experimental.numpy.append(attention_weights, new_attention_weight)
                
            # context_vector.shape = (batch_size, enc_state_size)
            new_context_vector_row = tf.reduce_sum([attention_weights[j] * encoder_states_h_flattened[i + j] for j in range(max_sen_len)], axis = 0)
            new_context_vector_row = tf.expand_dims(new_context_vector_row, 0)
            if i == 0:
                context_vector = new_context_vector_row
            else:
                context_vector = tf.concat([context_vector, new_context_vector_row], axis = 0)
        
        return tf.cast(context_vector, tf.float32)

In [111]:
attention = Bahdau_attention()
decoder_state_h = tf.constant([[1,2,3], [4,5,6]])
encoder_states_h = tf.constant([[[1,2], [4,5]], [[1,2], [4,5]]])
encoder_states_h_flattened = tf.reshape(encoder_states_h, [-1, tf.shape(encoder_states_h)[2]])

with tf.GradientTape() as tape:
    context_vector = attention(decoder_state_h, encoder_states_h)
    
variables = attention.trainable_variables
gradients = tape.gradient(context_vector, variables)
gradients

[<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
 array([[ 5.4463744e-06],
        [ 6.8396330e-06],
        [ 8.2999468e-06],
        [-4.8494557e-01],
        [-4.8494416e-01]], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.4230609e-06], dtype=float32)>]

In [105]:
context_vector

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[4.872717 , 6.8218036],
       [4.8727164, 6.821803 ]], dtype=float32)>

In [120]:
class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, lstm_out_size, embed_matrix):
        super().__init__()
        self.lstm_out_size = lstm_out_size
        self.embedding = layers.Embedding(
            input_dim = embed_matrix.shape[0],
            output_dim = embedding_dim,
            embeddings_initializer = tf.keras.initializers.Constant(embed_matrix),
            trainable = False
        )
        self.lstm = layers.LSTM(
            units = self.lstm_out_size,
            # return_sequences = True,
            return_state = True
        )
        self.dense = layers.Dense(vocab_size)
        self.attention = Bahdau_attention()
        
    def call(self, x, decoder_state_h, decoder_state_c, encoder_states_h):
        # x.shape = (batch_size, 1)
        # x is a single number for each batch representing a single word
        # encoder_states_h.shape = (batch_size, max_sen_len, enc_state_size)
        # decoder_state_h.shape = (batch_size, lstm_out_size)
        
        # make sure that the types are correct
        x = tf.cast(x, tf.float32)
        decoder_state_h = tf.cast(decoder_state_h, tf.float32)
        decoder_state_c = tf.cast(decoder_state_c, tf.float32)
        encoder_states_h = tf.cast(encoder_states_h, tf.float32)
        
        # context_vector.shape = (batch_size, enc_state_size)
        context_vector = self.attention(decoder_state_h, encoder_states_h)
        # shape of output of embedding layer = (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x.shape after concatenation = (batch_size, 1, enc_state_size + embedding_dim)
        # print('context_vector: ', tf.expand_dims(context_vector, 1))
        # print('x: ', x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis = 2)
        
        output, state_h, state_c = self.lstm(x, initial_state = [decoder_state_h, decoder_state_c])
        
        # output.shape = (batch_size, vocab_size)
        output = self.dense(output)
        
        return output, state_h, state_c

In [119]:
decoder = Decoder(vocab_size = 100, 
                  embedding_dim = 300, 
                  lstm_out_size = 20, 
                  embed_matrix = embed_matrix
                 )

x = tf.constant([[1], [2]])
decoder_state_h = tf.constant([[i for i in range(20)], [i for i in range(20)]])
decoder_state_c = tf.constant([[i for i in range(20)], [i for i in range(20)]])
encoder_states_h = tf.constant([[[i for i in range(15)], [i for i in range(15)]], [[i for i in range(15)], [i for i in range(15)]]])

output, state_h, state_c = decoder(x, decoder_state_h, decoder_state_c, encoder_states_h)

context_vector:  tf.Tensor(
[[[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]]

 [[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]]], shape=(2, 1, 15), dtype=float32)
x:  tf.Tensor(
[[[-5.0318e-01  2.7905e-01 -4.5497e-02  8.7287e-02  8.2939e-03
    8.1533e-02 -2.0737e-01 -6.5505e-01  1.3109e-01  3.1086e+00
   -6.5408e-01 -5.5146e-02 -5.9301e-01  2.5131e-01  1.4342e-01
    7.9765e-02 -1.6456e-01  1.0091e+00 -3.7291e-01 -7.6170e-02
    8.7821e-02  1.4089e-01 -7.8971e-02 -3.5242e-01  6.3374e-01
    2.0048e-01 -4.1246e-01  2.9199e-02  5.7455e-01 -2.2509e-01
   -5.6851e-02 -5.9095e-01 -7.1849e-02  1.1489e-01 -2.7550e-01
    1.1122e-01  1.5333e-01  8.6889e-02 -5.3122e-01 -2.6919e-01
    4.1412e-01 -3.8239e-02  3.6797e-01  3.5596e-01 -1.4596e-01
   -2.4261e-01 -4.5774e-01 -8.9788e-02  3.1969e-01  4.3052e-01
    5.7857e-03 -6.8859e-01  2.3270e-01  4.1043e-02 -4.0725e-01
    1.1305e-01  2.8135e-01  3.4865e-01 -9.3155e-02 -2.0771e-01
    3.3811e-01 -5.7351e-01  3.9391e

In [121]:
def train_step(inp, 
               targ, 
               # enc_state_h, 
               # enc_state_c, 
               batch_size, 
               encoder, 
               decoder, 
               loss_function, 
               optimizer):
    # inp.shape = targ.shape (batch_size, max_sen_len)
    # enc_state_h.shape = (batch_size, enc_state_size)
    
    # make sure that the types are correct
    inp = tf.cast(inp, tf.float32)
    targ = tf.cast(targ, tf.float32)
    # enc_state_h = tf.cast(enc_state_h, tf.float32)
    # enc_state_c = tf.cast(enc_state_c, tf.float32)
    
    batch_loss = 0
    
    with tf.GradientTape() as tape:
        # enc_output.shape = (batch_size, max_sen_len, enc_state_size)
        # enc_state_h.shape = (batch_size, state_size)
        # enc_output, enc_state_h, enc_state_c = encoder(inp, enc_state_h, enc_state_c)
        enc_output, enc_state_h, enc_state_c = encoder(inp)
        dec_state_h = enc_state_h
        dec_state_c = enc_state_c
        
        # dec_input.shape = (batch_size, 1)
        dec_input = tf.expand_dims([0] * batch_size, 1)
        
        for t in range(targ.shape[1]):
            prediction, dec_state_h, dec_state_c, = decoder(dec_input, dec_state_h, dec_state_c, enc_output)
            # real value passed to loss_function needs to have shape (batch_size).
            # It is a number representing a word from tokenizer.word_index. Real value = 0
            # means that there was no word
            batch_loss += loss_function(targ[:, t], prediction)
            dec_input = tf.expand_dims(targ[:, t], 1)
            
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

In [122]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction = 'none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype = loss.dtype)
    loss *= mask
    
    return tf.reduce_mean(loss)

In [40]:
# real = tf.expand_dims(x_train[:2, 0], 1)
# pred = tf.expand_dims(x_train[:2, 0], 1)

real = tf.constant([1, 0])
pred = tf.constant([[0.05, 0.95], [1, 0]])

real = tf.cast(real, tf.float32)
pred = tf.cast(pred, tf.float32)

loss_object(real, pred)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([5.1293328e-02, 1.1920928e-07], dtype=float32)>

In [123]:
epochs = 10
batch_size = 20
embedding_dim = 300

inp = tf.constant(x_train)
targ = tf.constant(x_train)

decoder = Decoder(vocab_size = len(tokenizer.word_index.keys()) + 1,
                  embedding_dim = embedding_dim,
                  lstm_out_size = 100,
                  embed_matrix = embed_matrix
                 )

encoder = Encoder(embedding_dim = embedding_dim,
                 lstm_out_size = 100,
                 batch_size = batch_size,
                 embed_matrix = embed_matrix
                 )

In [124]:
stime = time.time()
for epoch in range(epochs):
    total_loss = 0
    for batch_number in range(len(inp) // batch_size):
        inp_batch = inp[batch_number * batch_size : (batch_number + 1) * batch_size, :]
        targ_batch = targ[batch_number * batch_size : (batch_number + 1) * batch_size, :]
        
        batch_loss = train_step(inp = inp_batch, 
                               targ = targ_batch,
                               batch_size = batch_size,
                               encoder = encoder, 
                               decoder = decoder, 
                               loss_function = loss_function, 
                               optimizer = optimizer
                              )
        total_loss += batch_loss
        
        print(f'Batch number: {batch_number}, Loss: {batch_loss / batch_size}, Time per batch: {(time.time() - stime) / (batch_number + 1)}')
        
    print(f'\nEpoch: {epoch}, Loss: {total_loss / ((batch_number + 1) * batch_size)}, Time per epoch: {(time.time() - stime) / (epoch + 1)}')

Batch number: 0, Loss: 2.7100887298583984
Time per batch: 175.22068905830383
Batch number: 1, Loss: 2.647005796432495
Time per batch: 182.91211879253387
Batch number: 2, Loss: 2.696329116821289
Time per batch: 186.9434502919515


KeyboardInterrupt: 