## NLP Model for Polynomial Expansion

In [None]:
import tensorflow as tf 
import keras
import numpy as np
import pandas as pd
import re
import math
from typing import Tuple
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Concatenate, Embedding, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from google.colab import drive

In [None]:
# Record the interpreter and library versions this notebook was run with.
# The first line is an IPython shell escape that runs the system `python` binary.
!python --version
print(tf.__version__)
print(np.__version__)
print(pd.__version__)
print(re.__version__)
print(keras.__version__)

Python 3.7.13
2.8.2
1.21.6
1.3.5
2.2.1


In [None]:
# Mount Google Drive in the Colab runtime; the training file is read from under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the training file (one `factor=expansion` pair per line) on the mounted drive.
path_train = '/content/drive/MyDrive/Scale_AI/train.txt'

In [None]:
# Load data and eliminate duplicates

def load_file(file_path: str) -> Tuple[Tuple[str], Tuple[str]]:
    """Read ``factor=expansion`` pairs from a text file, dropping duplicates.

    Every line is stripped of surrounding whitespace; blank lines are ignored.
    Duplicate lines are removed while keeping the order of first appearance,
    so the result is identical on every run.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Two parallel tuples ``(factors, expansions)``. Both are empty when the
        file contains no usable line.

    Raises:
        ValueError: If a non-blank line contains no ``=`` separator.
    """
    # The context manager guarantees the file handle is closed.
    with open(file_path, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        # dict.fromkeys de-duplicates and preserves insertion order
        # (the iteration order of a set of strings changes between runs).
        unique_lines = list(dict.fromkeys(line for line in stripped_lines if line))

    if not unique_lines:
        return (), ()

    factors, expansions = zip(*(line.split("=", 1) for line in unique_lines))
    return factors, expansions

In [None]:
# From EDA

# Token pattern: one digit, one lowercase letter, a run of asterisks
# ('*' or '**'), a parenthesis, '+' or '-'.
# Written as a raw string so the regex escapes are not parsed as
# (invalid) string escape sequences; the pattern value is unchanged.
VOCAB = r'[0-9]|[a-z]|\*+|\(|\)|\+|\-'
# Maximum sequence length, in tokens.
MAX_LENGTH = 29

In [None]:
# Text cleaning and tokenization

# Load the de-duplicated pairs, then split every expression into VOCAB tokens.
factors, expansions = load_file(path_train)

factors_prep = [re.findall(VOCAB, factor.strip()) for factor in factors]
expansions_prep = [re.findall(VOCAB, expansion.strip()) for expansion in expansions]
 

In [None]:
# Show one raw factor next to its token list.
print(factors[31], factors_prep[31], sep='\n')

(18-6*i)*(i+19)
['(', '1', '8', '-', '6', '*', 'i', ')', '*', '(', 'i', '+', '1', '9', ')']


In [None]:
# Make sure the maximum tokenized length does not exceed 29

# Longest token sequence on each side of the '='.
print(max(len(tokens) for tokens in factors_prep))
print(max(len(tokens) for tokens in expansions_prep))

29
27


In [None]:
# Increase max expansions length by 1 because of the bos token

# Padded length of the encoder inputs (the longest tokenized factor printed above is 29).
max_factors_length = 29
# Padded length of the decoder inputs/targets: the longest tokenized expansion (27) plus one special token.
max_expansions_length = 28 

In [None]:
# Vocabulary generation

def create_vocab(sentences):
  """Fit a Tokenizer on the tokenized sentences and return its token -> index map."""
  vocab_builder = Tokenizer(num_words=None)
  vocab_builder.fit_on_texts(sentences)
  return vocab_builder.word_index

In [None]:
# One vocabulary per side, built from the tokenized factors and expansions.
factors_vocab, expansions_vocab = map(create_vocab, (factors_prep, expansions_prep))

In [None]:
# Check factors and expansions vocabularies match

# Sizes of both vocabularies, then whether they hold exactly the same tokens.
print(
    len(factors_vocab),
    len(expansions_vocab),
    factors_vocab.keys() == expansions_vocab.keys(),
    sep='\n',
)

29
29
True


In [None]:
# Padding and inserting bos, eos tokens (to decoder inputs and target sequences, respectively)

def factors_padder(input_sentence, max_length=None):
  """Right-pad a tokenized factor expression with '<pad>' tokens.

  Args:
    input_sentence: Sequence of tokens. It is not modified; a padded copy is
      returned.
    max_length: Target length. Defaults to the notebook-level
      ``max_factors_length``.

  Returns:
    A new list of tokens followed by as many '<pad>' entries as are needed to
    reach ``max_length``. Inputs that are already that long (or longer) are
    copied unchanged, never truncated.
  """
  if max_length is None:
    max_length = max_factors_length
  missing = max(0, max_length - len(input_sentence))
  return list(input_sentence) + ['<pad>'] * missing

def expansions_padder(input_sentence, max_length=None):
  """Build the padded decoder input and target for one tokenized expansion.

  Args:
    input_sentence: Sequence of tokens of the expansion. It is not modified.
    max_length: Target length of both returned lists. Defaults to the
      notebook-level ``max_expansions_length``.

  Returns:
    A pair ``(output, target)``:
      * ``output`` is '<bos>' followed by the tokens, then '<pad>' entries;
      * ``target`` is the tokens followed by '<eos>', then '<pad>' entries.
    Both lists have the same length; inputs too long to fit are not truncated.
  """
  if max_length is None:
    max_length = max_expansions_length

  tokens = list(input_sentence)
  # One slot is taken by <bos> (input) or <eos> (target); the rest is padding.
  padding = ['<pad>'] * max(0, max_length - (len(tokens) + 1))

  output = ['<bos>'] + tokens + padding
  target = tokens + ['<eos>'] + padding
  return output, target

In [None]:
# Pad the encoder inputs.
factors_prep = [factors_padder(tokens) for tokens in factors_prep]

# For every expansion build the <bos>-prefixed decoder input and the
# <eos>-terminated target.
outputs, targets = [], []
for tokens in expansions_prep:
  decoder_input, decoder_target = expansions_padder(tokens)
  outputs.append(decoder_input)
  targets.append(decoder_target)

# NOTE: expansions_prep is rebound to the padded decoder inputs, so running
# this cell a second time would prefix <bos> again.
expansions_prep = outputs

In [None]:
# Decoder input and matching target for one sample.
print(expansions_prep[8], targets[8], sep='\n')

['<bos>', '-', '5', '4', '*', 'n', '**', '2', '+', '1', '4', '4', '*', 'n', '-', '9', '0', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['-', '5', '4', '*', 'n', '**', '2', '+', '1', '4', '4', '*', 'n', '-', '9', '0', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
# Add padding,bos,eos tags to vocabulary

# Work on a copy so expansions_vocab keeps only the data tokens.
vocabulary = dict(expansions_vocab)
# Place <bos>/<eos> right after the highest data-token index instead of
# hard-coding 30/31, which would collide with a data token if the corpus
# vocabulary grew. With the 29 data tokens found here this still yields 30 and 31.
first_special_index = max(vocabulary.values(), default=0) + 1
vocabulary['<pad>'] = 0  # index 0 is not assigned to any data token
vocabulary['<bos>'] = first_special_index
vocabulary['<eos>'] = first_special_index + 1

In [None]:
# Inspect the final token -> index mapping, special tokens included.
print(vocabulary)

{'*': 1, '2': 2, '-': 3, '**': 4, '1': 5, '+': 6, '4': 7, '6': 8, '3': 9, '0': 10, '5': 11, '8': 12, '7': 13, '9': 14, 's': 15, 'n': 16, 'i': 17, 't': 18, 'c': 19, 'a': 20, 'o': 21, 'k': 22, 'y': 23, 'z': 24, 'j': 25, 'x': 26, 'h': 27, '(': 28, ')': 29, '<pad>': 0, '<bos>': 30, '<eos>': 31}


In [None]:
# Create reverse vocabulary

# Index -> token lookup, used to turn predicted indices back into text.
reverse_vocabulary = dict(zip(vocabulary.values(), vocabulary.keys()))

In [None]:
# Convert tokenized sentences into integer sequences

def text2seq(input_sentence):
  """Return the integer index, taken from the global vocabulary, of every token in the sentence."""
  return [int(vocabulary[token]) for token in input_sentence]

In [None]:
# Integer-encode the padded encoder inputs, decoder inputs and decoder targets.
factors_sequences = np.array([text2seq(tokens) for tokens in factors_prep])
expansions_sequences = np.array([text2seq(tokens) for tokens in expansions_prep])
targets_sequences = np.array([text2seq(tokens) for tokens in targets])

In [None]:
# Print an example

# First sample: raw text followed by its integer encoding, for both sides.
print(
    factors[0],
    factors_sequences[0],
    expansions[0],
    expansions_sequences[0],
    sep='\n',
)

2*y*(2*y-15)
[ 2  1 23  1 28  2  1 23  3  5 11 29  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0]
4*y**2-30*y
[30  7  1 23  4  2  3  9 10  1 23  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]


In [None]:
# Create labels for model

def create_labels(targets_data, num_samples, MAX_LEN, VOCAB_SIZE):
  """One-hot encode integer target sequences.

  Args:
    targets_data: Token indices, one sequence per sample (a 2-D integer NumPy
      array, or any nested iterable of integers).
    num_samples: Size of the first axis of the result.
    MAX_LEN: Size of the second axis of the result (sequence positions).
    VOCAB_SIZE: Size of the third axis of the result (one slot per index).

  Returns:
    A float32 array of shape ``(num_samples, MAX_LEN, VOCAB_SIZE)`` holding 1.0
    at ``[i, j, targets_data[i][j]]`` and 0.0 everywhere else. The shape is
    also printed.
  """
  decoder_output_data = np.zeros((num_samples, MAX_LEN, VOCAB_SIZE), dtype="float32")

  rectangular = (
      isinstance(targets_data, np.ndarray)
      and targets_data.ndim == 2
      and targets_data.dtype.kind in "iu"
  )
  if rectangular:
    # Set every 1.0 with a single fancy-index assignment instead of an
    # element-by-element Python loop.
    sample_idx = np.arange(targets_data.shape[0])[:, None]
    step_idx = np.arange(targets_data.shape[1])[None, :]
    decoder_output_data[sample_idx, step_idx, targets_data] = 1.
  else:
    # Fallback for nested lists / sequences of unequal length.
    for i, seqs in enumerate(targets_data):
      for j, seq in enumerate(seqs):
        decoder_output_data[i, j, seq] = 1.

  print(decoder_output_data.shape)
  return decoder_output_data

# One-hot targets for the decoder: shape (samples, max_expansions_length, vocabulary size).
labels = create_labels(targets_sequences, len(targets_sequences), max_expansions_length, len(vocabulary))

(732171, 28, 32)


In [None]:
# Variables for the model

num_words = len(vocabulary)  # vocabulary size, special tokens included
hidden_dim = 100  # output dimension of both embedding layers
lstm_dim = 256  # units per direction of the encoder LSTM; the decoder LSTM uses 2*lstm_dim
dropout_rate = 0.3  # `dropout` argument of the encoder and decoder LSTMs
lr = 0.005  # Adam learning rate
beta1 = 0.9  # Adam beta_1
beta2 = 0.999  # Adam beta_2
lr_decay = 0.01  # passed to Adam as `decay`
batch_size = 128  # batch size passed to model.fit
num_epochs = 100  # number of epochs passed to model.fit

In [None]:
# MODEL 

# Embedding layers
# Created at module level: every call to Polynomial_Seq2Seq() reuses these two layers.
# NOTE(review): mask_zero is not set, so '<pad>' (index 0) positions do not appear to be masked — confirm this is intended.
ENC_EMB = Embedding(num_words, hidden_dim)
DEC_EMB = Embedding(num_words, hidden_dim)

def Polynomial_Seq2Seq():
  """Build the training model plus the encoder/decoder models used for inference.

  The encoder is an embedding followed by a bidirectional LSTM whose forward
  and backward final states are concatenated and used as the initial state of
  the decoder LSTM (which therefore has 2*lstm_dim units). The decoder output
  at every time step goes through two Dense layers ending in a softmax over
  num_words.

  The inference encoder and decoder are built from the same layer objects as
  the training model, so they share its weights.

  Returns:
    (model, encoder, decoder):
      model   -- maps [encoder input, decoder input] to per-step token probabilities;
      encoder -- maps an encoder input to the decoder's initial [h, c] states;
      decoder -- maps [one decoder token, h, c] to [token probabilities, new h, new c].
  """
  
  #Training Encoder
  enc_inputs = Input(shape=(max_factors_length,),dtype=np.int64)
  enc_embedding = ENC_EMB(enc_inputs)
  enc_lstm = Bidirectional(LSTM(lstm_dim, activation='tanh', dropout=dropout_rate, return_state=True),merge_mode='concat')
  # With return_state=True the wrapper returns the output followed by the
  # forward (h, c) and backward (h, c) states.
  encoder_outputs, h_forw, c_forw, h_back, c_back = enc_lstm(enc_embedding)
  # Concatenate both directions so the state size matches the decoder LSTM (2*lstm_dim).
  h = Concatenate()([h_forw, h_back])
  c = Concatenate()([c_forw, c_back])
  encoder_states = [h,c]

  #Training Decoder
  dec_inputs = Input(shape=(max_expansions_length,),dtype=np.int64)
  dec_embedding = DEC_EMB(dec_inputs)
  dec_lstm = LSTM(2*lstm_dim,activation='tanh', dropout=dropout_rate, return_sequences=True, return_state=True)
  # The states returned during training are discarded; only the per-step outputs are used.
  decoder_outputs, _ , _ = dec_lstm(dec_embedding, initial_state=encoder_states)

  #Dense (to map the LSTM outputs to the words)
  dense_1 = Dense(256, activation='tanh')
  dense_2 = Dense(num_words, activation='softmax')
  outputs = dense_1(decoder_outputs)
  outputs = dense_2(outputs)

  #Training Model
  model = Model([enc_inputs, dec_inputs], outputs, name='Training_Seq2Seq')

  #Inference Encoder
  encoder = Model(enc_inputs, encoder_states, name='Inference_Encoder')

  #Inference Decoder
  # Takes a single token plus the previous LSTM states and returns the next-token
  # distribution together with the updated states.
  inf_dec_input = Input(shape=(1,))
  decoder_state_input_h = Input(shape=(2*lstm_dim,))
  decoder_state_input_c = Input(shape=(2*lstm_dim,))
  decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
  decoder_input_emb = DEC_EMB(inf_dec_input)
  decoder_outputs, h, c = dec_lstm(decoder_input_emb, initial_state=decoder_states_inputs)
  decoder_states = [h, c]
  decoder_outputs = dense_1(decoder_outputs)
  decoder_outputs = dense_2(decoder_outputs)
  decoder = Model([inf_dec_input] + decoder_states_inputs, [decoder_outputs] + decoder_states, name='Inference_Decoder')

  return model, encoder, decoder

In [None]:
# Build the three models; only the training model is compiled.
model, encoder, decoder = Polynomial_Seq2Seq()
# Adam configured from the hyper-parameter cell; lr_decay is passed as `decay`.
opt = tf.keras.optimizers.Adam(learning_rate= lr, beta_1= beta1, beta_2= beta2, decay= lr_decay)
# categorical_crossentropy matches the one-hot `labels` array built by create_labels.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "Training_Seq2Seq"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 29)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 29, 100)      3200        ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 28)]         0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 512),        731136      ['embedding[0][0]']              
                                 (None, 256),                                      

In [None]:
# Save model architecture

# Write the layer summaries of all three models into one text file.
with open('network.txt', 'w') as summary_file:

    def write_line(line):
        summary_file.write(line + '\n')

    for network in (model, encoder, decoder):
        network.summary(print_fn=write_line)

In [None]:
# Training

# Train with teacher forcing: the decoder input is the <bos>-prefixed expansion and
# the label is the one-hot, <eos>-terminated expansion.
# validation_split=0.2 holds out 20% of the samples for validation.
results = model.fit(
    [factors_sequences, expansions_sequences],
    labels,
    batch_size = batch_size,
    epochs = num_epochs,
    validation_split = 0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Print accuracies over epochs

# Keep only the accuracy columns of the training history (one row per epoch).
history_frame = pd.DataFrame(model.history.history)
x = [column for column in history_frame.columns if 'accuracy' in column]
df = history_frame[x]
df

Unnamed: 0,accuracy,val_accuracy
0,0.858567,0.914899
1,0.922184,0.943011
2,0.937930,0.953807
3,0.946654,0.961905
4,0.952204,0.964958
...,...,...
95,0.980802,0.987416
96,0.980873,0.987440
97,0.980841,0.987457
98,0.980928,0.987472


In [None]:
# Save model weights

# Store the weights of each model in its own HDF5 file.
for network, weights_path in (
    (model, 'seq2seq_weights.h5'),
    (encoder, 'encoder_weights.h5'),
    (decoder, 'decoder_weights.h5'),
):
    network.save_weights(weights_path)

In [None]:
# Save model

# Store each full model in its own HDF5 file.
for network, model_path in (
    (model, 'seq2seq.h5'),
    (encoder, 'encoder.h5'),
    (decoder, 'decoder.h5'),
):
    network.save(model_path)

In [None]:
# Predictive function

def predict(input_seq):
    """Greedily decode the expansion for one integer-encoded factor sequence.

    The encoder produces the initial decoder states; the decoder is then run
    one token at a time, starting from '<bos>' and feeding back the most
    probable token, until '<eos>' is predicted or max_expansions_length steps
    have been taken.

    Args:
        input_seq: Encoder input for a single sample.
            NOTE(review): presumably an array of shape (1, max_factors_length),
            since the decoder input is built with batch size 1 — confirm
            against callers.

    Returns:
        The decoded tokens joined into one string without separators.
    """
    states_value = encoder.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = vocabulary['<bos>']
    eos = vocabulary['<eos>']
    output_sentence = []

    for _ in range(max_expansions_length):
        output_tokens, h, c = decoder.predict([target_seq] + states_value)
        idx = np.argmax(output_tokens[0, 0, :])

        if idx == eos:
            break

        # Index 0 is '<pad>': nothing is emitted for it, but it is still fed
        # back to the decoder on the next step.
        if idx > 0:
            output_sentence.append(reverse_vocabulary[idx])

        target_seq[0, 0] = idx
        states_value = [h, c]

    return ''.join(output_sentence)