# Calculate with seq2seq model 
You are going to build a calculator for evaluating arithmetic expressions, by taking a string as input.

## Data 
We do not need to download any data; we will generate all of it ourselves. We will use two operators, addition and subtraction, applied to non-negative integers in a given range. Here are some example inputs and the ideal outputs provided by our network:
    
    Input: '1+2'
    Output: '3'
    
    Input: '0-99'
    Output: '-99'


Define the generate_equations function first

In [None]:
import random
def generate_equations(allowed_operator, dataset_size, min_value, max_value):
    """Generate random two-operand arithmetic equations with their answers.

    Args:
        allowed_operator: Sequence of operator symbols to sample from; only
            "+" and "-" are supported.
        dataset_size: Number of equations to generate.
        min_value: Smallest operand value (inclusive).
        max_value: Largest operand value (inclusive).

    Returns:
        A list of ``(expression, answer)`` string pairs, e.g. ``("1+2", "3")``.

    Raises:
        ValueError: If a sampled operator is neither "+" nor "-".
    """
    sample = []
    for _ in range(dataset_size):
        left = random.randint(min_value, max_value)
        right = random.randint(min_value, max_value)
        op = random.choice(allowed_operator)
        if op == "+":
            solution = left + right
        elif op == "-":
            solution = left - right
        else:
            # Fail loudly instead of silently reusing the previous answer.
            raise ValueError("unsupported operator: {!r}".format(op))
        sample.append(("{}{}{}".format(left, op, right), str(solution)))
    return sample

In [None]:
# Smoke test: draw three sample equations and display them.
equations = generate_equations(
    allowed_operator=["+", "-"],
    dataset_size=3,
    min_value=0,
    max_value=9999,
)
equations

In [None]:
#generate the dataset
from sklearn.model_selection import train_test_split
# Build the full dataset, then hold out 20% of it for testing.
allowed_operator = ["+", "-"]
dataset_size = 100000
data = generate_equations(
    allowed_operator=allowed_operator,
    dataset_size=dataset_size,
    min_value=0,
    max_value=9999,
)
train_set, test_set = train_test_split(data, test_size=0.2, random_state=4)

In [None]:
# Size of the training split, followed by one example (expression, answer) pair.
print(len(train_set), train_set[1], sep="\n")

## Prepare the data for network 
The next stage is to create a mapping from characters to their indices in a vocabulary. The vocabulary is fixed (the digits 0-9, `+`, `-`, plus three special symbols), so encoding the inputs is straightforward.

In [None]:
# Index the fixed vocabulary once and derive the reverse lookup from it.
id2word = dict(enumerate("^$#+-1234567890"))
word2id = {symbol: index for index, symbol in id2word.items()}
print(word2id)
print(id2word)

In [None]:
# Special symbols; their ids are their positions in the vocabulary string.
#   start_symbol   -> id 0, first input fed to the decoder
#   end_symbol     -> id 1, terminates both input and output strings
#   padding_symbol -> id 2, fills sequences up to a common length
start_symbol, end_symbol, padding_symbol = "^", "$", "#"

### Padding
Convert a sentence to a list of vocabulary ids.
* Predefined length `padded_len`
* Padded with "#"
* Ends with "$"

In [None]:
def sentence_to_ids(sentence, word2id, padded_len, end_symbol="$", padding_symbol="#"):
    """Encode a sentence as exactly ``padded_len`` vocabulary ids.

    The encoded sentence always ends with the end symbol. Sentences that do
    not fit are truncated so the end symbol occupies the last slot; shorter
    ones are filled up with the padding symbol.

    Args:
        sentence: String (or sequence of symbols) to encode.
        word2id: Mapping from symbol to integer id. The ids of the end and
            padding symbols are looked up here rather than hard-coded, so the
            function stays correct if the vocabulary order changes.
        padded_len: Length of the returned id list; must be at least 1.
        end_symbol: Symbol that terminates the sentence.
        padding_symbol: Symbol used to fill the remaining positions.

    Returns:
        A tuple ``(ids, length)`` where ``ids`` has ``padded_len`` entries and
        ``length`` counts the entries up to and including the end symbol.

    Raises:
        ValueError: If ``padded_len`` is smaller than 1.
        KeyError: If a symbol is missing from ``word2id``.
    """
    if padded_len < 1:
        raise ValueError("padded_len must be at least 1 to hold the end symbol")
    end_id = word2id[end_symbol]
    padding_id = word2id[padding_symbol]
    # Keep at most padded_len - 1 symbols so the end symbol always fits.
    sent_ids = [word2id[symbol] for symbol in sentence[:padded_len - 1]]
    sent_ids.append(end_id)
    sent_len = len(sent_ids)
    sent_ids.extend([padding_id] * (padded_len - sent_len))
    return sent_ids, sent_len

In [None]:
# Encode one sentence at three target lengths: truncated (7), exact fit (8)
# and padded (10).
sentence = "123+123"
print(*(sentence_to_ids(sentence, word2id, length) for length in (7, 8, 10)),
      sep=" \n ")

In [None]:
# Inverse of the encoding step
def ids_to_sentence(ids, id2word):
    """Return the list of vocabulary symbols for ``ids``, in the same order."""
    symbols = []
    for token_id in ids:
        symbols.append(id2word[token_id])
    return symbols

In [None]:
# Decode three id sequences: truncated, exact fit and padded.
print(*(ids_to_sentence(sequence, id2word)
        for sequence in ([5, 6, 7, 3, 5, 6, 1],
                         [5, 6, 7, 3, 5, 6, 7, 1],
                         [5, 6, 7, 3, 5, 6, 7, 1, 2, 2])),
      sep="\n")

## Network building
### Generate batches

In [None]:
def batch_to_ids(sentences, word2id, max_len):
    """Encode a batch of sentences as equally long id sequences.

    All sentences are padded to the length of the longest one plus one slot
    for the end symbol, capped at ``max_len``.

    Args:
        sentences: Iterable of sentences (strings) forming one batch.
        word2id: Mapping from symbol to integer id.
        max_len: Upper bound on the encoded sequence length.

    Returns:
        A tuple ``(batch_ids, batch_ids_len)``: the list of id sequences and
        the list of their lengths as reported by ``sentence_to_ids``. Both
        lists are empty for an empty batch.
    """
    # Materialise once: the batch is scanned for its longest sentence and
    # then iterated again, which would exhaust a one-shot iterator.
    sentences = list(sentences)
    if not sentences:
        return [], []
    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
    return batch_ids, batch_ids_len

In [None]:
def generate_batches(samples, batch_size=64):
    """Split ``(x, y)`` samples into consecutive batches.

    Yields ``(X, Y)`` pairs of lists holding ``batch_size`` inputs and
    targets each; a final shorter batch carries any leftover samples.
    """
    inputs, targets = [], []
    seen = 0
    for source, target in samples:
        inputs.append(source)
        targets.append(target)
        seen += 1
        if seen % batch_size != 0:
            continue
        yield inputs, targets
        inputs, targets = [], []
    # Emit whatever did not fill a complete batch.
    if inputs and targets:
        yield inputs, targets

In [None]:
# train_set[0] is one (expression, answer) pair; here it is encoded as if it
# were a batch of two sentences.
sentences = train_set[0]
print(sentences)
ids, sent_lens = batch_to_ids(sentences=sentences, word2id=word2id, max_len=100)
print(ids, sent_lens)

### Encoder-Decoder architecture

In [None]:
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

In [None]:
class Seq2SeqModel(object):
    """Empty shell class; its methods are attached to it after definition."""

In [None]:
def declare_placeholders(self):
    """Create the graph inputs and store them as attributes of ``self``."""
    # Encoder side: 2-D int32 ids plus one int32 length per row.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_batch")
    self.input_batch_lengths = tf.placeholder(dtype=tf.int32, shape=(None,), name="input_batch_lengths")

    # Decoder targets, laid out like the encoder inputs.
    self.ground_truth = tf.placeholder(dtype=tf.int32, shape=(None, None))
    self.ground_truth_lengths = tf.placeholder(dtype=tf.int32, shape=(None,))

    # Keep-probability defaults to 1.0, so dropout is off unless a value is fed.
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])


# Attached outside a class body, so the double-underscore name is stored
# literally (no name mangling).
setattr(Seq2SeqModel, "__declare_placeholders", classmethod(declare_placeholders))

### Specify the layer(Embeddings) 

In [None]:
# The embedding matrix acts as a lookup table for the ids of the input batch.
def create_embeddings(self, vocab_size, embeddings_size):
    """Create the embedding matrix and embed ``self.input_batch`` with it.

    The matrix has shape ``(vocab_size, embeddings_size)`` and is initialised
    uniformly in ``[-1, 1)``.
    """
    initial_values = tf.random_uniform((vocab_size, embeddings_size), minval=-1.0, maxval=1.0)
    self.embeddings = tf.Variable(initial_values)

    self.input_batch_embedded = tf.nn.embedding_lookup(params=self.embeddings, ids=self.input_batch)


# Attached outside a class body, so the double-underscore name is stored
# literally (no name mangling).
setattr(Seq2SeqModel, "__create_embeddings", classmethod(create_embeddings))

### Encoder
Encoding an input sequence to a real-valued vector. 

In [None]:
def build_encoder(self, hidden_size):
    """Build the GRU encoder and store its final state on ``self``.

    Args:
        hidden_size: Number of units of the GRU cell.

    Reads ``self.input_batch_embedded``, ``self.input_batch_lengths`` and
    ``self.dropout_ph``; writes ``self.final_encoder_state``.
    """
    # Dropout is applied to both the inputs and the outputs of the cell.
    encoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_size),
                                                 input_keep_prob=self.dropout_ph,
                                                 output_keep_prob=self.dropout_ph)

    # Only the final state is kept; the per-step outputs are discarded.
    _, self.final_encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                    self.input_batch_embedded,
                                                    dtype=tf.float32,
                                                    sequence_length=self.input_batch_lengths)

In [None]:
# Attached outside a class body, so the double-underscore name is stored
# literally (no name mangling).
setattr(Seq2SeqModel, "__build_encoder", classmethod(build_encoder))

### Decoder 

In [None]:
def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):
    """Build the training and inference decoders on shared weights.

    Args:
        hidden_size: Number of units of the decoder GRU cell.
        vocab_size: Size of the output projection (one logit per symbol).
        max_iter: Maximum number of decoding steps.
        start_symbol_id: Id fed to the decoder as its first input.
        end_symbol_id: Id that stops greedy decoding at inference time.

    Reads ``self.input_batch``, ``self.ground_truth``,
    ``self.ground_truth_lengths``, ``self.embeddings``, ``self.dropout_ph``
    and ``self.final_encoder_state``; writes ``self.ground_truth_embedded``,
    ``self.train_outputs`` and ``self.infer_outputs``.
    """
    batch_size = tf.shape(self.input_batch)[0]
    start_tokens = tf.fill([batch_size], start_symbol_id)
    # Teacher forcing: the decoder input is the ground truth shifted right by
    # one position, with the start symbol in front.
    ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)

    self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings, ground_truth_as_input)

    train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded,
                                                     self.ground_truth_lengths)

    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings, start_tokens, end_symbol_id)

    def decode(helper, scope, reuse=None):
        """Run the decoder with ``helper`` inside variable scope ``scope``."""
        with tf.variable_scope(scope, reuse=reuse):
            # ``reuse`` must be passed by keyword: the second positional
            # parameter of GRUCell is ``activation``.
            decoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse),
                                                         input_keep_prob=self.dropout_ph,
                                                         output_keep_prob=self.dropout_ph)
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)

            decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, self.final_encoder_state)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter,
                                                              output_time_major=False, impute_finished=True)
            return outputs

    # Both decoders use the same scope; the second call reuses the variables.
    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)

In [None]:
# Register the decoder under its own name. Assigning it to ``__build_encoder``
# would overwrite the encoder and leave ``__build_decoder`` undefined.
Seq2SeqModel.__build_decoder = classmethod(build_decoder)

### Define loss

In [None]:
def compute_loss(self):
    """Define ``self.loss`` as the sequence loss of the training decoder.

    Positions beyond each target's length get weight 0 via a sequence mask.
    """
    mask = tf.sequence_mask(self.ground_truth_lengths)
    weights = tf.cast(mask, dtype=tf.float32)
    self.loss = tf.contrib.seq2seq.sequence_loss(
        logits=self.train_outputs.rnn_output,
        targets=self.ground_truth,
        weights=weights,
    )

In [None]:
# Attached outside a class body, so the double-underscore name is stored
# literally (no name mangling).
setattr(Seq2SeqModel, "__compute_loss", classmethod(compute_loss))

### Define optimization

In [None]:
def perform_optimization(self):
    """Define ``self.train_op``: an Adam step on ``self.loss`` with gradient clipping at 1.0."""
    self.train_op = tf.contrib.layers.optimize_loss(
        loss=self.loss,
        global_step=tf.train.get_global_step(),
        learning_rate=self.learning_rate_ph,
        optimizer='Adam',
        clip_gradients=1.0,
    )

In [None]:
# Attached outside a class body, so the double-underscore name is stored
# literally (no name mangling).
setattr(Seq2SeqModel, "__perform_optimization", classmethod(perform_optimization))

### Init the model

In [None]:
def init_model(self, vocab_size, embeddings_size, hidden_size, 
               max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):
    """Assemble the whole graph by calling the build steps in order.

    Args:
        vocab_size: Passed to the embedding and decoder builders.
        embeddings_size: Passed to the embedding builder.
        hidden_size: Passed to the encoder and decoder builders.
        max_iter: Passed to the decoder builder.
        start_symbol_id: Passed to the decoder builder.
        end_symbol_id: Passed to the decoder builder.
        padding_symbol_id: Accepted but not used in this function.
    """
    
    # Each step reads attributes written by the previous ones, so the order
    # of these calls matters.
    self.__declare_placeholders()
    self.__create_embeddings(vocab_size, embeddings_size)
    self.__build_encoder(hidden_size)
    self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)
    
    self.__compute_loss()
    self.__perform_optimization()
    
    # Expose the sampled ids of both decoders as the model's predictions.
    self.train_predictions = self.train_outputs.sample_id
    self.infer_predictions = self.infer_outputs.sample_id

In [None]:
# Use init_model as the constructor; it is registered as a classmethod like
# the other build steps.
setattr(Seq2SeqModel, "__init__", classmethod(init_model))

## Train the network

In [None]:
def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
    """Run one training step on a batch.

    Feeds the inputs, targets, learning rate and dropout keep-probability,
    runs the training op, and returns ``(predictions, loss)``.
    """
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len
    feed_dict[self.ground_truth] = Y
    feed_dict[self.ground_truth_lengths] = Y_seq_len
    feed_dict[self.learning_rate_ph] = learning_rate
    feed_dict[self.dropout_ph] = dropout_keep_probability

    # The result of train_op itself is not needed, only its side effect.
    fetches = [self.train_predictions, self.loss, self.train_op]
    pred, loss, _ = session.run(fetches, feed_dict=feed_dict)
    return pred, loss
# Expose the training step on the model class.
setattr(Seq2SeqModel, "train_on_batch", classmethod(train_on_batch))

In [None]:
def predict_for_batch(self, session, X, X_seq_len):
    """Return the inference decoder's predictions for an input batch."""
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len

    # A one-element fetch list is run, so the result is a one-element list.
    fetched = session.run([self.infer_predictions], feed_dict=feed_dict)
    return fetched[0]

def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):
    """Return ``(predictions, loss)`` for a batch with known targets.

    Predictions come from the inference decoder; the loss tensor is
    evaluated against the fed ground truth. No training op is run.
    """
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len
    feed_dict[self.ground_truth] = Y
    feed_dict[self.ground_truth_lengths] = Y_seq_len

    fetches = [self.infer_predictions, self.loss]
    pred, loss = session.run(fetches, feed_dict=feed_dict)
    return pred, loss

In [None]:
# Expose both prediction helpers on the model class.
setattr(Seq2SeqModel, "predict_for_batch", classmethod(predict_for_batch))
setattr(Seq2SeqModel, "predict_for_batch_with_loss", classmethod(predict_for_batch_with_loss))

## Run the model

In [None]:
# Start from an empty graph so re-running this cell does not pile up nodes.
tf.reset_default_graph()

model = Seq2SeqModel(
    vocab_size=len(word2id),
    embeddings_size=20,
    hidden_size=512,
    max_iter=7,
    start_symbol_id=word2id[start_symbol],
    end_symbol_id=word2id[end_symbol],
    padding_symbol_id=word2id[padding_symbol],
)

# Training hyper-parameters
batch_size = 128
n_epochs = 10
learning_rate = 0.001
dropout_keep_probability = 0.5
max_len = 20

# Number of full batches per epoch (used for progress reporting).
n_step = len(train_set) // batch_size

In [None]:
session = tf.Session()
session.run(tf.global_variables_initializer())

# Per-epoch evaluation results, one entry per epoch.
invalid_number_prediction_counts = []
all_model_predictions = []
all_ground_truth = []

print('Start training... \n')
for epoch in range(n_epochs):
    random.shuffle(train_set)
    random.shuffle(test_set)

    print('Train: epoch', epoch + 1)
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):
        # Encode the raw strings of the batch as padded id sequences.
        X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)
        Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)
        predictions, loss = model.train_on_batch(session,
                                                 X,
                                                 X_seq_len,
                                                 Y,
                                                 Y_seq_len,
                                                 learning_rate,
                                                 dropout_keep_probability)

        if n_iter % 200 == 0:
            print("Epoch: [%d/%d], step: [%d/%d], loss: %f" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))

    # Report the loss and a few decoded samples on the first test batch.
    X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))
    X_test, X_test_len = batch_to_ids(X_sent, word2id, max_len)
    Y_test, Y_test_len = batch_to_ids(Y_sent, word2id, max_len)

    predictions, loss = model.predict_for_batch_with_loss(session, X_test, X_test_len, Y_test, Y_test_len)
    print('Test: epoch', epoch + 1, 'loss:', loss,)
    # Pair the predictions with the TEST inputs they were computed from,
    # not with the leftover X/Y of the last training batch.
    for x, y, p in list(zip(X_test, Y_test, predictions))[:3]:
        print('X:', ''.join(ids_to_sentence(x, id2word)))
        print('Y:', ''.join(ids_to_sentence(y, id2word)))
        print('O:', ''.join(ids_to_sentence(p, id2word)))
        print('')

    model_predictions = []
    ground_truth = []
    invalid_number_prediction_count = 0
    # For the whole test set, collect ground-truth and predicted values as
    # integers. A prediction that is not a valid integer (e.g. '1-1') is
    # counted as invalid and left out, together with its ground truth.
    for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):
        X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)
        Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)
        predictions = model.predict_for_batch(session, X, X_seq_len)
        for y, p in zip(Y, predictions):
            # Keep only the text before the first end symbol (or all of it
            # when the end symbol was never produced).
            valid_y = ''.join(ids_to_sentence(y, id2word)).partition(end_symbol)[0]
            valid_p = ''.join(ids_to_sentence(p, id2word)).partition(end_symbol)[0]
            try:
                po = int(valid_p)
                py = int(valid_y)
            except ValueError:
                # int() rejects malformed strings; other errors should surface.
                print(valid_y, valid_p)
                invalid_number_prediction_count += 1
            else:
                model_predictions.append(po)
                ground_truth.append(py)

    all_model_predictions.append(model_predictions)
    all_ground_truth.append(ground_truth)
    invalid_number_prediction_counts.append(invalid_number_prediction_count)

print('\n...training finished.')

## Evaluate result

In [None]:
from sklearn.metrics import mean_absolute_error
# Report the mean absolute error and the number of invalid predictions for
# every epoch (epochs are numbered from 1).
for i, (gts, epoch_predictions, invalid_count) in enumerate(zip(all_ground_truth,
                                                                all_model_predictions,
                                                                invalid_number_prediction_counts), 1):
    # mean_absolute_error rejects empty inputs, which happens when every
    # prediction of an epoch was invalid; report NaN for such an epoch.
    mae = mean_absolute_error(gts, epoch_predictions) if gts else float("nan")
    print("Epoch: %i, MAE: %f, Invalid numbers: %i" % (i, mae, invalid_count))