# Calculate with seq2seq model 
You are going to build a calculator that takes an arithmetic expression as a string and evaluates it.

## Data 
We do not need to download any data; we will generate all of it ourselves. We will use two operators, addition and subtraction, applied to non-negative integers in some range. Here are some inputs and the ideal outputs our network should produce:
    
    Input: '1+2'
    Output: '3'
    
    Input: '0-99'
    Output: '-99'


Define the generate_equations function first

In [1]:
import random
def generate_equations(allowed_operator, dataset_size, min_value, max_value):
    """Generate random arithmetic equations together with their solutions.

    Args:
        allowed_operator: Non-empty sequence of operator strings to sample
            from. Only "+" and "-" are supported.
        dataset_size: Number of (equation, solution) pairs to generate.
        min_value: Smallest operand value (inclusive).
        max_value: Largest operand value (inclusive).

    Returns:
        A list of ``(equation, solution)`` string tuples, e.g. ``("1+2", "3")``.

    Raises:
        ValueError: If a sampled operator is neither "+" nor "-".
    """
    sample = []
    for _ in range(dataset_size):
        left = random.randint(min_value, max_value)
        right = random.randint(min_value, max_value)
        op = random.choice(allowed_operator)
        if op == "+":
            solution = left + right
        elif op == "-":
            solution = left - right
        else:
            # Fail loudly instead of reusing a stale (or unbound) result.
            raise ValueError("unsupported operator: {!r}".format(op))
        sample.append((str(left) + op + str(right), str(solution)))
    return sample

In [2]:
# Smoke test: generate three sample equations with operands in [0, 9999]
# and display them.
equations = generate_equations(["+", "-"], 3, 0, 9999)
equations

[('8991-9721', '-730'), ('8542+3274', '11816'), ('3972+1648', '5620')]

In [3]:
# Generate the dataset: 100,000 random "+"/"-" equations with operands in
# [0, 9999], then hold out 20% of them as the test set. random_state pins the
# shuffle used for the split (the equations themselves are not seeded).
from sklearn.model_selection import train_test_split
allowed_operator = ["+", "-"]
dataset_size = 100000
data = generate_equations(allowed_operator, dataset_size, 0, 9999)
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 4)

In [7]:
# Sanity check: size of the training split and its first (equation, solution) pair.
print(len(train_set))
print(train_set[0])

80000
('5250+7736', '12986')


## Prepare the data for network 
The next stage is to create a mapping from characters to their indices in a vocabulary. The set of characters is fixed (the digits 0-9, "+" and "-", plus a few special symbols), so it is easy to encode the inputs.

In [30]:
# Each symbol's id is its position in the vocabulary string; word2id is simply
# the inverse of id2word.
id2word = dict(enumerate("^$#+-1234567890"))
word2id = {symbol: index for index, symbol in id2word.items()}
print(word2id)
print(id2word)

{'#': 2, '3': 7, '2': 6, '+': 3, '4': 8, '^': 0, '0': 14, '8': 12, '5': 9, '-': 4, '9': 13, '1': 5, '$': 1, '7': 11, '6': 10}
{0: '^', 1: '$', 2: '#', 3: '+', 4: '-', 5: '1', 6: '2', 7: '3', 8: '4', 9: '5', 10: '6', 11: '7', 12: '8', 13: '9', 14: '0'}


In [29]:
# Special symbols (each id is the symbol's position in the vocabulary string)
start_symbol = '^'   # id 0: marks the beginning of decoding
end_symbol = "$"     # id 1: end of the string, for both input and output
padding_symbol = '#' # id 2: padding symbol, keeps the input placeholders' lengths consistent

### Padding
Convert a sentence to a list of vocabulary ids.
* Predefined length `padded_len`
* Padded with "#"
* Ends with "$"

In [34]:
def sentence_to_ids(sentence, word2id, padded_len):
    """Encode a sentence as exactly ``padded_len`` ids, terminated by the end symbol.

    The sentence is truncated to at most ``padded_len - 1`` symbols so that the
    end symbol always fits, and the remainder is filled with the padding id.

    Args:
        sentence: Sequence of symbols (e.g. a string) to encode.
        word2id: Mapping from symbol to integer id. The ids of the end symbol
            "$" and padding symbol "#" are read from it, falling back to the
            historical values 1 and 2 when the mapping does not define them.
        padded_len: Length of the returned id list; must be at least 1.

    Returns:
        A tuple ``(sent_ids, sent_len)`` where ``sent_ids`` is the padded id
        list and ``sent_len`` is the number of non-padding ids (including the
        end symbol).

    Raises:
        ValueError: If ``padded_len`` is smaller than 1.
        KeyError: If a kept symbol is missing from ``word2id``.
    """
    if padded_len < 1:
        raise ValueError("padded_len must be at least 1 to hold the end symbol")

    # Take the special ids from the vocabulary so they cannot drift out of
    # sync with it; 1 and 2 are the ids previously hard-coded here.
    end_id = word2id.get("$", 1)
    pad_id = word2id.get("#", 2)

    sent_ids = [word2id[symbol] for symbol in sentence[:padded_len - 1]]
    sent_ids.append(end_id)
    sent_ids.extend([pad_id] * (padded_len - len(sent_ids)))

    sent_len = min(len(sentence) + 1, padded_len)
    return sent_ids, sent_len

In [41]:
# Demo on a 7-symbol sentence: padded_len 7 truncates (the last digit is
# replaced by the end symbol), 8 fits exactly, and 10 adds padding.
sentence = "123+123"
print(sentence_to_ids(sentence, word2id, 7),"\n",
      sentence_to_ids(sentence, word2id, 8),"\n",
      sentence_to_ids(sentence, word2id, 10))

([5, 6, 7, 3, 5, 6, 1], 7) 
 ([5, 6, 7, 3, 5, 6, 7, 1], 8) 
 ([5, 6, 7, 3, 5, 6, 7, 1, 2, 2], 8)


In [45]:
#convert back
def ids_to_sentence(ids, id2word):
    """Map each id in ``ids`` back to its symbol via ``id2word``.

    Returns the symbols as a list, in the same order as ``ids``. Special
    symbols (end, padding) are kept, not stripped.
    """
    symbols = []
    for token_id in ids:
        symbols.append(id2word[token_id])
    return symbols

In [49]:
# Round-trip check: decode the three id lists from the padding demo back into symbols.
print(ids_to_sentence([5,6,7,3,5,6,1], id2word))
print(ids_to_sentence([5, 6, 7, 3, 5, 6, 7, 1], id2word))
print(ids_to_sentence([5, 6, 7, 3, 5, 6, 7, 1, 2, 2], id2word))

['1', '2', '3', '+', '1', '2', '$']
['1', '2', '3', '+', '1', '2', '3', '$']
['1', '2', '3', '+', '1', '2', '3', '$', '#', '#']


## Network building
### Generate batches

In [50]:
def batch_to_ids(sentences, word2id, max_len):
    """Encode a batch of sentences into equally long id lists.

    Every sentence is padded (or truncated) to a common length: the longest
    sentence in the batch plus one slot for the end symbol, capped at
    ``max_len``.

    Args:
        sentences: Iterable of sentences (e.g. strings) forming one batch.
        word2id: Mapping from symbol to integer id, passed to ``sentence_to_ids``.
        max_len: Upper bound on the padded length.

    Returns:
        A tuple ``(batch_ids, batch_ids_len)``: the padded id list of each
        sentence and the matching unpadded lengths. Both lists are empty for
        an empty batch.
    """
    # Materialize once so one-shot iterables survive the two passes below.
    sentences = list(sentences)
    if not sentences:
        # max() would raise on an empty batch; there is simply nothing to encode.
        return [], []

    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
    return batch_ids, batch_ids_len

In [52]:
def generate_batches(samples, batch_size = 64):
    """Yield ``(inputs, targets)`` batches from an iterable of ``(x, y)`` pairs.

    Each yielded batch is a pair of parallel lists holding ``batch_size``
    samples; a final, smaller batch is yielded when samples are left over.
    """
    inputs, targets = [], []
    seen = 0
    for source, target in samples:
        seen += 1
        inputs.append(source)
        targets.append(target)
        if seen % batch_size != 0:
            continue
        yield inputs, targets
        # Start fresh lists so batches already handed out are never mutated.
        inputs, targets = [], []
    # Flush the leftover partial batch, if any.
    if inputs:
        yield inputs, targets

In [61]:
# Demo: encode the first training sample. NOTE: train_set[0] is a single
# (equation, solution) tuple, so the "batch" here is just those two strings.
sentences =  train_set[0]
ids, sent_lens = batch_to_ids(sentences, word2id, max_len = 100)
print(ids, sent_lens)

[[9, 6, 9, 14, 3, 11, 11, 7, 10, 1], [5, 6, 13, 12, 10, 1, 2, 2, 2, 2]] [10, 6]


### Encoder-Decoder architecture

In [67]:
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

In [68]:
class Seq2SeqModel(object):
    """Sequence-to-sequence model container.

    The class body is intentionally empty: the model's building steps are
    written as standalone functions in later cells and attached to this class
    by assignment.
    """
    pass

In [70]:
def declare_placeholders(self):
    """Create the placeholders that feed data and hyperparameters into the graph.

    Sets the following attributes on ``self``:
        input_batch: int32 placeholder of shape (None, None)
            (presumably [batch, time] token ids -- TODO confirm).
        input_batch_lengths: int32 placeholder of shape (None,).
        ground_truth: int32 placeholder of shape (None, None).
        ground_truth_lengths: int32 placeholder of shape (None,).
        dropout_ph: scalar float32 placeholder that defaults to 1.0 when not fed.
        learning_rate_ph: scalar float32 placeholder.
    """
    self.input_batch = tf.placeholder(shape = (None, None), dtype=tf.int32, name = "input_batch")
    self.input_batch_lengths = tf.placeholder(shape = (None,), dtype=tf.int32, name = "input_batch_lengths")
    
    self.ground_truth = tf.placeholder(shape = (None, None), dtype=tf.int32)
    self.ground_truth_lengths = tf.placeholder(shape = (None, ), dtype = tf.int32)
    
    # Used as keep probability by the encoder cell; the default of 1.0 disables dropout.
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape = [])
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape = [])
# Attach the function to the model class.
# NOTE(review): wrapping in classmethod means `self` above receives the class
# itself, so the placeholders are stored as class attributes -- confirm intended.
Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)

### Specify the layer(Embeddings) 

In [71]:
# The embedding matrix acts as a lookup table that encodes the ids of the input batch.
def create_embeddings(self, vocab_size, embeddings_size):
    """Create the embedding matrix and embed the input batch.

    Args:
        vocab_size: Number of rows of the embedding matrix (one per vocabulary id).
        embeddings_size: Number of columns, i.e. the size of each embedding vector.

    Sets ``self.embeddings`` (a variable initialized with uniform random values
    between -1.0 and 1.0) and ``self.input_batch_embedded`` (the rows of
    ``self.embeddings`` selected by ``self.input_batch``).
    """
    random_initializer = tf.random_uniform((vocab_size, embeddings_size), -1.0, 1.0)
    self.embeddings = tf.Variable(random_initializer)
    
    # Requires self.input_batch, which is created in declare_placeholders.
    self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_batch)
# Attach the function to the model class (wrapped as a classmethod).
Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)

### Encoder
Encoding an input sequence to a real-valued vector. 

In [72]:
def build_encoder(self, hidden_size):
    """Build the GRU encoder and store its final state.

    Args:
        hidden_size: Number of units in the encoder GRU cell.

    Reads ``self.dropout_ph``, ``self.input_batch_embedded`` and
    ``self.input_batch_lengths``; sets ``self.final_encoder_state`` to the
    final state returned by ``tf.nn.dynamic_rnn``.
    """
    # The wrapper lives in tf.nn.rnn_cell; `tf.nn.run_cell` does not exist.
    encoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_size),
                                                 input_keep_prob = self.dropout_ph,
                                                 output_keep_prob = self.dropout_ph)

    # Only the final state is kept; the per-step outputs are discarded.
    # sequence_length tells dynamic_rnn where each sequence really ends.
    _, self.final_encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                    self.input_batch_embedded,
                                                    dtype = tf.float32,
                                                    sequence_length = self.input_batch_lengths)

In [73]:
# Attach build_encoder to the model class (wrapped as a classmethod).
Seq2SeqModel.__build_encoder = classmethod(build_encoder)

### Decoder 

In [150]:
# Run-length encode a string: each run of a repeated character becomes the
# character followed by the run length; single characters are left as they are.
a = 'aaabbbcccaab'
a += ' '  # sentinel: its arrival closes the last real run
pieces = []
run_start = 0
for pos in range(1, len(a)):
    if a[pos] == a[run_start]:
        continue
    # The run a[run_start:pos] just ended; emit it and start a new run.
    run_len = pos - run_start
    pieces.append(a[run_start] if run_len == 1 else a[run_start] + str(run_len))
    run_start = pos
s = ''.join(pieces)
print(s)
        

'a3b3c3a2b'