# Seq2seq Model for Simple Math Problems

This is a RNN seq2seq neural network model for solving simple addition and substraction problems. The query and solution are given as a string (for fun) - so we solve these mathematical problems using a string-based approach.

We generate the dataset and encode the problems using one-hot encoding. We then train a RNN sequence-to-sequence model for predicting the solution as a string. For example: We learn to predict the correct solution for the problems $167+52$ or $2319-72$.

After 10 iterations, the accuracy on the test set increases from 39.01% to 64.39% 

In [1]:
# Imports
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, RepeatVector, Dense
import numpy as np

In [2]:
class Character_Encoding(object):
    """Character encoding and decoding."""
        
    
    def __init__(self, chars):
        
        # Get character set
        self.chars = sorted(set(chars))
        self.n_chars = len(self.chars)
        
        # Get dict from char to index and from index to char
        self.char_to_ind = dict((c, i) for i, c in enumerate(self.chars))
        self.ind_to_char = dict((i, c) for i, c in enumerate(self.chars))

        
    def encode(self, string, max_string=12):
        """One-hot encode the string and pad up to max_string length."""
        
        # Set array
        encoded_string = np.zeros((max_string, self.n_chars))
        
        # For each character, set to 1.0 at corresponding index
        for i, char in enumerate(string):
            encoded_string[i, self.char_to_ind[char]] = 1.0
        
        # Return one-hot encoded string
        return encoded_string

    def decode(self, encoded_string):
        """Decode one-hot encoding to character string."""
        
        # Initialize
        decoded_string = []

        # For each character/row
        for i in range(encoded_string.shape[0]):
            
            # Get the encoding and index with highest value
            encoding = encoded_string[i,:]          
            index = encoding.argmax()
            
            # Look up character for this index
            decoded_string.append(self.ind_to_char[index])
            
        # Return decoded string
        return "".join(decoded_string)
    

# Testing
char_enc = Character_Encoding('0123456789+- ')

example = '152 + 829'
encoded_string = char_enc.encode(example)
decoded_string = char_enc.decode(encoded_string)

print('Encoded\n', encoded_string)
print('\nDecoded\n', decoded_string)

Encoded
 [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Decoded
 152 + 829   


In [3]:
def number_generator(max_digits):
    """Generate number of at most max_digits length."""
      
    # Randomly set digit length (smaller/equal to max_digits)
    len_digit = np.random.randint(1, max_digits+1)    
    
    # Get random number between 0 and 9 -- repeat as many times as len_digits
    number = [np.random.choice(['0','1','2','3','4','5','6','7','8','9']) for pos in range(len_digit)]
    number = "".join(number)

    # Do not accept numbers starting with 0 (e.g. 015)
    while number[0] == '0':
        number = number_generator(max_digits=max_digits)
    
    # Return number
    return number


def generate_data(n_samples=10000, max_digits=4, max_string=12):
    """Generate data of math queries and solutions."""
    
    # Initalize
    queries = []
    solutions = []    
    max_solution_len = max_digits+1
    
    # Character encoding
    char_enc = Character_Encoding('0123456789+- ')
    
    # Generate sample
    for sample in range(n_samples):
    
        # Get two numbers
        number_1 = int(number_generator(max_digits=max_digits))
        number_2 = int(number_generator(max_digits=max_digits))

        # Generate query (addition & substraction)
        add_query = '{} + {}'.format(number_1, number_2)
        sub_query = '{} - {}'.format(number_1, number_2)

        # Padding: fill up with white space
        len_query = len(add_query)
        add_query = add_query + ' ' * (max_string - len_query)
        sub_query = sub_query + ' ' * (max_string - len_query)  

        # Generate solution with padding
        add_solution = str(number_1 + number_2)
        add_solution = add_solution + " " * (max_solution_len - len(add_solution))
        
        # Save query and solution if unseen        
        if add_query not in queries:
            queries.append(add_query)
            solutions.append(add_solution)

        # If number_1 is larger, get substraction as well
        if number_1 > number_2:
            
            # Get padded solution
            sub_solution = str(number_1 - number_2)
            sub_solution = sub_solution + " " * (max_solution_len - len(sub_solution))
            
            # Save query and solution if unseen
            if sub_query not in queries:
                queries.append(sub_query)
                solutions.append(sub_solution)
            
    # Return queries and solutions
    return queries, solutions


In [4]:
def vectorize_dataset(queries, solutions, max_string, max_digits, char_enc):
    """Encode and vectorize the dataset (queries and solutions)."""
    
    # Get number of samples
    n_samples = len(queries)
    
    # Set arrays for samples and labels
    X = np.zeros((n_samples, max_string, char_enc.n_chars))     # dtype=np.bool (optional)
    y = np.zeros((n_samples, max_digits+1, char_enc.n_chars))   # dtype=np.bool (optional)
    
    # For each query, get one-hot encodded padded representation
    for i, query in enumerate(queries):
        X[i] = char_enc.encode(query, max_string)
  
    # For each solution, get one-hot encodded padded representation
    for i, solution in enumerate(solutions):
        y[i] = char_enc.encode(solution, max_digits+1)
    
    # Return data
    return X, y

In [5]:
# Character encoding
char_enc = Character_Encoding('0123456789+- ')
n_chars = char_enc.n_chars

# Dataset settings
n_samples = 100000
max_string = 15
max_digits = 4

# Neural network settings
hidden_size = 120
batch_size = 100
iterations = 10

In [6]:
# Generate dataset
dataset = generate_data(n_samples=n_samples)
queries, solutions = dataset

# Vectorize dataset
X,y = vectorize_dataset(queries, solutions, max_string, max_digits, char_enc)

# Print
dataset[:5]

(['54 + 821    ',
  '24 + 27     ',
  '776 + 157   ',
  '776 - 157   ',
  '55 + 681    ',
  '3 + 99      ',
  '224 + 547   ',
  '9255 + 4589 ',
  '9255 - 4589 ',
  '521 + 233   ',
  '521 - 233   ',
  '9 + 17      ',
  '1403 + 9    ',
  '1403 - 9    ',
  '7458 + 542  ',
  '7458 - 542  ',
  '984 + 9     ',
  '984 - 9     ',
  '8 + 2512    ',
  '18 + 458    ',
  '5 + 7       ',
  '6469 + 3707 ',
  '6469 - 3707 ',
  '62 + 67     ',
  '8946 + 5225 ',
  '8946 - 5225 ',
  '8929 + 66   ',
  '8929 - 66   ',
  '5885 + 920  ',
  '5885 - 920  ',
  '99 + 3      ',
  '99 - 3      ',
  '1706 + 7434 ',
  '6830 + 7    ',
  '6830 - 7    ',
  '874 + 7900  ',
  '644 + 721   ',
  '854 + 98    ',
  '854 - 98    ',
  '6539 + 8351 ',
  '8 + 2       ',
  '8 - 2       ',
  '90 + 6      ',
  '90 - 6      ',
  '107 + 3276  ',
  '3464 + 13   ',
  '3464 - 13   ',
  '6 + 6       ',
  '4 + 5       ',
  '841 + 689   ',
  '841 - 689   ',
  '2 + 15      ',
  '996 + 52    ',
  '996 - 52    ',
  '2369 + 340  ',
  '2369 - 

In [7]:
# Shuffle data
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]

# Set training and validation size
val_size = int(len(y)//10)
train_size = int(len(y)//1.25)

# Set training, validation and test data
(X_train, X_val, X_test) = X[:train_size], X[train_size:train_size+val_size], X[train_size+val_size:]
(y_train, y_val, y_test) = y[:train_size], y[train_size:train_size+val_size], y[train_size+val_size:]

In [8]:
# Define a seq2seq model
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(max_string, n_chars)))
model.add(RepeatVector(max_digits + 1))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(TimeDistributed(Dense(n_chars, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 120)               64320     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 120)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 120)            115680    
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 13)             1573      
Total params: 181,573
Trainable params: 181,573
Non-trainable params: 0
_________________________________________________________________


In [9]:
# For each iteration
for iteration in range(iterations):

    print('\nIteration', iteration+1)
    print('-'*50)

    # Train the model
    model.fit(X_train, y_train, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val))
    
    # Show 3 examples
    for i in range(3):
        
        # Get random index
        random_index = np.random.randint(0, len(X_val))
        
        # Get query, solution and prediction
        query = X_val[np.array([random_index])]
        solution = y_val[np.array([random_index])]     
        prediction = model.predict(query, verbose=0)
 
        # Decode query, solution and prediction
        query_decoded = char_enc.decode(query[0])
        sol_decoded = char_enc.decode(solution[0])
        pred_decoded = char_enc.decode(prediction[0])
        
        # Check whether prediction is correct
        is_correct = (sol_decoded == pred_decoded)
        is_correct_sign = '☑' if is_correct else '☒'
        
        print()
        print('Query: {}'.format(query_decoded))
        print('Solution: {}'.format(sol_decoded))
        print('Prediction: {} {}'.format(pred_decoded, is_correct_sign))          
        print('-'*50)



Iteration 1
--------------------------------------------------

Query: 74 + 22        
Solution: 96   
Prediction: 10    ☒
--------------------------------------------------

Query: 310 + 52       
Solution: 362  
Prediction: 122   ☒
--------------------------------------------------

Query: 6462 - 787     
Solution: 5675 
Prediction: 8707  ☒
--------------------------------------------------

Iteration 2
--------------------------------------------------

Query: 7175 + 7795    
Solution: 14970
Prediction: 10844 ☒
--------------------------------------------------

Query: 136 - 5        
Solution: 131  
Prediction: 212   ☒
--------------------------------------------------

Query: 4947 - 3       
Solution: 4944 
Prediction: 3993  ☒
--------------------------------------------------

Iteration 3
--------------------------------------------------

Query: 748 + 5        
Solution: 753  
Prediction: 791   ☒
--------------------------------------------------

Query: 30 - 19        
Solutio