In [None]:
!pip install matplotlib



In [None]:
!pip install spacy
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
!pip install torchtext



In [None]:
!pip install pandas



In [None]:
%matplotlib inline
import matplotlib
matplotlib.use('WebAgg')
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Requirements

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

# import spacy
import torch
import torch.nn.init as init
from torch.nn.parameter import Parameter
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cpu')

In [None]:
import math

class NeuralAccumulatorCell(nn.Module):
    """Single Neural Accumulator (NAC) layer: a bias-free linear map.

    The effective weight is ``W = tanh(W_hat) * sigmoid(M_hat)``. It is
    recomputed from the two learnable parameters every time it is read, so
    gradients flow into ``W_hat`` and ``M_hat``.

    Note: ``W`` is a derived tensor, not a stored ``Parameter``; it does not
    appear in ``state_dict()`` / ``parameters()``.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: size of the last dimension of the output.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Allocated uninitialised; filled by the kaiming init just below.
        self.W_hat = Parameter(torch.Tensor(out_dim, in_dim))
        self.M_hat = Parameter(torch.Tensor(out_dim, in_dim))
        self.register_parameter('bias', None)

        init.kaiming_uniform_(self.W_hat, a=math.sqrt(5))
        init.kaiming_uniform_(self.M_hat, a=math.sqrt(5))

    @property
    def W(self):
        """Effective weight matrix of shape (out_dim, in_dim)."""
        # Must be evaluated on every access: computing it once in __init__
        # would freeze a copy made from uninitialised memory and cut W_hat and
        # M_hat out of the autograd graph.
        return torch.tanh(self.W_hat) * torch.sigmoid(self.M_hat)

    def forward(self, input):
        """Return ``input @ W.T`` (no bias term)."""
        return F.linear(input, self.W, self.bias)


class NAC(nn.Module):
    """Stack of NeuralAccumulatorCell layers applied one after another."""

    def __init__(self, dims):
        '''
        dims: layer widths, i.e. [input_dim, *hidden_dims, output_dim];
        one cell is created for each consecutive pair of widths.
        '''
        super().__init__()
        self.num_layers = len(dims) - 1

        cells = [
            NeuralAccumulatorCell(dims[idx], dims[idx + 1])
            for idx in range(self.num_layers)
        ]
        self.model = nn.Sequential(*cells)

    def forward(self, x):
        """Run x through every cell in order and return the final output."""
        return self.model(x)

class NeuralArithmeticLogicUnitCell(nn.Module):
    """One NALU layer: a sigmoid gate blends an additive and a multiplicative path.

    Both paths go through the same NeuralAccumulatorCell; the multiplicative
    path applies it in log space, ``exp(nac(log(|x| + eps)))``.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: size of the last dimension of the output.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.in_dim, self.out_dim = in_dim, out_dim
        self.eps = 1e-10  # added before log() so a zero input stays finite

        weight_shape = (out_dim, in_dim)
        self.G = Parameter(torch.Tensor(*weight_shape))  # gate weights
        # W is registered and initialised but not used by forward().
        self.W = Parameter(torch.Tensor(*weight_shape))
        self.register_parameter('bias', None)
        self.nac = NeuralAccumulatorCell(in_dim, out_dim)

        for weight in (self.G, self.W):
            init.kaiming_uniform_(weight, a=math.sqrt(5))

    def forward(self, input):
        """Return ``g * nac(x) + (1 - g) * exp(nac(log(|x| + eps)))``."""
        gate = torch.sigmoid(F.linear(input, self.G, self.bias))
        additive = self.nac(input)
        log_magnitude = torch.log(torch.abs(input) + self.eps)
        multiplicative = torch.exp(self.nac(log_magnitude))
        return gate * additive + (1 - gate) * multiplicative


class NALU(nn.Module):
    """Stack of NeuralArithmeticLogicUnitCell layers applied one after another.

    dims lists the layer widths; one cell is created for each consecutive
    pair ``(dims[i], dims[i + 1])``.
    """

    def __init__(self, dims):
        super().__init__()
        self.num_layers = len(dims) - 1
        cells = [
            NeuralArithmeticLogicUnitCell(dims[idx], dims[idx + 1])
            for idx in range(self.num_layers)
        ]
        self.model = nn.Sequential(*cells)

    def forward(self, x):
        """Run x through every cell in order and return the final output."""
        return self.model(x)

class NALU_LSTMCell(nn.Module):
    """LSTM-style cell with a residual output and NALU-transformed recurrent state.

    The gates are computed as in a standard LSTM from ``i2h(x) + h2h(h)``.
    The cell returns ``x + out(h_t)`` as its output, and passes the residual
    sums ``h_t + h`` and ``c_t + c`` through two separate single-layer NALU
    modules to obtain the next hidden and cell state.

    Args:
        input_size: feature size of ``x`` (also the size of the returned output).
        hidden_size: size of the hidden and cell state.
        bias: whether the linear layers use a bias term.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        # Input -> 4 * hidden_size stacked gate pre-activations.
        self.i2h = nn.Sequential(
            nn.Linear(input_size, 4 * hidden_size, bias=bias),
#             nn.BatchNorm1d(4 * hidden_size),
#             nn.LeakyReLU(0.2,inplace=True),
#             nn.Linear(4 *input_size, 4 * hidden_size, bias=bias),
        )
        # Previous hidden state -> 4 * hidden_size stacked gate pre-activations.
        self.h2h = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size, bias=bias),
#            nn.BatchNorm1d(4 * hidden_size),
#             nn.LeakyReLU(0.2,inplace=True),
#             nn.Linear(4 * hidden_size, 4 * hidden_size, bias=bias)
        )
        # Separate NALU transforms for the next hidden state and the next cell state.
        self.nalu_h = NALU([hidden_size, hidden_size])
        self.nalu_c = NALU([hidden_size, hidden_size])
        # Projects the hidden state back to input_size for the residual output.
        self.out = nn.Linear(hidden_size, input_size, bias=bias)
        # apply() calls weight_init on every submodule and on this module itself.
        self.apply(self.weight_init)

    def weight_init(self,m):
        """Fill every parameter reachable from ``m`` with U(-std, std), std = 1/sqrt(hidden_size).

        Because this is invoked through ``self.apply``, it runs once per
        (sub)module and ``named_parameters()`` also yields nested parameters,
        so nested parameters are redrawn on several calls. This also overwrites
        the initialisation done inside the NALU/NAC constructors.
        """

        std = 1.0 / math.sqrt(self.hidden_size)
        for name, w in m.named_parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, hidden = None):
        """Advance the cell by one time step.

        Args:
            x: input for this step; indexed as 2-D, with ``x.size(0)`` used as
                the batch size.
            hidden: optional ``(h, c)`` tuple; when omitted, both start as zeros
                of shape ``(x.size(0), hidden_size)``.

        Returns:
            ``(x + out(h_t), (nalu_h(h_t + h), nalu_c(c_t + c)))``.
        """
        
        if hidden is None:
            hidden = x.new_zeros(x.size(0), self.hidden_size, requires_grad=False)
            # h and c start out as the same zero tensor.
            hidden = (hidden, hidden)
            
        h, c = hidden
        
        preact = self.i2h(x) + self.h2h(h)
        
        # First: apply nalu to replace activation func
        
        # self.nalu(preact)
        
        # The first 3 * hidden_size columns are the sigmoid gates (input, forget,
        # output, in that order); the last hidden_size columns are the tanh candidate.
        gates = preact[:, :3 * self.hidden_size].sigmoid()
        g_t = preact[:, 3 * self.hidden_size:].tanh()
        i_t = gates[:, :self.hidden_size] 
        f_t = gates[:, self.hidden_size:2 * self.hidden_size]
        o_t = gates[:, -self.hidden_size:]
        
        # Second: Apply it in the output and hidden layer

        # Standard LSTM state update.
        c_t = (c*f_t) + (i_t*g_t)

        h_t = o_t * c_t.tanh()
        
        # return x, (h_t, c_t) # LSTM
        
        # return x + self.out(h_t), (h_t + h, c_t + c) # Residual LSTM

        return  x + self.out(h_t) , (self.nalu_h(h_t + h), self.nalu_c(c_t + c)) # Residual NALU

class NALU_LSTM(nn.Module):
    """Stack of NALU_LSTMCell layers unrolled over a sequence.

    Every layer consumes and produces tensors of width ``input_size`` (each
    cell's output is residual), so layers of different hidden sizes can be
    stacked. Each layer has learnable initial hidden/cell states per
    direction. In bidirectional mode the same layer (shared weights) is run a
    second time over the sequence in reverse order.

    Args:
        input_size: feature size of the input sequence.
        hidden_sizes: hidden size of each stacked layer.
        bidirectional: also run every layer over the reversed sequence.
    """

    def __init__(self,input_size, hidden_sizes, bidirectional = False):

        super().__init__()
        self.bidirectional = bidirectional
        self.num_dir = 2 if self.bidirectional else 1

        self.input_size = input_size
        self.L = len(hidden_sizes)
        self.layers = nn.ModuleList()
        self.layers.extend([NALU_LSTMCell(input_size,i) for i in hidden_sizes])
        # Learnable initial states, one (num_dir, 1, hidden) tensor per layer.
        self.c0 = nn.ParameterList([nn.Parameter(torch.randn(self.num_dir, 1,i)) for i in hidden_sizes])
        self.h0 = nn.ParameterList([nn.Parameter(torch.randn(self.num_dir, 1,i)) for i in hidden_sizes])


    def forward(self, input):

        '''
        input:   (B, S, input_size)
        returns: (outputs, (hiddens, cells))
            outputs: (B, num_dir, L, S, input_size), the output of every layer
                     at every time step, per direction
            hiddens, cells: lists with one entry per layer holding the final
                     state; each entry is (B, hidden) when unidirectional, or
                     (2, B, hidden) [forward, backward] when bidirectional
        '''

        B, S = input.shape[:-1]

        # Allocate on the input's device and dtype; a plain torch.zeros(...)
        # lives on the default device and breaks as soon as the input is on CUDA.
        # Slot 0 along the layer axis holds the input itself.
        outputs = input.new_zeros(B, self.num_dir, self.L + 1, S, self.input_size)
        outputs[:, :, 0, :, :] = input.unsqueeze(1).expand_as(outputs[:, :, 0, :, :])
        hiddens = []
        cells = []

        for i, layer in enumerate(self.layers):
            # Broadcast the learned initial state over the batch.
            f_h, f_c = self.h0[i][0].repeat(B, 1), self.c0[i][0].repeat(B, 1)
            for j in range(S):
                # clone() hands the layer its own copy of the slice, since
                # `outputs` is written in place on the next line.
                f_out, (f_h, f_c) = layer(outputs[:, 0, i, j, :].clone(), (f_h, f_c))
                outputs[:, 0, i + 1, j, :] = f_out

            if self.bidirectional:
                b_h, b_c = self.h0[i][1].repeat(B, 1), self.c0[i][1].repeat(B, 1)
                for j in reversed(range(S)):
                    b_out, (b_h, b_c) = layer(outputs[:, 1, i, j, :].clone(), (b_h, b_c))
                    outputs[:, 1, i + 1, j, :] = b_out
                hiddens.append(torch.stack([f_h, b_h]))
                cells.append(torch.stack([f_c, b_c]))
            else:
                hiddens.append(f_h)
                cells.append(f_c)

        # Drop slot 0 (the copied input) and return only the layer outputs.
        return outputs[:, :, 1:, :, :].contiguous(), (hiddens, cells)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls

drive  sample_data


In [None]:
cd drive/MyDrive/math_te_cha

/content/drive/MyDrive/math_te_cha


In [None]:
!ls

number_word_std.dev.json	     number_word_std.linear_t6.dev.json
number_word_std.linear.dev.json      number_word_std.linear_t6.test.json
number_word_std.linear_t2.dev.json   number_word_std.linear.test.json
number_word_std.linear_t2.test.json  number_word_std.test.json


# Load and Explore Math Data

In [None]:
import pandas as pd
# Show the full problem text / equations instead of truncating wide columns.
# None means "no limit"; the older -1 sentinel is deprecated by pandas.
pd.set_option('display.max_colwidth', None)
# NOTE: the "train" split is read from the *dev* file.
math_train = pd.read_json('number_word_std.dev.json')
# Each record stores its equations as a list; join them into one '; '-separated string.
math_train.equations = math_train.equations.apply(lambda x: '; '.join(x))
math_test = pd.read_json('number_word_std.test.json')
math_test.equations = math_test.equations.apply(lambda x: '; '.join(x))

  


## View Pairs

### Train Set 

In [None]:
math_train[['text', 'equations', 'ans_simple']]

Unnamed: 0,text,equations,ans_simple
0,one number is 11 more than another number. Find the two numbers if three times the larger exceeds four times the smaller number by 4.,"unkn: x,y; equ: x=y+11; equ: 3*x=4*y+4","[40, 29]"
1,One number is 3 less than a second number. Twice the second number is 12 less than 5 times the first. Find the two numbers.,"unkn: x,y; equ: x + 3 = y; equ: 2*y + 12 = 5*x","[6, 9]"
2,Find two numbers whose sum is 62 and whose difference is 6.,"unkn: x,y; equ: x+y=62; equ: x-y=6","[34, 28]"
3,the sum of two numbers is 68. their difference is 16. what are the numbers?,"unkn: x,y; equ: x+y=68; equ: x-y=16","[42, 26]"
4,the sum of two numbers is 97. the difference of the two numbers is 57. find the two numbers,"unkn: x,y; equ: x+y=97; equ: x-y=57","[77, 20]"
...,...,...,...
369,The sum of the reciprocals of two consecutive odd integers is 8/15. Find the integers.,"unkn: 2*n+1,2*n+3; equ: 1/(2*n+1) + 1/(2*n+3) = 8/15; equ: is_integer(n)","[3, 5]"
370,The sum of 4 consecutive odd integers is 3 more than 5 times the least of the integers. Find the integers.,"unkn: m, n, x, y; equ: m % 2 = 1; equ: m + 2 = n; equ: n + 2 = x; equ: x + 2 = y; equ: m + n + x + y = 3 + 5*m","[9, 11, 13, 15]"
371,What number can be subtracted from both the numerator and the denominator of 17/25 to form a fraction equal to 3/5?,unkn: n; equ: (17-n)/(25-n) = 3/5,[5]
372,"When the reciprocal of three times a number is subtracted from the reciprocal of the number, the result is one sixth. Find the number.",unkn: x; equ: 1/x - 1/(3*x) = 1/6,[4]


### Test Set

In [None]:
math_test[['text', 'equations', 'ans_simple']]

Unnamed: 0,text,equations,ans_simple
0,The difference of two integers is 9. Five times the smaller is 7 more than three times the larger. Find the numbers.,"unkn: x,y; equ: x-y=9; equ: 5*y=3*x+7","[26, 17]"
1,Two numbers differ by 3. Four times the lesser diminished by three times the greater is 7. Find the numbers.,"unkn: x,y; equ: x-y=3; equ: 4*y-3*x=7","[16, 19]"
2,one number is 9 more than the other number. The difference of seven times the smaller number and two times the larger is 157. What are the numbers?,"unkn: x,y; equ: x = y + 9; equ: 7*y - 2*x = 157","[44, 35]"
3,The difference between two numbers is -38. Two times the smaller number minus five times the larger number is -217. What are the numbers?,"unkn: x,y; equ: x-y=(-38); equ: 2*x-5*y=(-217)","[9, 47]"
4,one number is 4 less than another. the difference of twice the smaller and 5 times the larger number is -11. What are the numbers?,"unkn: x, y; equ: x = y - 4; equ: 2*x - 5*y = -11","[-3, 1]"
...,...,...,...
1499,if 5^21 x 4^11= 2 x 10^n what is the value of n?,unkn: n; equ: 5^21 * 4^11= 2 * 10^n,[21]
1500,"Find the sum of a geometric sequence whose first term is 54, whose last (nth) term is 2, and whose common ratio is 1/3.",unkn: s; equ: s = 54 * ((1/3)^n - 1) / (1/3 - 1); equ: 2 = 54 * (1/3)^(n-1),[80]
1501,"Find the sum of the terms in the geometric series with a_1 = -9, r = -4, and a_n = 9,216?",unkn: s; equ: 9216 = (-9)*(-4)^(n-1); equ: s = (-9) * ((-4)^n - 1) / (-4 - 1),[7371]
1502,"What is the 7th term of the geometric sequence where a1 = -4 and a5 = -1,024?",unkn: x; equ: -1024 = -4*r^(5-1); equ: x = -4*r^(7-1),[-16384]


In [None]:
# Longest problem text, counted in space-separated tokens, across both splits.
# The + 1 leaves room for the EOS token that tensorFromSentence appends:
# without it the longest sentence produces MAX_LENGTH + 1 encoder steps and
# overflows the (MAX_LENGTH, hidden_size) encoder_outputs buffer with an IndexError.
MAX_LENGTH = max(math_train.text.apply(lambda x: len(x.split(' '))).max(),
                 math_test.text.apply(lambda x: len(x.split(' '))).max()) + 1
MAX_LENGTH

77

# Create Vocab 

In [None]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one side of the data: word <-> index maps plus word counts.

    Indices 0 and 1 are reserved for the SOS and EOS markers, so the first
    real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

# Normalization Functions

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with all combining marks (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Strip accents, put every non-alphanumeric symbol in its own token, drop the
# leading "unkn: ..." clause and collapse whitespace. (Case is left unchanged.)

def normalizeString(s):
    """Normalize a '; '-joined equation string into space-separated tokens.

    Steps: strip accents, surround every non-alphanumeric character with
    spaces, discard everything up to and including the first ';' (the
    "unkn: ..." clause), then collapse runs of whitespace.

    A string without any ';' has no clause to drop and is kept whole, where
    ``str.index`` used to raise ValueError.
    """
    s = unicodeToAscii(s)
    # ensure each math symbol is its own token
    s = "".join(c if c.isalnum() else " {} ".format(c) for c in s)
    # remove the unknowns since tools like wolfram are good at identifying them
    if ';' in s:
        s = s[s.index(';') + 1:]
    # remove all extra whitespaces
    return " ".join(s.split())

In [None]:
# Normalize the equation strings in place.
# NOTE: normalizeString drops everything up to the first ';', so this cell is
# not idempotent: re-running it on already-normalized data strips one more clause.
math_train.equations = math_train.equations.apply(lambda x: normalizeString(x)) 
math_test.equations = math_test.equations.apply(lambda x: normalizeString(x)) 

In [None]:
input_lang, output_lang = Lang('text'), Lang('equations')

# Each pair is [problem text, normalized equation string].
train_pairs = [list(rec) for rec in math_train[['text', 'equations']].to_records(index=False)]
test_pairs = [list(rec) for rec in math_test[['text', 'equations']].to_records(index=False)]

# Both splits feed the vocabularies, train first and then test.
for pair in train_pairs + test_pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])
print(random.choice(train_pairs))

['if the sum of lcm and gcd of 2 consecutive numbers is 157, what is the sum of these numbers?', 'equ : s = n + ( n + 1 ) ; equ : 1 + n ( n + 1 ) = 157']


# The Seq2Seq Model


## Encoder

In [None]:
class EncoderRNN(nn.Module):
    """Token-by-token GRU encoder: embeds one token index and advances the GRU one step."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # input_size is the vocabulary size; embeddings have width hidden_size.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token.

        The embedding is reshaped to (1, 1, hidden_size), i.e. sequence length 1
        and batch size 1, before the GRU step. Returns ``(output, hidden)``.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

## AttnDecoder

In [None]:
class AttnDecoderRNN(nn.Module):
    """One-step GRU decoder with attention over a fixed-length encoder buffer.

    Attention weights are predicted from the current token embedding and the
    previous hidden state by a linear layer with ``max_length`` outputs, so
    ``encoder_outputs`` must have exactly ``max_length`` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        two_h = self.hidden_size * 2
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(two_h, self.max_length)
        self.attn_combine = nn.Linear(two_h, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one token.

        Returns ``(log_probs, hidden, attn_weights)`` where ``log_probs`` is the
        log-softmax over the output vocabulary and ``attn_weights`` is the
        softmax over the ``max_length`` encoder positions.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention scores from [embedding ; previous hidden].
        attn_scores = self.attn(torch.cat((token_vec[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we will need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the
EOS token to both sequences.



In [None]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index in ``lang.word2index``.

    Raises KeyError for a token that is not in the vocabulary.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Return a (num_tokens + 1, 1) long tensor of word indexes ending with EOS_token."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a [text, equations] pair into ``(input_tensor, target_tensor)``."""
    source_text, target_text = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_text),
        tensorFromSentence(output_lang, target_text),
    )

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but [when the trained
network is exploited, it may exhibit
instability](http://minds.jacobs-university.de/sites/default/files/uploads/papers/ESNTutorialRev.pdf).

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can "pick
up" the meaning once the teacher tells it the first few words, but it
has not properly learned how to create the sentence from the translation
in the first place.

Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
``teacher_forcing_ratio`` up to use more of it.

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token, then the target is decoded either with
    teacher forcing (probability ``teacher_forcing_ratio``) or by feeding the
    decoder's own greedy prediction back in, stopping early on EOS.

    Returns the summed loss divided by the full target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer; rows past input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for pos in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[pos], encoder_hidden)
        encoder_outputs[pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for pos in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[pos])

        if use_teacher_forcing:
            # Teacher forcing: the ground-truth token is the next input.
            decoder_input = target_tensor[pos]
            continue

        # Otherwise feed back the greedy prediction, detached from the graph.
        _, best_index = decoder_output.topk(1)
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` given a start time and the fraction done.

    ``percent`` is a fraction in (0, 1]; the remaining time is extrapolated
    linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn training pairs, then plot the loss curve.

    Prints the running average loss every ``print_every`` steps and records
    one plot point every ``plot_every`` steps.

    Note: ``learning_rate`` is accepted but not forwarded to the optimizers;
    both use Adam with its default settings.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every steps
    plot_loss_total = 0  # reset every plot_every steps

    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())
    # Sample (with replacement) and tensorize all pairs up front.
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.



In [None]:
# No plt.switch_backend('agg') here: 'agg' is a non-interactive, file-only
# backend, so switching to it inside the notebook makes plt.show() display
# nothing and the loss/attention figures never appear in the cell output.
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the recorded loss values with y-axis ticks every 0.2.

    Creates exactly one figure per call; an extra ``plt.figure()`` before
    ``plt.subplots()`` would leave a second, empty figure open each time.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.


In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return ``(decoded_words, attentions)``.

    Both modules are switched to eval mode for the duration of the call, so
    dropout is disabled and decoding is deterministic. Their previous
    training/eval mode is restored afterwards, so training can continue
    unchanged.

    Args:
        encoder, decoder: the trained modules.
        sentence: input text; every space-separated token must be in the
            input vocabulary.
        max_length: size of the encoder buffer and maximum number of decoded
            tokens.

    Returns:
        decoded_words: predicted tokens, ending with '<EOS>' if it was produced.
        attentions: one row of attention weights per decoding step performed.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # Feed the greedy prediction back in as the next input.
                decoder_input = topi.squeeze().detach()

            # Keep only the attention rows of the steps actually performed.
            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

We can evaluate random sentences from the test set and print out the
input, target, and output to make some subjective quality judgements:


In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print input ('>'), reference ('=') and prediction ('<') for ``n`` random test pairs."""
    for _ in range(n):
        sample = random.choice(test_pairs)
        question, reference = sample[0], sample[1]
        print('>', question)
        print('=', reference)
        predicted_words, _ = evaluate(encoder, decoder, question)
        print('<', ' '.join(predicted_words))
        print('')

Training and Evaluating
=======================

With all these helper functions in place (it looks like extra work, but
it makes it easier to run multiple experiments) we can actually
initialize a network and start training.

Remember that the input sentences were heavily filtered. For this small
dataset we can use relatively small networks of 256 hidden nodes and a
single GRU layer.

**Note:** If you run this notebook you can train, interrupt the kernel,
evaluate, and continue training later. Comment out the lines where the
encoder and decoder are initialized and run ``trainIters`` again.




In [None]:
# Width shared by the embeddings and the GRU state of both encoder and decoder.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.2).to(device)

# Only 50 single-pair updates, with the loss logged after every step.
trainIters(encoder1, attn_decoder1, 50, print_every=1)

0m 0s (- 0m 7s) (1 2%) 6.0084
0m 0s (- 0m 6s) (2 4%) 5.7657
0m 0s (- 0m 6s) (3 6%) 5.6221
0m 0s (- 0m 7s) (4 8%) 5.7414
0m 0s (- 0m 7s) (5 10%) 5.2937
0m 0s (- 0m 6s) (6 12%) 4.9521
0m 1s (- 0m 6s) (7 14%) 4.2893
0m 1s (- 0m 6s) (8 16%) 4.8399
0m 1s (- 0m 5s) (9 18%) 4.4413
0m 1s (- 0m 5s) (10 20%) 3.7816
0m 1s (- 0m 6s) (11 22%) 4.0559
0m 1s (- 0m 5s) (12 24%) 2.6468
0m 1s (- 0m 5s) (13 26%) 3.2175
0m 2s (- 0m 5s) (14 28%) 3.6204
0m 2s (- 0m 5s) (15 30%) 3.4000
0m 2s (- 0m 5s) (16 32%) 3.9219
0m 2s (- 0m 5s) (17 34%) 3.2652
0m 2s (- 0m 5s) (18 36%) 2.8869
0m 3s (- 0m 4s) (19 38%) 2.7847
0m 3s (- 0m 4s) (20 40%) 2.6839
0m 3s (- 0m 4s) (21 42%) 2.7054
0m 3s (- 0m 4s) (22 44%) 3.6195
0m 3s (- 0m 4s) (23 46%) 3.5382
0m 4s (- 0m 4s) (24 48%) 3.2953
0m 4s (- 0m 4s) (25 50%) 2.4645
0m 4s (- 0m 4s) (26 52%) 2.3769
0m 4s (- 0m 3s) (27 54%) 3.9385
0m 4s (- 0m 3s) (28 56%) 3.5920
0m 4s (- 0m 3s) (29 57%) 2.5540
0m 5s (- 0m 3s) (30 60%) 2.1523
0m 5s (- 0m 3s) (31 62%) 4.0917
0m 5s (- 0m 3s) (32 6

In [None]:
evaluateRandomly(encoder1, attn_decoder1)

> the sum of two consecutive even integers is -298. what are the integers?
= equ : 2 * k + ( 2 * k + 2 ) = - 298
< equ : x + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2

> The sum of two numbers is 55. The smaller is 5 less than the larger. What are the numbers
= equ : x + y = 55 ; equ : x = y - 5
< equ : x + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2

> What is the number that is one half of one quarter of one tenth of 400?
= equ : x = 1 / 2 * 1 / 4 * 1 / 10 * 400
< equ : x + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2

> Find four consecutive even integers such that the sum of the squares of the first and the second is 12 more than the last.
= equ : x ^ 2 + ( x + 2 ) ^ 2 - 12 = x + 

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map (rows: output tokens, columns: input positions)."""
    # Figure with the heat map and its colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    heatmap = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(heatmap)

    # Tick labels: input tokens (plus <EOS>) along x, output tokens along y
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    y_labels = [''] + output_words
    ax.set_xticklabels(x_labels, rotation=90)
    ax.set_yticklabels(y_labels)

    # One tick, and hence one label, per cell on both axes
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode ``input_sentence`` with encoder1/attn_decoder1, print the result and plot the attention."""
    decoded, attn = evaluate(encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(decoded))
    print(len(decoded))
    showAttention(input_sentence, decoded, attn)


evaluateAndShowAttention("the sum of the digits of a 2-digit number is 7. The tens digit is one less than 3 times the units digit. Find the number.")



input = the sum of the digits of a 2-digit number is 7. The tens digit is one less than 3 times the units digit. Find the number.
output = equ : x + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2 * k + 2
77


In [None]:
import azureml.core

print("SDK Version:", azureml.core.VERSION)
from azureml.core import Workspace

# Placeholders: fill in your own Azure identifiers before running.
subscription_id ='<subscription-id>'
resource_group ='<resource-group>'
workspace_name = '<workspace-name>'

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    ws.write_config()
    print('Workspace configuration succeeded. You are all set!')
except Exception as err:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit still propagate, and report the actual cause: failures
    # here are not always "workspace not found" (e.g. authentication errors).
    print('Workspace not found. Run the cells below.')
    print('Reason: {}: {}'.format(type(err).__name__, err))


ModuleNotFoundError: ignored