In [None]:
%matplotlib inline

import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.optim as optim

import random
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook

In [None]:
# set random seed to 0
np.random.seed(0)
torch.manual_seed(0)
torch.set_default_tensor_type('torch.DoubleTensor')

In [None]:
def create_data(prefix_chars, suffix_chars, prefix_length, suffix_length):
    """
    prefix_chars: iterable
    suffix_chars: iterable
    prefix_length: int
    suffix_length: int
    return: take every possible combination of prefixes of length prefix_length
        and take every possible combination of suffixies of length suffix_length,
        then create every possible combination of those.
        Generates len(prefix_chars)^prefix_length * len(suffix_chars)^suffix_length
    """
    prefixes = list(itertools.product(prefix_chars, repeat=prefix_length))
    suffixes = list(itertools.product(suffix_chars, repeat=suffix_length))
    return [p + s for p, s in itertools.product(prefixes, suffixes)]
    
def vectorize_2d(seq, vocab_size):
    vectors = np.zeros((len(seq), vocab_size), dtype=np.float64)
    for i in range(len(seq)):
        vectors[i][seq[i]] = 1
    return torch.tensor(vectors)

In [4]:
def create_output(seq):
    """
    pointer: index to inspect next (attention direction)
    task1: cur^2 mod n (rule-learning)
    task2: value of right-neighbor (attention direction, holding memory, write vs. no write)
    task3: sum of self and left and right neighbors, mod n (combination of tasks 1 and 2)
    """
    pointer_outputs = []
    task1_outputs = []
    task2_outputs = []
    task3_outputs = []
    
    cur = 0
    for i in range(len(seq)):
        
        task1_out = (seq[cur]**2)%len(seq)
        task2_out = seq[(cur+1)%len(seq)]
        
        task3_prev = seq[(cur-1)%len(seq)]
        task3_next = seq[(cur+1)%len(seq)]
        task3_out = (task3_prev + seq[cur] + task3_next)%len(seq)
        
        cur = seq[cur]
        pointer_outputs.append(cur)
        task1_outputs.append(task1_out)
        task2_outputs.append(task2_out)
        task3_outputs.append(task3_out)
        
    return torch.tensor(pointer_outputs), torch.tensor(task1_outputs), torch.tensor(task2_outputs), torch.tensor(task3_outputs)

In [5]:
def training_sequence_1(seq):
    """
    Trains goto, task1, and task2. e.g.
    Iter 1: Compute task 1, move to i=1 to perform task 2
        Input: seq[0]
        Goto: 1
        Task 1: seq[0]^2
        Task 2: No-op
    Iter 2: Compute task 2, move to i=seq[0] to perform task 1
        Input: seq[1]
        Goto: seq[0]
        Task 1: No-op
        Task 2: seq[1]
    etc...
    """
    n = len(seq)
    no_op = len(seq)
    pointer_outputs, task1_outputs, task2_outputs, task3_outputs = create_output(seq)
    X, Y1, Y2, Y3 = [], [], [], []
    index = 0
    for i in range(n):
        X.append(seq[index])
        X.append(seq[(index+1)%n])
        Y1.append((index+1)%n)
        Y1.append(pointer_outputs[i])
        Y2.append(task1_outputs[i])
        Y2.append(no_op)
        Y3.append(no_op)
        Y3.append(task2_outputs[i])
        index = pointer_outputs[i]
    return torch.tensor(X), torch.tensor(Y1), torch.tensor(Y2), torch.tensor(Y3)

In [173]:
class Model1(nn.Module):
    def __init__(self, input_dim, pointer_dim, task1_dim, task2_dim, hidden_layer_size):
        super(Model1, self).__init__()
        self.input_dim = input_dim
        self.pointer_dim = pointer_dim
        self.task1_dim = task1_dim
        self.task2_dim = task2_dim
        self.hidden_layer_size = hidden_layer_size
        
        self.lstm = nn.LSTMCell(input_dim, hidden_layer_size)
        self.lin_pointer = nn.Linear(hidden_layer_size, pointer_dim)
        self.lin_task1 = nn.Linear(hidden_layer_size, task1_dim)
        self.lin_task2 = nn.Linear(hidden_layer_size, task2_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        pointer_outputs, task1_outputs, task2_outputs = [], [], []
        h_t = torch.zeros(1, self.hidden_layer_size, dtype=torch.double)
        c_t = torch.zeros(1, self.hidden_layer_size, dtype=torch.double)
        
        for x_t in x.reshape(len(x), 1, self.input_dim):
            h_t, c_t = self.lstm(x_t, (h_t, c_t))
            pointer_outputs.append(self.softmax(self.lin_pointer(h_t)))
            task1_outputs.append(self.softmax(self.lin_task1(h_t)))
            task2_outputs.append(self.softmax(self.lin_task2(h_t)))
        
        return torch.stack(pointer_outputs).reshape(len(x), self.pointer_dim), \
                torch.stack(task1_outputs).reshape(len(x), self.task1_dim), \
                torch.stack(task2_outputs).reshape(len(x), self.task2_dim)

In [178]:
max_digit = 7
split = 4
train_data = create_data(range(split), range(split, max_digit), split, max_digit-split)
X, Y1, Y2, Y3 = [], [], [], []
for datum in train_data:
    inputs, pointer_outputs, task1_outputs, task2_outputs = training_sequence_1(datum)
    X.append(vectorize_2d(inputs, max_digit))
    Y1.append(pointer_outputs)
    Y2.append(task1_outputs)
    Y3.append(task2_outputs)

In [184]:
model = Model1(max_digit, max_digit, max_digit+1, max_digit+1, 30)
model.double()
optimizer = optim.Adam(model.parameters())

closures = []

for i in range(len(X)):
    x, y1, y2, y3 = X[i], Y1[i], Y2[i], Y3[i]
    def closure():
        optimizer.zero_grad()
        pred1, pred2, pred3 = model(x)
        pointer_loss = nn.functional.nll_loss(pred1, pointer_outputs)
        task1_loss = nn.functional.nll_loss(pred2, task1_outputs)
        task2_loss = nn.functional.nll_loss(pred3, task2_outputs)
        loss = pointer_loss + task1_loss + task2_loss

        loss.backward()
        return loss
    closures.append(closure)

In [185]:
for i in tqdm_notebook(range(10)):
    total_loss = 0
    random.shuffle(closures)
    for closure in closures:
        total_loss += float(optimizer.step(closure))
    print(total_loss)

-20158.072274769343
-20735.874594785273
-20735.996443550714
-20735.999879440817
-20735.999989012507
-20735.999995948823
-20735.999997528943
-20735.999998216656
-20735.999998603515
-20735.99999885207



In [174]:
model = Model1(7, 7, 8, 8, 30)
model.double()

optimizer = optim.Adam(model.parameters())
inputs, pointer_outputs, task1_outputs, task2_outputs = training_sequence_1([2,4,6,0,5,3,1])
X = vectorize_2d(inputs, 7)
Y1 = vectorize_2d(pointer_outputs, 7)
Y2 = vectorize_2d(task1_outputs, 8)
Y3 = vectorize_2d(task2_outputs, 8)

def closure():
    optimizer.zero_grad()
    pred1, pred2, pred3 = model(X)
    pointer_loss = nn.functional.nll_loss(pred1, pointer_outputs)
    task1_loss = nn.functional.nll_loss(pred2, task1_outputs)
    task2_loss = nn.functional.nll_loss(pred3, task2_outputs)
    loss = pointer_loss + task1_loss + task2_loss
    
    loss.backward()
    return loss

In [175]:
for i in range(1000):
    loss = optimizer.step(closure)
    print(loss)
#     pointer_loss, task1_loss, task2_loss = optimizer.step(closure)
#     pointer_loss, task1_loss, task2_loss = closure()
#     print(float(pointer_loss)+float(task1_loss)+float(task2_loss))
#     print('ptr', float(pointer_loss))
#     print('task1', float(task1_loss))
#     print('task2', float(task2_loss))

tensor(-0.3817, grad_fn=<ThAddBackward>)
tensor(-0.3827, grad_fn=<ThAddBackward>)
tensor(-0.3837, grad_fn=<ThAddBackward>)
tensor(-0.3846, grad_fn=<ThAddBackward>)
tensor(-0.3856, grad_fn=<ThAddBackward>)
tensor(-0.3866, grad_fn=<ThAddBackward>)
tensor(-0.3876, grad_fn=<ThAddBackward>)
tensor(-0.3887, grad_fn=<ThAddBackward>)
tensor(-0.3897, grad_fn=<ThAddBackward>)
tensor(-0.3908, grad_fn=<ThAddBackward>)
tensor(-0.3919, grad_fn=<ThAddBackward>)
tensor(-0.3931, grad_fn=<ThAddBackward>)
tensor(-0.3942, grad_fn=<ThAddBackward>)
tensor(-0.3955, grad_fn=<ThAddBackward>)
tensor(-0.3967, grad_fn=<ThAddBackward>)
tensor(-0.3980, grad_fn=<ThAddBackward>)
tensor(-0.3994, grad_fn=<ThAddBackward>)
tensor(-0.4008, grad_fn=<ThAddBackward>)
tensor(-0.4024, grad_fn=<ThAddBackward>)
tensor(-0.4039, grad_fn=<ThAddBackward>)
tensor(-0.4056, grad_fn=<ThAddBackward>)
tensor(-0.4074, grad_fn=<ThAddBackward>)
tensor(-0.4094, grad_fn=<ThAddBackward>)
tensor(-0.4114, grad_fn=<ThAddBackward>)
tensor(-0.4136, 

In [18]:
task2_loss

tensor(-0.1259, grad_fn=<NllLossBackward>)

In [None]:
torch.stack([torch.tensor([[1,2,3,4]]), torch.tensor([[4,3,2,1]])])

In [None]:
print(type([torch.tensor([[1,2,3,4]]), torch.tensor([[4,3,2,1]])]))

In [None]:
type(torch.tensor([[1,2,3,4]]))

In [105]:
model.lin_task2.weight.shape

torch.Size([8, 30])

In [108]:
model.lin_task2.bias

Parameter containing:
tensor([ 0.0158,  0.1807,  0.0529, -0.1628, -0.1240,  0.0059, -0.1141,  0.1313],
       requires_grad=True)

In [157]:
class Model1b(nn.Module):
    def __init__(self, input_dim, pointer_dim):
        super(Model1b, self).__init__()
        self.input_dim = input_dim
        self.pointer_dim = pointer_dim
        
        self.lin_pointer = nn.Linear(input_dim, pointer_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        pointer_outputs = []
        
        for x_t in x.reshape(len(x), 1, self.input_dim):
#             h_t, c_t = self.lstm(x_t, (h_t, c_t))
            pointer_outputs.append(self.softmax(self.lin_pointer(x_t)))
        
        return torch.stack(pointer_outputs).reshape(len(x), self.pointer_dim)

In [160]:
model = Model1b(7, 7)
model.double()

optimizer = optim.Adam(model.parameters())
inputs, pointer_outputs, task1_outputs, task2_outputs = training_sequence_1([2,4,6,0,5,3,1])
X = vectorize_2d(inputs, 7)
Y1 = vectorize_2d(pointer_outputs, 7)


def closure():
    optimizer.zero_grad()
    pred1 = model(X)
    pointer_loss = nn.functional.nll_loss(pred1, pointer_outputs)
    loss = pointer_loss
#     pointer_loss.backward(retain_graph=True)
#     task1_loss.backward(retain_graph=True)
#     task2_loss.backward(retain_graph=True)
    
    loss.backward()
    return loss #pointer_loss, task1_loss, task2_loss

In [161]:
for i in range(10000):
    loss = optimizer.step(closure)
    print(loss)

tensor(-0.1501, grad_fn=<NllLossBackward>)
tensor(-0.1504, grad_fn=<NllLossBackward>)
tensor(-0.1506, grad_fn=<NllLossBackward>)
tensor(-0.1508, grad_fn=<NllLossBackward>)
tensor(-0.1511, grad_fn=<NllLossBackward>)
tensor(-0.1513, grad_fn=<NllLossBackward>)
tensor(-0.1516, grad_fn=<NllLossBackward>)
tensor(-0.1518, grad_fn=<NllLossBackward>)
tensor(-0.1520, grad_fn=<NllLossBackward>)
tensor(-0.1523, grad_fn=<NllLossBackward>)
tensor(-0.1525, grad_fn=<NllLossBackward>)
tensor(-0.1527, grad_fn=<NllLossBackward>)
tensor(-0.1530, grad_fn=<NllLossBackward>)
tensor(-0.1532, grad_fn=<NllLossBackward>)
tensor(-0.1535, grad_fn=<NllLossBackward>)
tensor(-0.1537, grad_fn=<NllLossBackward>)
tensor(-0.1539, grad_fn=<NllLossBackward>)
tensor(-0.1542, grad_fn=<NllLossBackward>)
tensor(-0.1544, grad_fn=<NllLossBackward>)
tensor(-0.1547, grad_fn=<NllLossBackward>)
tensor(-0.1549, grad_fn=<NllLossBackward>)
tensor(-0.1551, grad_fn=<NllLossBackward>)
tensor(-0.1554, grad_fn=<NllLossBackward>)
tensor(-0.1

KeyboardInterrupt: 