In [1]:
%matplotlib inline

SUDOKU_PATH = '/home/ajhnam/sudoku'

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools
import random
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
import time

import sys
sys.path.append(SUDOKU_PATH + '/src/sudoku')

from board import Board
from grid_string import GridString, read_solutions_file
from shuffler import Shuffler
from shuffled_grid import ShuffledGrid
from solutions import Solutions
from dataset import Dataset
import utils

In [2]:
# set random seed to 0
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.set_default_tensor_type('torch.DoubleTensor')
device = 4

In [3]:
def determine_edges(dim_x, dim_y):
    """
    Returns a 2-d array of (max_digit**2, n) where the i_th entry is a list of
        other cells' indices that cell i shares a house with
    """
    max_digit = dim_x*dim_y
    edges = []
    for row in range(max_digit):
        row_edges = []
        for col in range(max_digit):
            # row & column
            col_edges = {(row, i) for i in range(max_digit)}
            col_edges |= {(i, col) for i in range(max_digit)}
            
            # box
            x_min = (row // dim_x) * dim_x
            y_min = (col // dim_y) * dim_y
            col_edges |= set(itertools.product(range(x_min, x_min+dim_x), range(y_min, y_min+dim_y)))
            
            # removing self
            col_edges -= {(row, col)}
            col_edges = [row*max_digit + col for row, col in col_edges]
            row_edges.append(sorted(col_edges))
        edges.append(row_edges)
    edges = torch.tensor(edges)
    shape = edges.shape
    return edges.reshape(max_digit**2, shape[2])

def encode_input(grid_string: GridString):
    return torch.tensor(list(grid_string.traverse_grid()))

def encode_output(grid_string: GridString):
    return torch.tensor(list(grid_string.traverse_grid())) - 1

In [4]:
class MLP(nn.Module):
    def __init__(self, layer_sizes):
        super(MLP, self).__init__()
        self.layer_sizes = layer_sizes
        
        self.layers = nn.ModuleList()
        self.nonlinear = nn.ReLU()
        
        prev_layer_size = self.layer_sizes[0]
        for size in self.layer_sizes[1:]:
            self.layers.append(nn.Linear(prev_layer_size, size))
            prev_layer_size = size

    def forward(self, X):
        vector = X
        for layer in self.layers[:-1]:
            vector = self.nonlinear(layer(vector))
        return self.layers[-1](vector)

class RRN(nn.Module):
    def __init__(self, dim_x, dim_y, embed_size=16, hidden_layer_size=96):
        super(RRN, self).__init__()
        self.max_digit = dim_x * dim_y
        self.embed_size = embed_size
        self.hidden_layer_size = hidden_layer_size
        
        self.edges = determine_edges(dim_x, dim_y)


        self.embed_layer = nn.Embedding(self.max_digit+1, self.embed_size)
        self.input_mlp = MLP([self.embed_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size])
        
        self.f = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        self.g_mlp = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        self.g_lstm = nn.LSTM(self.hidden_layer_size, self.hidden_layer_size)
        self.r = MLP([self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.max_digit])
    
    def compute_messages(self, H):
        messages = torch.zeros(H.shape)
        batch_size = H.shape[0]
        num_nodes = H.shape[1]
        for puzzle_index in range(batch_size): # for puzzle in batch
            messages[puzzle_index] = torch.tensor([torch.sum(H[puzzle_index][self.edges[n]]) for n in range(num_nodes)])
        return messages
                    

    def forward(self, grids, iters):
        batch_size = len(grids)
        num_nodes = self.max_digit**2
        edges_per_nodes = self.edges.shape[1]
        
        embeddings = self.embed_layer(grids)
        X = self.input_mlp(embeddings)
        H = torch.tensor(X).cuda(grids.get_device())
        g_lstm_h = H.reshape(1, batch_size*num_nodes, self.hidden_layer_size)
        g_lstm_c = torch.randn(1, batch_size*num_nodes, self.hidden_layer_size).cuda(grids.get_device())
        
        
        outputs = []
        for i in range(iters):
            M = torch.zeros(batch_size, self.max_digit**2, self.hidden_layer_size).cuda(grids.get_device())
            for node in range(num_nodes):
                msgs = torch.cat([self.f(torch.cat([H[:,node,:], H[:,other,:]], dim=1)) for other in self.edges[node]])
                msgs = msgs.reshape(edges_per_nodes, batch_size, self.hidden_layer_size).permute(1,0,2)
                M[:,node,:] = torch.sum(msgs, dim=1)
            
            input_to_g_lstm = self.g_mlp(torch.cat([X, M], dim=2)).reshape(1, batch_size*num_nodes, self.hidden_layer_size)
            
            _, (g_lstm_h, g_lstm_c) = self.g_lstm(input_to_g_lstm, (g_lstm_h, g_lstm_c))
            H = g_lstm_h.reshape(H.shape)
            output = self.r(H)
            
            outputs.append(output)
                
        return outputs

In [5]:
max_digit = 4
dst = Dataset.load(SUDOKU_PATH + '/data/puzzles.dst')

num_cells = max_digit**2
cell_vec_dim = max_digit + 1
train_inputs = dst.get_input_data(0)
train_outputs = dst.get_output_data(0)
train_x = torch.cat([encode_input(p) for p in train_inputs]).reshape(len(train_inputs), num_cells).cuda(device)
train_y = torch.cat([encode_output(p) for p in train_outputs]).reshape(len(train_outputs), num_cells)#.cuda(device)

In [8]:
start = time.time()

device = 0
device_ids = [0, 2]

model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32)
model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

train_y_gpu = {device: train_y.cuda(device) for device in device_ids}

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y_gpu[p.get_device()]) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(25)):
    print(optimizer.step(closure))
    
end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

  self.dropout, self.training, self.bidirectional, self.batch_first)


tensor(44.4990, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4724, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4489, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4305, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4136, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3835, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3620, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3522, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3435, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3342, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3252, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3168, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2907, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2571, device='cuda:0'

In [6]:
start = time.time()

device = 0
train_y_cuda = train_y.cuda(device)

model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(25)):
    print(optimizer.step(closure))
    
end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1544176307774/work/aten/src/THC/generic/THCTensorIndex.cu:519

In [7]:
start = time.time()

device = 0
device_ids = [0,1,2,3,4]

model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32)
model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

train_y_gpu = {device: train_y.cuda(device) for device in device_ids}

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y_gpu[p.get_device()]) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(25)):
    print(optimizer.step(closure))
    
end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

  self.dropout, self.training, self.bidirectional, self.batch_first)


tensor(44.3883, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3794, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3727, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3671, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3556, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3507, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3384, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3308, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3099, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2969, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2788, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2562, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.2293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.1948, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.1527, device='cuda:0'

In [14]:
start = time.time()

device = 0
device_ids = list(range(9))

model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32)
model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

train_y_gpu = {device: train_y.cuda(device) for device in device_ids}

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y_gpu[p.get_device()]) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(25)):
    print(optimizer.step(closure))
    
end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

  self.dropout, self.training, self.bidirectional, self.batch_first)


tensor(44.6348, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.6106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.5881, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.5661, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.5444, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.5229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.5018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4436, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4241, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.4058, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3852, device='cuda:0', grad_fn=<AddBackward0>)
tensor(44.3653, device='cuda:0', grad_fn=<AddBackward0>)


KeyboardInterrupt: 

In [13]:
def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(250)):
    print(optimizer.step(closure))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

tensor(44.3883, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3800, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3728, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3670, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3618, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3567, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3511, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3461, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3405, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3338, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3249, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3141, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.3006, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.2832, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.2634, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.2363, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.2043, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(44.1635

tensor(8.8192, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(8.0575, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(8.4432, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(8.2004, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.9485, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.8166, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.7160, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.4770, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.2847, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(7.1249, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.7020, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.5377, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.3583, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.5894, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.3154, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(6.3246, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(5.9659, device='cuda:1', grad_fn=<ThAddBackward>)
tensor(5.7095, device='cuda:1',

In [47]:
model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32)
model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(250)):
    print(optimizer.step(closure))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

X at device 1: torch.Size([500, 16, 32])
H at device 4: torch.Size([500, 16, 32])
LSTM_H at device 4: torch.Size([1, 8000, 32])
LSTM_C at device 4: torch.Size([1, 8000, 32])
X at device 0: torch.Size([500, 16, 32])
H at device 4: torch.Size([500, 16, 32])
LSTM_H at device 4: torch.Size([1, 8000, 32])
LSTM_C at device 4: torch.Size([1, 8000, 32])





RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1544176307774/work/aten/src/THC/generic/THCTensorMathBlas.cu:253

In [15]:
torch.save(model.state_dict(), SUDOKU_PATH + "/models/250.pth")

In [6]:
model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda(device)
model.load_state_dict(torch.load(SUDOKU_PATH + "/models/250.pth"), strict=False)
model.eval()

RRN(
  (embed_layer): Embedding(5, 6)
  (input_mlp): MLP(
    (layers): ModuleList(
      (0): Linear(in_features=6, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, bias=True)
      (2): Linear(in_features=32, out_features=32, bias=True)
    )
    (nonlinear): ReLU()
  )
  (f): MLP(
    (layers): ModuleList(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, bias=True)
      (2): Linear(in_features=32, out_features=32, bias=True)
    )
    (nonlinear): ReLU()
  )
  (g_mlp): MLP(
    (layers): ModuleList(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, bias=True)
      (2): Linear(in_features=32, out_features=32, bias=True)
    )
    (nonlinear): ReLU()
  )
  (g_lstm): LSTM(32, 32)
  (r): MLP(
    (layers): ModuleList(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): Linear(in_features=32, out_features=32, b

In [10]:
test_inputs = dst.get_input_data(1)
test_outputs = dst.get_output_data(1)
with torch.no_grad():
    test_x = torch.cat([encode_input(p) for p in test_inputs]).reshape(len(test_inputs), num_cells).cuda(device)
    test_y = torch.cat([encode_output(p) for p in test_outputs]).reshape(len(test_outputs), num_cells).cuda(device)

    softmax = nn.Softmax(dim=2)
    predictions = [softmax(p) for p in model(test_x, 32)]
    output = predictions[-1].argmax(dim=2)

In [13]:
output

tensor([[0, 1, 2,  ..., 3, 0, 1],
        [1, 0, 2,  ..., 1, 3, 2],
        [0, 2, 1,  ..., 3, 0, 1],
        ...,
        [1, 0, 2,  ..., 3, 0, 1],
        [2, 1, 3,  ..., 2, 0, 1],
        [1, 0, 2,  ..., 2, 1, 0]], device='cuda:4')

In [8]:
incorrects = []
for i in range(len(test_inputs)):
    if not bool(torch.all(output[i] == test_y[i])):
        incorrects.append(i)

In [9]:
len(incorrects)

16

In [40]:
test_inputs = list(dst.data)
test_outputs = [dst.data[i] for i in test_inputs]
test_x = torch.cat([encode_input(p) for p in test_inputs]).reshape(len(test_inputs), num_cells).cuda(device)
test_y = torch.cat([encode_output(p) for p in test_outputs]).reshape(len(test_outputs), num_cells).cuda(device)

softmax = nn.Softmax(dim=2)
predictions = [softmax(p) for p in model(test_x, 32)]
output = predictions[-1].argmax(dim=2)

RuntimeError: CUDA error: out of memory

TypeError: 'dict' object is not callable

In [37]:
dst.split_boundaries

[1000, 2000]

In [62]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np


class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.conv = nn.Conv2d(3, 3, 3, 1, 0)

    def forward(self, x):
        print('{}, {}'.format(x.get_device(), x.size()))
        
        size = [int(s * 0.5) for s in x.shape[2:]]
        a = self.conv(x)
        b = F.upsample(x, size=size, mode='bilinear', align_corners=True)
        b = self.conv(b)
        c = F.upsample(b, size=a.shape[2:], mode='bilinear', align_corners=True)

        return a, b, c

data = torch.rand(5, 3, 32, 32).cuda()

data = Variable(data)

model = MyModule()
model = torch.nn.DataParallel(model, device_ids=None).cuda()
# model = nn.DataParallel(model)
# model.cuda()

outputs = model(data)

loss = 0

target_a = np.random.randint(0, 3, size=(5, 30, 30))
target_a = torch.from_numpy(target_a).long().cuda()
target_a = Variable(target_a, requires_grad=False)
loss += F.nll_loss(F.log_softmax(outputs[0], dim=1), target_a, ignore_index=-1)

target_b = np.random.randint(0, 3, size=(5, 14, 14))
target_b = torch.from_numpy(target_b).long().cuda()
target_b = Variable(target_b, requires_grad=False)
loss += F.nll_loss(F.log_softmax(outputs[1], dim=1), target_b, ignore_index=-1)

target_c = np.random.randint(0, 3, size=(5, 30, 30))
target_c = torch.from_numpy(target_c).long().cuda()
target_c = Variable(target_c, requires_grad=False)

# print(target_c.get_device())
loss += F.nll_loss(F.log_softmax(outputs[2], dim=1), target_c, ignore_index=-1)

loss.backward()

0, torch.Size([1, 3, 32, 32])1, torch.Size([1, 3, 32, 32])

2, torch.Size([1, 3, 32, 32])
3, torch.Size([1, 3, 32, 32])
4, torch.Size([1, 3, 32, 32])




In [10]:
print(torch.__version__)

1.0.0


In [11]:
print(torch.__version__)

1.0.0


In [68]:
class RRNParallel(nn.Module):
    def __init__(self, dim_x, dim_y, embed_size=16, hidden_layer_size=96, device_ids = None):
        super(RRNParallel, self).__init__()
        self.max_digit = dim_x * dim_y
        self.embed_size = embed_size
        self.hidden_layer_size = hidden_layer_size
        self.device_ids = device_ids
        
        self.edges = determine_edges(dim_x, dim_y)


        self.embed_layer = nn.Embedding(self.max_digit+1, self.embed_size)
        self.input_mlp = MLP([self.embed_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size])
        
        self.f = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        self.g_mlp = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        self.g_lstm = nn.LSTM(self.hidden_layer_size, self.hidden_layer_size)
        self.r = MLP([self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.max_digit])
        
        self.embed_layer = torch.nn.DataParallel(self.embed_layer, device_ids=device_ids).cuda()
        self.f = torch.nn.DataParallel(self.f, device_ids=device_ids).cuda()
        self.g_mlp = torch.nn.DataParallel(self.g_mlp, device_ids=device_ids).cuda()
        self.g_lstm = torch.nn.DataParallel(self.g_lstm, device_ids=device_ids).cuda()
        self.r = torch.nn.DataParallel(self.r, device_ids=device_ids).cuda()
    
    def compute_messages(self, H):
        messages = torch.zeros(H.shape)
        batch_size = H.shape[0]
        num_nodes = H.shape[1]
        for puzzle_index in range(batch_size): # for puzzle in batch
            messages[puzzle_index] = torch.tensor([torch.sum(H[puzzle_index][self.edges[n]]) for n in range(num_nodes)])
        return messages
                    

    def forward(self, grids, iters):
        batch_size = len(grids)
        num_nodes = self.max_digit**2
        edges_per_nodes = self.edges.shape[1]
        
        embeddings = self.embed_layer(grids)
        X = self.input_mlp(embeddings)
        H = torch.tensor(X).cuda()
        g_lstm_h = H.reshape(1, batch_size*num_nodes, self.hidden_layer_size)
        g_lstm_c = torch.randn(1, batch_size*num_nodes, self.hidden_layer_size).cuda()
        
        outputs = []
        for i in range(iters):
            M = torch.zeros(batch_size, self.max_digit**2, self.hidden_layer_size).cuda()
#             print(M.get_device())
            for node in range(num_nodes):
#                 print(H.get_device())
                msgs = torch.cat([self.f(torch.cat([H[:,node,:], H[:,other,:]], dim=1)) for other in self.edges[node]])
                msgs = msgs.reshape(edges_per_nodes, batch_size, self.hidden_layer_size).permute(1,0,2)
                M[:,node,:] = torch.sum(msgs, dim=1)
            
            input_to_g_lstm = self.g_mlp(torch.cat([X, M], dim=2)).reshape(1, batch_size*num_nodes, self.hidden_layer_size)
            
            _, (g_lstm_h, g_lstm_c) = self.g_lstm(input_to_g_lstm, (g_lstm_h, g_lstm_c))
            H = g_lstm_h.reshape(H.shape)
            output = self.r(H)
            
            outputs.append(output)
                
        return outputs

In [66]:
max_digit = 4
dst = Dataset.load(SUDOKU_PATH + '/data/puzzles.dst')

num_cells = max_digit**2
cell_vec_dim = max_digit + 1
train_inputs = dst.get_input_data(0)
train_outputs = dst.get_output_data(0)
train_x = torch.cat([encode_input(p) for p in train_inputs]).reshape(len(train_inputs), num_cells).cuda()
train_y = torch.cat([encode_output(p) for p in train_outputs]).reshape(len(train_outputs), num_cells).cuda()

In [71]:


model = RRNParallel( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(5)):
    print(optimizer.step(closure))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))



X at device 0: torch.Size([1000, 16, 32])
H at device 0: torch.Size([1000, 16, 32])
LSTM_H at device 0: torch.Size([1, 16000, 32])
LSTM_C at device 0: torch.Size([1, 16000, 32])
tensor(44.5333, device='cuda:0', grad_fn=<AddBackward0>)
X at device 0: torch.Size([1000, 16, 32])
H at device 0: torch.Size([1000, 16, 32])
LSTM_H at device 0: torch.Size([1, 16000, 32])
LSTM_C at device 0: torch.Size([1, 16000, 32])


KeyboardInterrupt: 

In [74]:
max_digit = 4
dst = Dataset.load(SUDOKU_PATH + '/data/puzzles.dst')

num_cells = max_digit**2
cell_vec_dim = max_digit + 1
train_inputs = dst.get_input_data(0)
train_outputs = dst.get_output_data(0)
train_x = torch.cat([encode_input(p) for p in train_inputs]).reshape(len(train_inputs), num_cells).cuda(device)
train_y = torch.cat([encode_output(p) for p in train_outputs]).reshape(len(train_outputs), num_cells).cuda(device)

model = RRN( dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda(device)
# model = torch.nn.DataParallel(model, device_ids=[0,1]).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def closure():
    optimizer.zero_grad()
    predictions = [p.permute(0,2,1) for p in model(train_x, 32)]
    loss = sum([F.cross_entropy(p, train_y) for p in predictions])
    loss.backward()
    return loss

for i in tqdm_notebook(range(250)):
    print(optimizer.step(closure))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))



X at device 4: torch.Size([1000, 16, 32])
H at device 4: torch.Size([1000, 16, 32])
LSTM_H at device 4: torch.Size([1, 16000, 32])
LSTM_C at device 4: torch.Size([1, 16000, 32])
tensor(44.6004, device='cuda:4', grad_fn=<AddBackward0>)
X at device 4: torch.Size([1000, 16, 32])
H at device 4: torch.Size([1000, 16, 32])
LSTM_H at device 4: torch.Size([1, 16000, 32])
LSTM_C at device 4: torch.Size([1, 16000, 32])
tensor(44.5794, device='cuda:4', grad_fn=<AddBackward0>)
X at device 4: torch.Size([1000, 16, 32])
H at device 4: torch.Size([1000, 16, 32])
LSTM_H at device 4: torch.Size([1, 16000, 32])
LSTM_C at device 4: torch.Size([1, 16000, 32])
tensor(44.5598, device='cuda:4', grad_fn=<AddBackward0>)
X at device 4: torch.Size([1000, 16, 32])
H at device 4: torch.Size([1000, 16, 32])
LSTM_H at device 4: torch.Size([1, 16000, 32])
LSTM_C at device 4: torch.Size([1, 16000, 32])


KeyboardInterrupt: 