In [1]:
%matplotlib inline

# Base directory of the project: the source directory added to sys.path below
# and the dataset file loaded later are both addressed relative to it.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
SUDOKU_PATH = '/home/ajhnam/sudoku'

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cutorch
import torch.optim as optim
import numpy as np
import itertools
import random
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
import time

import sys
# Make the modules under <SUDOKU_PATH>/src/sudoku importable.
sys.path.append(SUDOKU_PATH + '/src/sudoku')

# Project-local modules; presumably resolved through the path appended above.
from board import Board
from grid_string import GridString, read_solutions_file
from shuffler import Shuffler
from shuffled_grid import ShuffledGrid
from solutions import Solutions
from dataset import Dataset
import utils

In [2]:
# Seed every random number generator the notebook uses so runs are repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create new floating-point tensors (and therefore the parameters of modules
# built after this point) in float64. torch.set_default_tensor_type is
# deprecated; torch.set_default_dtype is the supported way to request this.
torch.set_default_dtype(torch.float64)

# Default CUDA device index; the training cells assign it again before use.
device = 0

In [3]:
def count_model_parameters(model):
    """Return how many scalar values the trainable parameters of ``model`` hold.

    Parameters whose ``requires_grad`` flag is off are left out of the count.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total

def get_tensor_memory_size(tensor):
    """Return the space taken by the elements of ``tensor`` in whole MiB, rounded down."""
    bytes_per_mib = 1 << 20
    total_bytes = tensor.nelement() * tensor.element_size()
    return total_bytes // bytes_per_mib

def get_gpu_memory(device):
    """Return the memory reported by ``memory_allocated`` for CUDA device ``device``, in whole MiB (rounded down)."""
    allocated_bytes = cutorch.memory_allocated(device)
    return allocated_bytes // (1 << 20)

def determine_edges(dim_x, dim_y):
    """
    Builds the neighbour table of a grid with dim_x*dim_y rows and columns whose
        boxes span dim_x rows and dim_y columns. Cells are numbered row by row.

    Returns a 2-d tensor of shape (max_digit**2, n) where the i_th row lists, in
        ascending order, the indices of the other cells that share a row, a
        column or a box with cell i
    """
    side = dim_x * dim_y
    neighbours = []
    for cell in range(side * side):
        r, c = divmod(cell, side)
        box_top = r - r % dim_x
        box_left = c - c % dim_y

        # collect the three houses of this cell as flat indices
        house = set(range(r * side, (r + 1) * side))      # same row
        house.update(range(c, side * side, side))         # same column
        house.update(i * side + j
                     for i in range(box_top, box_top + dim_x)
                     for j in range(box_left, box_left + dim_y))  # same box

        # a cell is not its own neighbour
        house.discard(cell)
        neighbours.append(sorted(house))

    table = torch.tensor(neighbours)
    return table.reshape(side ** 2, table.shape[1])

def encode_input(grid_string: GridString):
    """Collect the values produced by ``grid_string.traverse_grid()`` into a tensor."""
    cell_values = [value for value in grid_string.traverse_grid()]
    return torch.tensor(cell_values)

def encode_output(grid_string: GridString):
    """Collect the values produced by ``grid_string.traverse_grid()`` into a tensor and subtract 1 from each.

    The training cells pass the result to ``F.cross_entropy`` as the target.
    """
    cell_values = [value for value in grid_string.traverse_grid()]
    encoded = torch.tensor(cell_values)
    return encoded - 1

In [4]:
# Load the puzzle dataset stored under the project directory.
dst = Dataset.load(SUDOKU_PATH + '/data/puzzles.dst')

max_digit = 4
num_cells = max_digit ** 2
cell_vec_dim = 1 + max_digit

train_inputs = dst.get_input_data(0)
train_outputs = dst.get_output_data(0)

# Encode every grid, concatenate the encodings and fold them into one row per grid.
# The tensors are left on the CPU here; the training cells move them to the GPU.
encoded_inputs = [encode_input(puzzle) for puzzle in train_inputs]
train_x = torch.cat(encoded_inputs).reshape(len(train_inputs), num_cells)
encoded_outputs = [encode_output(solution) for solution in train_outputs]
train_y = torch.cat(encoded_outputs).reshape(len(train_outputs), num_cells)

In [10]:
class MLP(nn.Module):
    """Stack of Linear layers with a ReLU after every layer except the last.

    ``layer_sizes`` lists the width of every layer from input to output, so
    ``len(layer_sizes) - 1`` Linear layers are created.
    """

    def __init__(self, layer_sizes):
        super().__init__()
        self.layer_sizes = layer_sizes

        # Reading element 0 explicitly keeps the failure on an empty size list.
        widths = [self.layer_sizes[0], *self.layer_sizes[1:]]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths, widths[1:])]
        )
        self.nonlinear = nn.ReLU()

    def forward(self, X):
        """Apply every layer in order; the output of the last layer is returned without activation."""
        hidden_layers = self.layers[:-1]
        activations = X
        for linear in hidden_layers:
            activations = self.nonlinear(linear(activations))
        output_layer = self.layers[-1]
        return output_layer(activations)

class RRN(nn.Module):
    """Recurrent relational network over the cells of a (dim_x*dim_y)-sided grid.

    Every cell is a node; the nodes it exchanges messages with are given by
    ``determine_edges(dim_x, dim_y)``. ``forward`` runs a fixed number of
    message-passing steps and returns the per-cell scores produced after each
    step.

    Args:
        dim_x, dim_y: box dimensions; the grid has ``dim_x * dim_y`` digits.
        embed_size: width of the cell-value embedding.
        hidden_layer_size: width of every hidden representation.
    """

    def __init__(self, dim_x, dim_y, embed_size=16, hidden_layer_size=96):
        super(RRN, self).__init__()
        self.max_digit = dim_x * dim_y
        self.embed_size = embed_size
        self.hidden_layer_size = hidden_layer_size

        # (num_cells, neighbours_per_cell) index table. It is a plain attribute,
        # not a parameter or buffer, so .cuda()/.to() on the module does not move
        # it; it is transferred explicitly wherever it is used.
        self.edges = determine_edges(dim_x, dim_y)

        # max_digit + 1 embedding rows: one per value 0..max_digit.
        self.embed_layer = nn.Embedding(self.max_digit+1, self.embed_size)
        self.input_mlp = MLP([self.embed_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size,
                              self.hidden_layer_size])

        # f: message function applied to each (receiver, sender) pair.
        self.f = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        # g: node update, an MLP over [input features, summed messages]
        # followed by an LSTM that carries the node state across steps.
        self.g_mlp = MLP([2*self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size])
        self.g_lstm = nn.LSTM(self.hidden_layer_size, self.hidden_layer_size)
        # r: readout from node state to one score per digit.
        self.r = MLP([self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.hidden_layer_size,
                      self.max_digit])

    def compute_messages(self, H):
        """Sum, for every node, the hidden vectors of its neighbours.

        Args:
            H: tensor of shape (batch, num_nodes, hidden).

        Returns:
            Tensor shaped like ``H`` (same dtype and device) whose entry
            ``[b, n]`` is the sum of ``H[b, m]`` over the neighbours ``m`` of
            node ``n``. The result stays attached to the autograd graph.
        """
        edges = self.edges.to(H.device)
        # H[:, edges] has shape (batch, num_nodes, neighbours, hidden).
        return H[:, edges].sum(dim=2)

    def forward(self, grids, iters):
        """Run ``iters`` message-passing steps on a batch of grids.

        Args:
            grids: integer tensor of shape (batch, num_nodes) holding the cell
                values fed to the embedding layer.
            iters: number of message-passing steps.

        Returns:
            A list with ``iters`` tensors of shape (batch, num_nodes, max_digit),
            the readout after each step. Works for CPU and CUDA inputs alike.
        """
        batch_size = len(grids)
        num_nodes = self.max_digit**2
        edges = self.edges.to(grids.device)
        edges_per_node = edges.shape[1]

        embeddings = self.embed_layer(grids)
        X = self.input_mlp(embeddings)

        # The initial node state is a detached copy of X, which is what the
        # previous torch.tensor(X) call produced.
        # NOTE(review): this blocks gradients through the initial state; X still
        # receives gradients through g_mlp. Confirm the detachment is intended.
        H = X.detach().clone()
        g_lstm_h = H.reshape(1, batch_size*num_nodes, self.hidden_layer_size)
        # Drawn on the CPU and then moved, so torch.manual_seed yields the same
        # initial cell state whatever device the model runs on.
        g_lstm_c = torch.randn(1, batch_size*num_nodes, self.hidden_layer_size,
                               dtype=X.dtype).to(X.device)

        outputs = []
        for _ in range(iters):
            # Pair every node with each of its neighbours and evaluate f on all
            # pairs at once: (batch, num_nodes, neighbours, 2*hidden).
            receivers = H.unsqueeze(2).expand(-1, -1, edges_per_node, -1)
            senders = H[:, edges]
            pair_features = torch.cat([receivers, senders], dim=3)
            # Incoming messages are summed per node.
            M = self.f(pair_features).sum(dim=2)

            # The LSTM sees every node of every grid as one sequence element of
            # a length-1 sequence.
            input_to_g_lstm = self.g_mlp(torch.cat([X, M], dim=2)).reshape(1, batch_size*num_nodes, self.hidden_layer_size)

            _, (g_lstm_h, g_lstm_c) = self.g_lstm(input_to_g_lstm, (g_lstm_h, g_lstm_c))
            H = g_lstm_h.reshape(H.shape)
            outputs.append(self.r(H))

        return outputs

In [12]:
# Single-GPU training run, timed from model construction to the last step.
start = time.time()

device = 0

# The whole training set is used as one batch on the chosen GPU.
x_batch = train_x.cuda(device)
y_batch = train_y.cuda(device)

model = RRN(dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

num_iters = 32
epochs = 25

def closure():
    """Do one forward/backward pass and return the loss summed over the message-passing steps."""
    optimizer.zero_grad()
    step_outputs = model(x_batch, num_iters)
    total_loss = 0
    for logits in itertools.islice(step_outputs, num_iters):
        # cross_entropy expects the class axis second: (batch, classes, cells)
        total_loss = total_loss + F.cross_entropy(logits.permute(0, 2, 1), y_batch)
    total_loss.backward()
    return total_loss

progress_template = "Iter {} | Device {} | Memory {} MB | Loss {}"
for i in tqdm_notebook(range(epochs)):
    loss = optimizer.step(closure)
    print(progress_template.format(i, device, get_gpu_memory(device), round(float(loss), 5)))

end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))



Iter 0 | Device 0 | Memory 0 MB | Loss 44.40128
Iter 1 | Device 0 | Memory 0 MB | Loss 44.36824
Iter 2 | Device 0 | Memory 0 MB | Loss 44.3356
Iter 3 | Device 0 | Memory 0 MB | Loss 44.30364
Iter 4 | Device 0 | Memory 0 MB | Loss 44.27222
Iter 5 | Device 0 | Memory 0 MB | Loss 44.24123
Iter 6 | Device 0 | Memory 0 MB | Loss 44.21095
Iter 7 | Device 0 | Memory 0 MB | Loss 44.18199
Iter 8 | Device 0 | Memory 0 MB | Loss 44.15249
Iter 9 | Device 0 | Memory 0 MB | Loss 44.12293
Iter 10 | Device 0 | Memory 0 MB | Loss 44.09279
Iter 11 | Device 0 | Memory 0 MB | Loss 44.06256
Iter 12 | Device 0 | Memory 0 MB | Loss 44.03141
Iter 13 | Device 0 | Memory 0 MB | Loss 43.99917
Iter 14 | Device 0 | Memory 0 MB | Loss 43.96599
Iter 15 | Device 0 | Memory 0 MB | Loss 43.93149
Iter 16 | Device 0 | Memory 0 MB | Loss 43.89633
Iter 17 | Device 0 | Memory 0 MB | Loss 43.86076
Iter 18 | Device 0 | Memory 0 MB | Loss 43.82517
Iter 19 | Device 0 | Memory 0 MB | Loss 43.79014
Iter 20 | Device 0 | Memory 0 M

In [11]:
# Same training run as the single-GPU cell, with the model wrapped in
# DataParallel so each batch is split across the listed GPUs.
device_ids = [0, 2]

start = time.time()

device = 0

# The whole training set is used as one batch, placed on the primary GPU.
x_batch = train_x.cuda(device)
y_batch = train_y.cuda(device)

model = RRN(dim_x=2, dim_y=2, embed_size=6, hidden_layer_size=32).cuda(device)
model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

num_iters = 32
epochs = 25

def closure():
    """Do one forward/backward pass and return the loss summed over the message-passing steps."""
    optimizer.zero_grad()
    step_outputs = model(x_batch, num_iters)
    total_loss = 0
    for logits in itertools.islice(step_outputs, num_iters):
        # cross_entropy expects the class axis second: (batch, classes, cells)
        total_loss = total_loss + F.cross_entropy(logits.permute(0, 2, 1), y_batch)
    total_loss.backward()
    return total_loss

progress_template = "Iter {} | Loss {}"
for i in tqdm_notebook(range(epochs)):
    loss = optimizer.step(closure)
    print(progress_template.format(i, round(float(loss), 5)))

end = time.time()
print(end - start)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))



Iter 0 | Loss 44.43356
Iter 1 | Loss 44.4025
Iter 2 | Loss 44.38362
Iter 3 | Loss 44.36824
Iter 4 | Loss 44.3536
Iter 5 | Loss 44.34033
Iter 6 | Loss 44.32745
Iter 7 | Loss 44.31503
Iter 8 | Loss 44.30307
Iter 9 | Loss 44.29182
Iter 10 | Loss 44.28115
Iter 11 | Loss 44.27115
Iter 12 | Loss 44.26207
Iter 13 | Loss 44.25346
Iter 14 | Loss 44.24501
Iter 15 | Loss 44.23696
Iter 16 | Loss 44.22907
Iter 17 | Loss 44.22157
Iter 18 | Loss 44.21484
Iter 19 | Loss 44.20761
Iter 20 | Loss 44.20038
Iter 21 | Loss 44.1931
Iter 22 | Loss 44.18606
Iter 23 | Loss 44.17916
Iter 24 | Loss 44.1723

54.17464590072632
