In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from sklearn import model_selection
import copy
from collections import Counter

### Getting the data

In [2]:
# setting up some useful functions

In [3]:
training_path = "/Users/aysjajohnson/Desktop/ARC-master/data/training/"
test_path = "/Users/aysjajohnson/Desktop/ARC-master/data/evaluation/"

def load_task(task_filename, path = training_path):
    with open(path + task_filename, 'r') as f:
        task = json.load(f)    
    return task

def flatten_task(task):
    """given a json format for a task, return all grids in a list"""
    grids_list = []
    for grid in task["train"]:
        grids_list.append(grid_to_str(grid["input"]))
        grids_list.append(grid_to_str(grid["output"]))
    for grid in task["test"]:
        grids_list.append(grid_to_str(grid["input"]))
        grids_list.append(grid_to_str(grid["output"]))
    return grids_list

def plot_task(task):
    """
    Plots the first train and test pairs of a specified task,
    using same color scheme as the ARC app
    """
    
    cmap = colors.ListedColormap(
            ['#000000', '#0074D9','#FF4136','#2ECC40','#FFDC00',
             '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    
    norm = colors.Normalize(vmin=0, vmax=9)
    n_train = len(task['train'])
    fig, axs = plt.subplots(n_train+1, 2, figsize=(10, 10))
    for i in range(n_train):
        axs[i, 0].imshow(task['train'][i]['input'], cmap=cmap, norm=norm)
        axs[i, 0].axis('off')
        axs[i, 0].set_title('Train Input')
        axs[i, 1].imshow(task['train'][i]['output'], cmap=cmap, norm=norm)
        axs[i, 1].axis('off')
        axs[i, 1].set_title('Train Output')
    axs[n_train, 0].imshow(task['test'][0]['input'], cmap=cmap, norm=norm)
    axs[n_train, 0].axis('off')
    axs[n_train, 0].set_title('Test Input')
    axs[n_train, 1].imshow(task['test'][0]['output'], cmap=cmap, norm=norm)
    axs[n_train, 1].axis('off')
    axs[n_train, 1].set_title('Test Output')
    plt.tight_layout()
    plt.show()

In [4]:
from matplotlib import colors

def str_to_grid(grid):
    new_grid = []
    str_ = grid.split("|")
    for s in str_:
        if s == '':
            continue
        new_grid.append(list(map(int, s)))
    return new_grid

def grid_to_str(grid):
    new_grid = ''
    for row in grid:
        for num in row:
            new_grid += str(num)
        new_grid+='|'
    return new_grid[:-1]
    

def plot_grid(grid):
    cmap = colors.ListedColormap(
            ['#000000', '#0074D9','#FF4136','#2ECC40','#FFDC00',
             '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    
    
    norm = colors.Normalize(vmin=0, vmax=9)
    fig, axs = plt.subplots(1, 1, figsize=(6, 6), squeeze=False)
    axs[0, 0].imshow(grid, cmap=cmap, norm=norm)
    axs[0, 0].axis('off')
    plt.tight_layout()
    plt.show()
    
def plot_grids(grids):
    cmap = colors.ListedColormap(
            ['#000000', '#0074D9','#FF4136','#2ECC40','#FFDC00',
             '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    n_grids = len(grids)
    norm = colors.Normalize(vmin=0, vmax=9)
    fig, axs = plt.subplots(1, n_grids, figsize=(6, 6), squeeze=False)
    for i in range(n_grids):
        axs[0, i].imshow(grids[i], cmap=cmap, norm=norm)
        axs[0, i].axis('off')
    plt.tight_layout()
    plt.show()

Loading flattened training and evaluation (test) data

In [5]:
train_tasks = list(map(load_task, sorted(os.listdir(training_path))[1:]))
train_tasks = list(map(flatten_task, train_tasks))
train_tasks_str = [item for sublist in train_tasks for item in sublist]
train_tasks = list(map(str_to_grid, train_tasks_str))

In [6]:
test_tasks = list(map(lambda x: load_task(x, path=test_path), sorted(os.listdir(test_path))[1:]))
test_tasks = list(map(flatten_task, test_tasks))
test_tasks_str = [item for sublist in test_tasks for item in sublist]
test_tasks = list(map(str_to_grid, test_tasks_str))

In [7]:
# largest task in training and test is 30x30
len(str_to_grid(max(train_tasks_str, key=len)))

30

In [8]:
# next, pad data and then mask (token = 11)
def pad_data(grid, padding_token = 0):
    """given a grid, return a 30x30 grid padded with 0s"""
    # int_grid = np.asarray(copy.deepcopy(grid)).astype('int32') 
    nrows = len(grid)
    ncols = len(grid[0])
    if nrows == 30 and ncols == 30:
        return np.asarray(grid)
    else:
        new_grid = np.ones((30,30)).astype('int32')*padding_token
        new_grid[:nrows, :ncols] = grid
    return new_grid

In [9]:
# might want to have this lognormal distributed or something, since most of the cells happen in the first few lines
def mask_data(grid, percent_mask = 0.3):
    """given a grid, return a masked grid -- mask token = 11"""
    masked_grid = copy.deepcopy(grid)
    mask = np.random.rand(30,30)
    masked_grid[np.where(mask < percent_mask)] = 11
    return masked_grid

In [10]:
pad_train = list(map(pad_data, train_tasks))
pad_test = list(map(pad_data, test_tasks))
mask_train = list(map(mask_data, pad_train))
mask_test = list(map(mask_data, pad_test))

In [11]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(mask_train, pad_train, test_size = 0.1)
X_test, y_test = mask_test, pad_test

### Heuristic Approach

First, implement heuristic based approach to the masking problem: for any 3x3 grid, fill in the missing cells with the dominant color of that grid

In [12]:
def max_fill(grid, window=3):
    """given a masked grid, return a best guess grid given the dominant color of each 3x3 section"""
    pred_grid = copy.deepcopy(grid)
    for i in list(range(30))[::window]:
        for j in list(range(30))[::window]:
            most_common_colors = Counter(grid[i:i+window, j:j+window].flatten()).most_common(3)
            dom_color = most_common_colors[0][0]
            if dom_color == 11.0:
            # if there are no other dominant colors (3x3 is all masks, then set the dominant color to black)
                if len(most_common_colors) < 2:
                    dom_color = 0.0
                else:
                    dom_color = most_common_colors[1][0]
            pred_grid[i:i+3,j:j+3][np.where(grid[i:i+window, j:j+window] == 11.0)] = dom_color
    return pred_grid

In [13]:
def score_binary(pred_grid, test_grid):
    """return percentage of correct grid cells (match or not)"""
    return np.sum(pred_grid == test_grid)/900

In [14]:
def score_color(pred_grid, test_grid):
    """return precentage of correct grid cells for colored cells only"""
    color_pred = pred_grid[np.where(test_grid!=0)]
    color_val = test_grid[np.where(test_grid!=0)]
    return np.sum(color_pred == color_val)/len(color_pred)

In [15]:
pred_train = list(map(max_fill, X_train))

In [16]:
score_list = []
predictions = list(zip(pred_train, y_train))
for pred in predictions:
    score_list.append(score_binary(pred[0], pred[1]))

In [17]:
# baseline is ~80% (depending on mask parameter)
np.mean(score_list)

0.9865192376842862

In [18]:
score_list = []
predictions = list(zip(pred_train, y_train))
for pred in predictions:
    score_list.append(score_color(pred[0], pred[1]))

  """


In [19]:
score_list = [score for score in score_list if str(score) != 'nan']

In [20]:
np.mean(score_list)

0.7693854128139594

### Transformer

In [21]:
import torch
from torch import nn
from torch.optim import Adam
from mlm_pytorch import MLM
from torch.utils.data import Dataset, DataLoader

Borrowing code from: https://github.com/lucidrains/mlm-pytorch

In [22]:
# mask token = 11, pad token = 12, | = 13, sos = 14, eos = 15 

In [23]:
def replace_walls(str_grid):
    """given a grid in string format, remove the | tokens"""
    return [x if x!='|' else '13' for x in str_grid]

In [24]:
def str_to_float(str_grid):
    """given a list of strings, return a list of floats"""
    return [float(i) for i in str_grid]

In [25]:
def transformer_data(grids, padding_token=12):
    """given a list of all grids, return list of strings with eos/sos tokens"""
    padded_grids = list(map(lambda x: pad_data(x,padding_token), grids))
    str_grids = list(map(grid_to_str, padded_grids))
    tokenized_grids = list(map(replace_walls, str_grids))
    sos = list(map(lambda x: ['14'] + x, tokenized_grids))
    eos = list(map(lambda x: x + ['15'], sos))
    float_grids = list(map(str_to_float, eos))
    return list(map(torch.LongTensor, float_grids))

In [26]:
transformer_tasks = transformer_data(train_tasks)

In [35]:
transformer_tasks[0]

tensor([14,  0,  7,  ...,  1,  2, 15])

In [28]:
# setting up the data class
# I think because masking happens later, you want just the padded data here 
class ARCDataset(Dataset):
    """ARC masked dataset."""

    # def __init__(self, csv_file, root_dir, transform=None):
    def __init__(self, tasks, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.ARC_data = tasks
        self.root_dir = training_path
        self.transform = transform

    def __len__(self):
        return len(self.ARC_data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        sample = self.ARC_data[idx]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [29]:
ARC_dataset = ARCDataset(transformer_tasks)

In [30]:
# creating the language model
from reformer_pytorch import ReformerLM, Autopadder

transformer = ReformerLM(
    num_tokens = 15,
    dim = 512,
    depth = 1,
    max_seq_len = 1822
    # max_seq_len = 2048
)

model = Autopadder(transformer)

In [33]:
# plugin the language model into the MLM trainer

trainer = MLM(
    model,
    mask_token_id = 11,          # the token id reserved for masking
    pad_token_id = 12,           # the token id for padding
    mask_prob = 0.15,           # masking probability for masked language modeling
    replace_prob = 0.90,        # ~10% probability that token will not be masked, but included in loss, as detailed in the epaper
    mask_ignore_token_ids = [13,14,15]  # other tokens to exclude from masking, include the [cls] and [sep] here
)

# optimizer

opt = Adam(trainer.parameters(), lr=3e-4)

# one training step (do this for many steps in a for loop, getting new `data` each time)
loss = 0
for i in range(len(ARC_dataset)):
    if i%10 == 0: 
        print(i, loss)
    data = torch.unsqueeze(ARC_dataset[0], 0)
    loss = trainer(data)
    loss.backward()
    opt.step()
    opt.zero_grad()

# after much training, the model should have improved for downstream tasks

torch.save(transformer, f'./ARC-model_0.pt')

0 0


IndexError: index out of range in self

In [None]:
loss

In [32]:
transformer = ReformerLM(
    num_tokens = 20000,
    dim = 512,
    depth = 1,
    max_seq_len = 1024
)

# plugin the language model into the MLM trainer

trainer = MLM(
    transformer,
    mask_token_id = 2,          # the token id reserved for masking
    pad_token_id = 0,           # the token id for padding
    mask_prob = 0.15,           # masking probability for masked language modeling
    replace_prob = 0.90,        # ~10% probability that token will not be masked, but included in loss, as detailed in the epaper
    mask_ignore_token_ids = []  # other tokens to exclude from masking, include the [cls] and [sep] here
)

# optimizer

opt = Adam(trainer.parameters(), lr=3e-4)

# one training step (do this for many steps in a for loop, getting new `data` each time)

data = torch.randint(0, 20000, (10, 1024))
print(data)

loss = trainer(data)
# loss.backward()
# opt.step()
# opt.zero_grad()

# after much training, the model should have improved for downstream tasks

# torch.save(transformer, f'./pretrained-model.pt')

tensor([[17080,  3317, 11094,  ..., 15304, 16984, 15031],
        [ 3411,  9744, 16893,  ..., 14923,   779, 10869],
        [ 4334,  8292,  2010,  ..., 12168, 18736, 12822],
        ...,
        [11606,  7297, 11082,  ..., 18047,   163, 11690],
        [14243,   909,  9955,  ...,  6297, 19983, 17896],
        [10394, 10844,  7049,  ..., 17132, 10793,  8393]])
prediction:  tensor([[[ 4.1205e-01,  4.3679e-02,  6.3976e-03,  ...,  6.7762e-01,
          -5.7868e-01, -1.2010e-01],
         [ 3.2005e-01, -3.6025e-02, -4.6149e-02,  ...,  1.0616e-01,
          -8.7187e-01,  4.7881e-01],
         [ 5.5489e-01,  7.4331e-02, -7.1933e-01,  ...,  7.0006e-01,
           4.2986e-01,  2.6815e-02],
         ...,
         [-2.1083e-01, -6.1884e-01, -1.1903e-01,  ..., -1.0906e+00,
          -1.3950e-01, -4.3852e-02],
         [ 1.7213e-01, -8.2585e-01, -4.4409e-01,  ...,  5.9111e-02,
          -3.4356e-01, -5.5135e-01],
         [ 7.7929e-01,  6.4803e-02,  1.3746e-01,  ..., -4.0445e-02,
           2.4477e

In [None]:
loss

In [None]:
pad_data(train_tasks[0], 12)