In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as torchF
import gc
from copy import deepcopy
from tqdm import tqdm
from scipy.stats import rankdata as rank

from datasets import load_dataset
from torch.utils.data import DataLoader

# --- Configuration ------------------------------------------------------------
# Seed for the global torch / numpy RNGs. Layer weights are randomly
# initialised, so without a fixed seed the search result changes on every run.
seed = 0

# dimensionality of input data
input_dim = 6
# width of the hidden nodes of the searched network
hidden_dim = 1

# DataLoader settings
num_workers = 2
batch_size = 128
# number of intra-op CPU threads torch is allowed to use
num_threads = 1

# |O|, number of operations
space_size = 5

torch.manual_seed(seed)
np.random.seed(seed)
torch.set_num_threads(num_threads)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Data -----------------------------------------------------------------------
raw_dataset = load_dataset("inria-soda/tabular-benchmark", data_files="reg_num/diamonds.csv")
dataset = raw_dataset['train'].with_format("torch")
dataset_length = dataset.num_rows

# Later cells feed every column except the last one to the network, so the
# column count has to agree with `input_dim`; fail here rather than deep inside
# a forward pass.
assert len(dataset.column_names) == input_dim + 1, (
    f"expected {input_dim} feature columns plus one target column, "
    f"got columns {dataset.column_names}"
)

# NOTE(review): the original line carried a bare "#16" comment, possibly an
# earlier num_workers value; confirm before relying on it.
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Pull one mini-batch. The loader yields a mapping of per-column tensors; they
# are stacked side by side into a single tensor and moved to `device`.
batch = torch.stack(tuple(next(iter(dataloader)).values()), dim=1).to(device)

# Last column is the target (price), everything before it is the input.
x, y = batch[:, :-1], batch[:, -1]

In [3]:
class Zero(nn.Module):
    """Candidate operation that discards its input and outputs zeros."""

    def forward(self, batch):
        """Return a zero tensor shaped (and typed) like ``batch``."""
        return torch.zeros_like(batch)

def get_modules(input_dim, output_dim):
    """Create a fresh list of candidate operations for one edge.

    Args:
        input_dim: ``in_features`` of the linear candidate.
        output_dim: ``out_features`` of the linear candidate.

    Returns:
        An ``nn.ModuleList`` holding, in this order: ``Zero``, ``nn.Identity``,
        ``nn.ReLU``, ``nn.Sigmoid`` and ``nn.Linear(input_dim, output_dim)``.
    """
    # The parameter-free candidates come first; the linear layer is last.
    parameter_free = (Zero, nn.Identity, nn.ReLU, nn.Sigmoid)
    candidates = [make_op() for make_op in parameter_free]
    candidates.append(nn.Linear(input_dim, output_dim))
    return nn.ModuleList(candidates)

In [4]:
class MLP(nn.Module):
    """Small search network: four nodes connected by six edges of candidate ops.

    Every edge holds an ``nn.ModuleList`` of candidate operations whose outputs
    are summed. The wiring implemented by ``forward`` is::

        node0 = in_(batch)
        node1 = edge1(node0)
        node2 = edge2(node0) + edge4(node1)
        node3 = edge3(node0) + edge5(node1) + edge6(node2)
        output = out_(node3)

    Attributes:
        in_: linear layer mapping ``input_dim`` to ``hidden_dim``.
        out_: linear layer mapping ``hidden_dim`` to one output value.
        edges: ``nn.ModuleDict`` keyed ``'1'`` .. ``'6'``, one op list per edge.
        saved: bookkeeping for a pending temporary removal, either ``None`` or
            ``(edge, index, module)`` as written by ``delete``.
    """

    def __init__(self, input_dim, hidden_dim):
        """Build the network with the full candidate set on every edge.

        Args:
            input_dim: number of input features.
            hidden_dim: width of every intermediate node.
        """
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.in_ = nn.Linear(input_dim, hidden_dim)
        self.out_ = nn.Linear(hidden_dim, 1)

        # Edges are created in key order '1'..'6'.
        self.edges = nn.ModuleDict({
            str(edge): get_modules(hidden_dim, hidden_dim)
            for edge in range(1, 7)
        })

        self.saved = None

    def perm_delete(self, edge, ind):
        """Permanently remove the operation at position ``ind`` from ``edge``."""
        self.edges[edge].pop(ind)

    def delete(self, edge, ind):
        """Temporarily remove one operation; undo with ``restore``.

        The removed module itself is remembered (not a copy), so ``restore``
        puts back the very same parameters. Gradients and optimizer references
        therefore stay valid across a delete/restore cycle.

        Nothing is removed when ``edge`` or ``ind`` is ``None`` or when the
        edge has a single operation left. Only one removal is tracked at a
        time: calling ``delete`` again before ``restore`` replaces the record.

        Args:
            edge: key into ``self.edges``.
            ind: position of the operation on that edge.
        """
        if edge is None or ind is None:
            return

        ops = self.edges[edge]
        if len(ops) <= 1:
            # Never strip an edge of its last operation; nothing to undo later.
            self.saved = None
            return

        # Normalise the index (negative values, numpy integers) so restore can
        # reinsert at the exact same position.
        position = range(len(ops))[ind]
        self.saved = (edge, position, ops.pop(position))

    def restore(self):
        """Reinsert the operation removed by the last ``delete``, if any."""
        if self.saved is None:
            return

        edge, position, module = self.saved
        ops = self.edges[edge]
        # Clamp in case the edge shrank in the meantime, so the list stays dense.
        ops.insert(min(position, len(ops)), module)
        self.saved = None

    def _apply_edge(self, edge, node):
        """Apply every operation on ``edge`` to ``node`` and sum the results."""
        return torch.stack([op(node) for op in self.edges[edge]], dim=1).sum(1)

    def forward(self, batch):
        """Run the network on ``batch`` and return the output of ``out_``."""
        node0 = self.in_(batch)

        node1 = self._apply_edge('1', node0)

        from_node0_to_2 = self._apply_edge('2', node0)
        from_node0_to_3 = self._apply_edge('3', node0)

        from_node1_to_2 = self._apply_edge('4', node1)
        from_node1_to_3 = self._apply_edge('5', node1)

        node2 = from_node0_to_2 + from_node1_to_2

        from_node2_to_3 = self._apply_edge('6', node2)

        node3 = from_node0_to_3 + from_node1_to_3 + from_node2_to_3

        return self.out_(node3)


In [13]:
# Build the full search network, then push one mini-batch (all columns but the
# last) through it as a smoke test; the cell displays the output shape.
mlp = MLP(input_dim, hidden_dim).to(device)
batch = torch.stack(tuple(next(iter(dataloader)).values()), dim=1).to(device)
mlp(batch[:, :-1]).shape

torch.Size([128, 1])

In [14]:
def calculate_k(loader, mlp, device, first_batch=421):
    """Score each removable operation by a gradient-kernel condition number.

    For every edge that still has more than one operation, each operation is
    removed in turn. For every sample, the gradient of the network output with
    respect to all ``weight`` parameters is collected. The Gram matrix of
    those per-sample gradients is formed, and the ratio of its largest to its
    smallest eigenvalue is taken.

    Args:
        loader: iterable yielding mappings of per-column tensors. The columns
            are stacked side by side and the last one is dropped to obtain the
            network input.
        mlp: network exposing ``edges``, ``delete(edge, ind)`` and
            ``restore()``. It is moved to ``device`` and left structurally
            unchanged on return.
        device: target device, e.g. ``'cuda'`` or ``'cpu'``.
        first_batch: batches with a smaller index are skipped. The default is
            the value that used to be hard-coded here.
            NOTE(review): with this notebook's loader that presumably leaves
            only the tail of the dataset; confirm that is intended.

    Returns:
        Dict mapping ``(edge, ind)`` to the negated eigenvalue ratio. A NaN
        ratio is replaced by ``1e7`` before negation.

    Raises:
        ValueError: if there are candidates but no batch was processed.
    """
    candidates = [
        (edge, ind)
        for edge in mlp.edges.keys()
        if len(mlp.edges[edge]) > 1
        for ind in range(len(mlp.edges[edge]))
    ]
    gradients = {candidate: [] for candidate in candidates}

    mlp = mlp.to(device)

    for i, batch in enumerate(loader):

        if i < first_batch:
            continue

        # `.to` works for CPU as well as CUDA targets, whereas `.cuda` rejects
        # a 'cpu' device.
        inputs = torch.stack(tuple(batch.values()), dim=1)[:, :-1]
        inputs = inputs.to(device, non_blocking=True)

        for edge, ind in candidates:
            mlp.zero_grad()
            mlp.delete(edge, ind)
            try:
                preds = mlp(inputs)

                # One backward pass per sample gives per-sample gradients.
                # The operation stays removed while gradients are read, so the
                # parameters listed are exactly the ones used in this forward.
                for j in range(len(preds)):
                    preds[j:j + 1].backward(torch.ones_like(preds[j:j + 1]),
                                            retain_graph=True)
                    grads = [
                        param.grad.flatten().detach()
                        for name, param in mlp.named_parameters()
                        if 'weight' in name and param.grad is not None
                    ]
                    gradients[(edge, ind)].append(torch.cat(grads, dim=-1))
                    mlp.zero_grad()
            finally:
                # Always put the operation back, even if the pass failed.
                mlp.restore()

            torch.cuda.empty_cache()

    if any(not rows for rows in gradients.values()):
        raise ValueError(
            "no batch was processed: the loader yields no batch with index >= "
            f"first_batch ({first_batch})"
        )

    k = {}
    for candidate in candidates:
        grads = torch.stack(gradients[candidate], dim=0)
        ntk = torch.einsum('nc,mc->nm', grads, grads)
        ev = torch.linalg.eigvalsh(ntk, UPLO='U')  # eigenvalues in ascending order
        new_k = (ev[-1] / ev[0]).item()
        k[candidate] = - np.nan_to_num(new_k, copy=True, nan=1e7)

    return k


In [15]:
def calc_LR(activations):

    output = torch.matmul(activations.half(), (1 - activations).T.half())
    output = 1. / (torch.sum(1 - torch.sign(output + output.T), dim=1).float() + 1e-12)
    return round(output.sum().item())

def calculate_lr(loader, mlp, device, first_batch=421):
    """Score each removable operation by the activation patterns left without it.

    For every edge that still has more than one operation, each operation is
    removed in turn from a deep copy of ``mlp``. The outputs of all remaining
    ``nn.ReLU`` modules are captured with forward hooks, binarised with
    ``torch.sign`` and passed to ``calc_LR``. The per-batch results are summed.

    Args:
        loader: iterable yielding mappings of per-column tensors. The columns
            are stacked side by side and the last one is dropped to obtain the
            network input.
        mlp: network exposing ``edges`` and ``delete(edge, ind)``. It is only
            deep-copied, never modified.
        device: target device, e.g. ``'cuda'`` or ``'cpu'``.
        first_batch: batches with a smaller index are skipped. The default is
            the value that used to be hard-coded here.

    Returns:
        Dict mapping ``(edge, ind)`` to the negated summed pattern count. The
        value is ``0`` when no batch was processed or no ReLU fired a hook.
    """
    candidates = [
        (edge, ind)
        for edge in mlp.edges.keys()
        if len(mlp.edges[edge]) > 1
        for ind in range(len(mlp.edges[edge]))
    ]

    lrs = {}
    for edge, ind in candidates:

        model = deepcopy(mlp).to(device)
        model.delete(edge, ind)

        # One buffer shared by all hooks of this model; it is emptied before
        # every batch.
        captured = []
        handles = [
            module.register_forward_hook(
                lambda mod, inp, out: captured.append(out.detach())
            )
            for module in model.modules()
            if isinstance(module, nn.ReLU)
        ]

        region_count = 0
        for i, batch in enumerate(loader):

            if i < first_batch:
                continue

            captured.clear()

            # `.to` works for CPU as well as CUDA targets, whereas `.cuda`
            # rejects a 'cpu' device.
            inputs = torch.stack(tuple(batch.values()), dim=1)[:, :-1]
            inputs = inputs.to(device, non_blocking=True)

            with torch.no_grad():
                model(inputs)

            if not captured:
                # No ReLU left in this variant, so nothing to count.
                continue

            activations = torch.cat([feat.view(len(inputs), -1)
                                     for feat in captured], dim=1)

            region_count += calc_LR(torch.sign(activations))

        for handle in handles:
            handle.remove()
        del model

        torch.cuda.empty_cache()
        lrs[(edge, ind)] = -region_count

    return lrs


In [16]:
def TENAS(loader, mlp, device):
    """Prune ``mlp`` until every edge keeps exactly one operation.

    Each round scores all removable operations with ``calculate_k`` and
    ``calculate_lr``, ranks both score lists, and sums the two ranks. On every
    edge that still has more than one operation, the operation with the
    smallest rank sum is then permanently removed.

    The number of rounds and the per-edge slices of the rank vector are taken
    from the model itself, so the function does not depend on a global
    operation count being in sync with the network.

    Args:
        loader: data loader forwarded to the two scoring functions.
        mlp: network exposing ``edges`` and ``perm_delete(edge, ind)``. It is
            modified in place.
        device: device forwarded to the two scoring functions.

    Returns:
        The same ``mlp`` object, pruned.
    """
    while any(len(ops) > 1 for ops in mlp.edges.values()):
        k = calculate_k(loader, mlp, device)
        lr = calculate_lr(loader, mlp, device)

        # Only edges with more than one operation have scores.
        prunable = [edge for edge in mlp.edges if len(mlp.edges[edge]) > 1]
        sizes = {edge: len(mlp.edges[edge]) for edge in prunable}
        candidates = [(edge, ind) for edge in prunable for ind in range(sizes[edge])]

        # calculate_k returns negated values; flip the sign back before ranking.
        k_ranks = rank([-k[candidate] for candidate in candidates])
        lr_ranks = rank([lr[candidate] for candidate in candidates])
        total_ranks = k_ranks + lr_ranks

        # The rank vector is laid out edge by edge, so walk it with an offset.
        ptr = 0
        for edge in prunable:
            ind = int(np.argmin(total_ranks[ptr:ptr + sizes[edge]]))
            mlp.perm_delete(edge, ind)
            ptr += sizes[edge]

    return mlp

In [17]:
# TENAS prunes the network it receives in place and returns that same object.
# Searching on a deep copy keeps `mlp` as the full network, so this cell can be
# re-run without first rebuilding the model.
new_mlp = TENAS(dataloader, deepcopy(mlp), device)
new_mlp

MLP(
  (in_): Linear(in_features=6, out_features=1, bias=True)
  (out_): Linear(in_features=1, out_features=1, bias=True)
  (edges): ModuleDict(
    (1): ModuleList(
      (0): ReLU()
    )
    (2): ModuleList(
      (0): Zero()
    )
    (3): ModuleList(
      (0): Zero()
    )
    (4): ModuleList(
      (0): ReLU()
    )
    (5): ModuleList(
      (0): ReLU()
    )
    (6): ModuleList(
      (0): Zero()
    )
  )
)