# Train and Evaluate a Sum-of-Squares non-monotonic PC

In [1]:
import time
import random
import numpy as np

Set the random seeds.

In [2]:
# Fix the seed of NumPy's global generator and of Python's built-in generator
# so that repeated runs of the notebook draw the same random numbers.
np.random.seed(42)
random.seed(42)

# Load the MiniBooNE UCI data set

In [3]:
from datasets import load_uci_dataset

In [4]:
# Load the MiniBooNE splits from the local 'datasets' folder
data = load_uci_dataset('miniboone', path='datasets')
data_train = data['train']
data_test = data['test']

# The second axis of the training split gives the number of variables
num_variables = data_train.shape[1]
print(f'Number of variables: {num_variables}')

Number of variables: 43


# Create the Sum-of-Squares PC class

In [5]:
from typing import Optional, Tuple

In [6]:
import torch
from torch import nn, Tensor
device = torch.device('cuda')  # The device to use
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Extend the default cache limit. In the future, torch will support compilation with 'more dynamic shapes',
# and therefore many recompilations will be avoided
torch._dynamo.config.cache_size_limit = 32

In [7]:
from cirkit.templates.region_graph import RegionGraph, RandomBinaryTree
from cirkit.pipeline import PipelineContext
from cirkit.symbolic.circuit import Circuit
from cirkit.utils.scope import Scope
from cirkit.symbolic.layers import GaussianLayer, DenseLayer, HadamardLayer
from cirkit.symbolic.parameters import Parameter, SoftmaxParameter, ScaledSigmoidParameter, TensorParameter
from cirkit.symbolic.initializers import UniformInitializer, NormalInitializer
import cirkit.symbolic.functional as SF
from cirkit.backend.torch.circuits import TorchCircuit
from cirkit.backend.torch.layers.inner import TorchMixingLayer

In [8]:
class SOS(nn.Module):
    """Sum-of-squares probabilistic circuit: a uniform mixture of squared circuits.

    One symbolic circuit is built for each of ``num_squares`` random binary-tree region
    graphs. The circuits are merged into a single multi-output circuit, and their squares
    are merged and integrated to obtain the normalization terms. Both are compiled with
    the ``complex-lse-sum`` semiring of the torch backend.

    Args:
        num_variables: Number of input variables; must be greater than 1.
        num_input_units: Forwarded to ``Circuit.from_region_graph`` as ``num_input_units``.
        num_sum_units: Forwarded to ``Circuit.from_region_graph`` as ``num_sum_units``.
        num_squares: Number of squared circuits in the mixture; must be greater than 1.
        seed: Base seed; the i-th region graph is built with ``seed + i * 123``.
        optimize: Forwarded to ``PipelineContext(optimize=...)``.
    """

    def __init__(
        self,
        num_variables: int,
        *,
        num_input_units: int,
        num_sum_units: int,
        num_squares: int = 2,
        seed: int = 42,
        optimize: bool = False
    ) -> None:
        assert num_variables > 1, "num_variables must be greater than 1"
        assert num_squares > 1, "num_squares must be greater than 1"
        super().__init__()
        # Depth passed to every random binary tree: ceil(log2(num_variables))
        max_depth = int(np.ceil(np.log2(num_variables)))
        # One region graph per squared circuit, each built with a different seed
        self._rgs = [
            SOS._build_region_graph(num_variables, max_depth, seed=seed + i * 123)
            for i in range(num_squares)
        ]
        self._ctx = PipelineContext(backend='torch', semiring='complex-lse-sum', fold=True, optimize=optimize)
        circuit, int_sq_circuit = self.build_circuits(
            num_input_units=num_input_units, num_sum_units=num_sum_units
        )
        self._circuit = circuit
        self._int_sq_circuit = int_sq_circuit
        # Log of the uniform mixing weight 1 / num_squares, stored as a buffer so that it
        # follows the module across devices
        self.register_buffer('_mixing_log_weight', -torch.log(torch.tensor(num_squares)))
        # Integrals of the squared circuits, cached only while in evaluation mode (see train()).
        # NOTE(review): this is a plain attribute, not a buffer, so it is presumably not moved
        # by a later ``.to(device)`` call -- confirm if the model is moved after ``eval()``.
        self.__cache_log_z: Optional[Tensor] = None

    def train(self, mode: bool = True):
        """Switch between training and evaluation mode and manage the cached integrals.

        Entering training mode drops the cache. Entering evaluation mode evaluates the
        integrated squared circuit once, without gradients, so that ``log_likelihood``
        can reuse the result.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            Whatever ``nn.Module.train`` returns (the module itself), so that the results
            of ``model.train()`` and ``model.eval()`` can be assigned or chained.
        """
        if mode:
            self.__cache_log_z = None
        else:
            with torch.no_grad():
                self.__cache_log_z = self._int_sq_circuit().real
        return super().train(mode)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the unnormalized log-score of the mixture of squared circuits.

        Args:
            x: Input batch passed to the compiled circuit.

        Returns:
            The log-sum-exp over dimension 1 of the mixing log-weight plus twice the real
            part of the circuit output.
        """
        # Twice the real part of the circuit output; assuming the output is a complex
        # log-value, this is the log of its squared modulus -- TODO confirm
        log_score = 2.0 * self._circuit(x).real
        return torch.logsumexp(self._mixing_log_weight + log_score, dim=1)

    def log_likelihood(self, x: Tensor) -> Tensor:
        """Compute the normalized log-likelihood of a batch.

        The normalization term is read from the cache when it is available (evaluation
        mode) and recomputed from the integrated squared circuit otherwise.

        Args:
            x: Input batch passed to ``forward``.

        Returns:
            ``forward(x)`` minus the log-normalizer of the mixture.
        """
        if self.__cache_log_z is None:
            log_z = self._int_sq_circuit().real
        else:
            log_z = self.__cache_log_z
        # Combine the per-circuit integrals with the same uniform mixing weight
        log_z = torch.logsumexp(self._mixing_log_weight + log_z, dim=0)
        log_score = self.forward(x)
        return log_score - log_z

    def build_circuits(
        self, num_input_units: int, num_sum_units: int,
    ) -> Tuple[TorchCircuit, TorchCircuit]:
        """Build and compile the circuit and the integral of its squares.

        Args:
            num_input_units: Forwarded to ``_build_symbolic_circuit``.
            num_sum_units: Forwarded to ``_build_symbolic_circuit``.

        Returns:
            A pair ``(circuit, int_sq_circuit)``: the compiled multi-output circuit and the
            compiled circuit that integrates the square of each output.
        """
        # Build one symbolic circuit for each region graph
        symbolic_circuits = [
            SOS._build_symbolic_circuit(rg, num_input_units=num_input_units, num_sum_units=num_sum_units)
            for rg in self._rgs
        ]

        # Merge the symbolic circuits into a single one having multiple outputs
        symbolic_circuit = SF.merge(symbolic_circuits)

        # Square each symbolic circuit and merge them into a single one having multiple outputs
        symbolic_sq_circuit = SF.merge([
            SF.multiply(sc, sc) for sc in symbolic_circuits
        ])

        # Integrate the squared circuits (by integrating the merged symbolic representation)
        symbolic_int_sq_circuit = SF.integrate(symbolic_sq_circuit)

        # Compile the symbolic circuits
        circuit = self._ctx.compile(symbolic_circuit)
        int_sq_circuit = self._ctx.compile(symbolic_int_sq_circuit)

        return circuit, int_sq_circuit

    @staticmethod
    def _build_region_graph(num_variables:int, depth: int, seed: int = 42) -> RegionGraph:
        """Build a random binary-tree region graph over ``num_variables`` variables."""
        return RandomBinaryTree(num_variables, depth=depth, seed=seed)

    @staticmethod
    def _build_symbolic_circuit(rg: RegionGraph, *, num_input_units: int, num_sum_units: int) -> Circuit:
        """Build a symbolic circuit from a region graph.

        The circuit uses Gaussian input layers, dense sum layers and Hadamard product layers.

        Args:
            rg: The region graph that defines the circuit structure.
            num_input_units: Forwarded to ``Circuit.from_region_graph``.
            num_sum_units: Forwarded to ``Circuit.from_region_graph``.

        Returns:
            The symbolic circuit.
        """
        def gaussian_layer_factory(
            scope: Scope,
            num_units: int,
            num_channels: int
        ) -> GaussianLayer:
            # The standard deviation is a tensor parameter passed through a scaled sigmoid
            # configured with vmin=1e-5 and vmax=1.0
            return GaussianLayer(
                scope, num_units, num_channels,
                stddev_factory=lambda shape: Parameter.from_sequence(
                    TensorParameter(*shape, initializer=NormalInitializer(0.0, 1e-1)),
                    ScaledSigmoidParameter(shape, vmin=1e-5, vmax=1.0)
                )
            )

        def hadamard_layer_factory(
            scope: Scope, num_input_units: int, arity: int
        ) -> HadamardLayer:
            return HadamardLayer(scope, num_input_units, arity)

        def dense_layer_factory(
            scope: Scope,
            num_input_units: int,
            num_output_units: int
        ) -> DenseLayer:
            # The weights are an unconstrained tensor parameter with UniformInitializer(0.0, 1.0)
            return DenseLayer(
                scope, num_input_units, num_output_units,
                weight_factory=lambda shape: Parameter.from_leaf(
                    TensorParameter(*shape, initializer=UniformInitializer(0.0, 1.0))
                )
            )

        return Circuit.from_region_graph(
            rg,
            num_input_units=num_input_units,
            num_sum_units=num_sum_units,
            input_factory=gaussian_layer_factory,
            sum_factory=dense_layer_factory,
            prod_factory=hadamard_layer_factory
        )

In [9]:
# Build a mixture of 8 squared circuits, passing 32 as both num_input_units and num_sum_units
sos = SOS(num_variables, num_input_units=32, num_sum_units=32, num_squares=8)

In [10]:
# Show the nested module structure of the model
print(sos)

SOS(
  (_circuit): TorchCircuit(
    (_nodes): ModuleList(
      (0): TorchGaussianLayer(
        (mean): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
          )
        )
        (stddev): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
            (8): TorchScaledSigmoidParameter()
          )
        )
      )
      (1): TorchDenseLayer(
        (weight): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
          )
        )
      )
      (2): TorchHadamardLayer()
      (3): TorchDenseLayer(
        (weight): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
 

## Training and Testing

We are now ready to learn the parameters and do inference.
First, we wrap our data into PyTorch data loaders by specifying the batch size.
Then, we initialize any PyTorch optimizer, e.g., Adam.

Note that the integral of the squared circuit shares its parameters with the circuit itself.

In [11]:
from torch import optim
from torch.utils.data import DataLoader

In [12]:
# Training loader: shuffled mini-batches of 512 samples, dropping the last incomplete batch
train_dataloader = DataLoader(
    data_train,
    batch_size=512,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)
# Test loader: same batch size, original order, keeping every sample
test_dataloader = DataLoader(
    data_test,
    batch_size=512,
    shuffle=False,
    num_workers=4,
)
# Adam over all the parameters of the model
optimizer = optim.Adam(sos.parameters(), lr=1e-2)

In [13]:
# Move the model (its compiled circuits and the mixing-weight buffer) to the chosen device
sos = sos.to(device)

In [14]:
import time

In [15]:
sos.train()
start_time = time.perf_counter()
num_epochs = 25
log_interval = 300      # Number of optimization steps between two progress reports
step_idx = 0
running_loss = 0.0      # Sum of the per-sample NLLs accumulated since the last report
running_samples = 0     # Number of samples accumulated since the last report
for epoch_idx in range(num_epochs):
    for batch in train_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        lls = sos.log_likelihood(batch)             # Compute the log-likelihood
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Weight the batch-average loss by the batch size, and count the samples actually seen,
        # so that the reported average stays correct even if the batches differ in size
        running_loss += loss.detach() * len(batch)
        running_samples += len(batch)
        step_idx += 1
        if step_idx % log_interval == 0:
            print(f"Epoch {epoch_idx}, step {step_idx}: Average NLL: {running_loss / running_samples:.3f}")
            running_loss = 0.0
            running_samples = 0
end_time = time.perf_counter()
print(f"Training time: {end_time - start_time:.1f} seconds")

Epoch 5, step 300: Average NLL: 47.161
Epoch 10, step 600: Average NLL: 39.575
Epoch 15, step 900: Average NLL: 38.443
Epoch 21, step 1200: Average NLL: 37.760
Training time: 224.2 seconds


Next, we check the likelihood on the test data.

In [16]:
# Switch to evaluation mode; SOS.train(False) caches the integrals of the squared circuits
sos.eval()
test_lls = 0.0
with torch.no_grad():
    # Accumulate the sum of the log-likelihoods over all the test batches
    for batch in test_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        lls = sos.log_likelihood(batch)             # Compute the log-likelihood
        test_lls += lls.sum().item()
# Average over the number of test samples
average_ll = test_lls / len(data_test)
print(f"Average test LL: {average_ll:.3f}")

Average test LL: -37.120


# Bonus: Optimize your Circuit

In [17]:
import torch
from torch import nn, Tensor
# Use the GPU when one is available and fall back to the CPU otherwise,
# so that the notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # The device to use
# Reset the torch seeds before building the second model
torch.manual_seed(42)
torch.cuda.manual_seed(42)  # Safe to call without CUDA: it is silently ignored in that case

In [18]:
# Same configuration as before, but with optimize=True forwarded to the pipeline context
sos = SOS(num_variables, num_input_units=32, num_sum_units=32, num_squares=8, optimize=True)

In [19]:
# Show the nested module structure of the model built with optimize=True
print(sos)

SOS(
  (_circuit): TorchCircuit(
    (_nodes): ModuleList(
      (0): TorchGaussianLayer(
        (mean): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
          )
        )
        (stddev): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
            (8): TorchScaledSigmoidParameter()
          )
        )
      )
      (1): TorchDenseLayer(
        (weight): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
          )
        )
      )
      (2-7): 6 x TorchCPLayer(
        (weight): TorchParameter(
          (_nodes): ModuleList(
            (0-7): 8 x TorchPointerParameter(
              (_parameter): TorchTensorParameter()
            )
          )
    

In [20]:
from torch import optim
from torch.utils.data import DataLoader

In [21]:
# Training loader: shuffled mini-batches of 512 samples, dropping the last incomplete batch
train_dataloader = DataLoader(
    data_train,
    batch_size=512,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)
# Test loader: same batch size, original order, keeping every sample
test_dataloader = DataLoader(
    data_test,
    batch_size=512,
    shuffle=False,
    num_workers=4,
)
# Adam over all the parameters of the newly built model
optimizer = optim.Adam(sos.parameters(), lr=1e-2)

In [22]:
# Move the model (its compiled circuits and the mixing-weight buffer) to the chosen device
sos = sos.to(device)

In [23]:
sos.train()
start_time = time.perf_counter()
num_epochs = 25
log_interval = 300      # Number of optimization steps between two progress reports
step_idx = 0
running_loss = 0.0      # Sum of the per-sample NLLs accumulated since the last report
running_samples = 0     # Number of samples accumulated since the last report
for epoch_idx in range(num_epochs):
    for batch in train_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        lls = sos.log_likelihood(batch)             # Compute the log-likelihood
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Weight the batch-average loss by the batch size, and count the samples actually seen,
        # so that the reported average stays correct even if the batches differ in size
        running_loss += loss.detach() * len(batch)
        running_samples += len(batch)
        step_idx += 1
        if step_idx % log_interval == 0:
            print(f"Epoch {epoch_idx}, step {step_idx}: Average NLL: {running_loss / running_samples:.3f}")
            running_loss = 0.0
            running_samples = 0
end_time = time.perf_counter()
print(f"Training time: {end_time - start_time:.1f} seconds")

Epoch 5, step 300: Average NLL: 47.162
Epoch 10, step 600: Average NLL: 39.652
Epoch 15, step 900: Average NLL: 38.197
Epoch 21, step 1200: Average NLL: 37.610
Training time: 63.0 seconds


Note that enabling optimizations at compile time gave us a ~3.5x speed-up during training.
Next, we check the likelihood on the test data.

In [24]:
# Switch to evaluation mode; SOS.train(False) caches the integrals of the squared circuits
sos.eval()
test_lls = 0.0
with torch.no_grad():
    # Accumulate the sum of the log-likelihoods over all the test batches
    for batch in test_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        lls = sos.log_likelihood(batch)             # Compute the log-likelihood
        test_lls += lls.sum().item()
# Average over the number of test samples
average_ll = test_lls / len(data_test)
print(f"Average test LL: {average_ll:.3f}")

Average test LL: -37.058
