# Train and evaluate a squared PC ($\mathrm{NPC}^2$) with real parameters

In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

Set the random seeds.

In [2]:
# Seed the global RNGs of Python's `random` module and of NumPy with the same fixed value
random.seed(42)
np.random.seed(42)

# Load the MiniBooNE UCI data set

In [3]:
from datasets import load_uci_dataset

In [4]:
# Load the MiniBooNE data set from the local `datasets` directory;
# the returned mapping is indexed by split name.
data = load_uci_dataset('miniboone', path='datasets')
data_train = data['train']
data_test = data['test']

# The second dimension of the training split is taken as the number of variables
num_variables = data_train.shape[1]
print(f'Number of variables: {num_variables}')

Number of variables: 43


# Instantiating a Circuit structure Template: the Region Graph

Instantiate a _random binary tree_ region graph of maximum depth, using a fixed seed.

In [5]:
from cirkit.templates.region_graph import RandomBinaryTree
# Depth is ceil(log2(num_variables)); the accompanying text refers to this as the maximum depth
max_depth = int(np.ceil(np.log2(num_variables)))
# NOTE(review): the fixed seed presumably makes the random tree structure repeatable — confirm against RandomBinaryTree
region_graph = RandomBinaryTree(num_variables, depth=max_depth, seed=42)

## Constructing the Symbolic Circuit Representation

We construct a non-monotonic PC with Gaussian distributions as input.

In [6]:
from cirkit.utils.scope import Scope
from cirkit.symbolic.layers import GaussianLayer, DenseLayer, HadamardLayer
from cirkit.symbolic.parameters import Parameter, TensorParameter, ScaledSigmoidParameter
from cirkit.symbolic.initializers import UniformInitializer, NormalInitializer

In [7]:
def gaussian_layer_factory(scope: Scope, num_units: int, num_channels: int) -> GaussianLayer:
    """Create a Gaussian input layer with a custom standard-deviation parameterization.

    Args:
        scope: Forwarded unchanged to ``GaussianLayer``.
        num_units: Forwarded unchanged to ``GaussianLayer``.
        num_channels: Forwarded unchanged to ``GaussianLayer``.

    Returns:
        A ``GaussianLayer`` whose ``stddev_factory`` chains a tensor parameter
        (initialized with ``NormalInitializer(0.0, 1e-1)``) into a
        ``ScaledSigmoidParameter`` configured with ``vmin=1e-5`` and ``vmax=1.0``.
    """

    def make_stddev(shape):
        # Free tensor first, then the scaled sigmoid applied on top of it
        # (presumably mapping the values into [vmin, vmax] — confirm in cirkit).
        raw_tensor = TensorParameter(*shape, initializer=NormalInitializer(0.0, 1e-1))
        scaled_sigmoid = ScaledSigmoidParameter(shape, vmin=1e-5, vmax=1.0)
        return Parameter.from_sequence(raw_tensor, scaled_sigmoid)

    return GaussianLayer(scope, num_units, num_channels, stddev_factory=make_stddev)

def hadamard_layer_factory(scope: Scope, num_input_units: int, arity: int) -> HadamardLayer:
    """Create a Hadamard product layer.

    All three arguments are forwarded positionally and unchanged to ``HadamardLayer``.
    """
    layer = HadamardLayer(scope, num_input_units, arity)
    return layer

def dense_layer_factory(scope: Scope, num_input_units: int, num_output_units: int) -> DenseLayer:
    """Create a dense (sum) layer with a custom weight parameterization.

    Args:
        scope: Forwarded unchanged to ``DenseLayer``.
        num_input_units: Forwarded unchanged to ``DenseLayer``.
        num_output_units: Forwarded unchanged to ``DenseLayer``.

    Returns:
        A ``DenseLayer`` whose ``weight_factory`` yields a single tensor parameter
        initialized with ``UniformInitializer(0.0, 1.0)``. No further
        reparameterization is applied to the weights.
    """

    def make_weight(shape):
        weight_tensor = TensorParameter(*shape, initializer=UniformInitializer(0.0, 1.0))
        return Parameter.from_leaf(weight_tensor)

    return DenseLayer(scope, num_input_units, num_output_units, weight_factory=make_weight)

In [8]:
from cirkit.symbolic.circuit import Circuit

In [9]:
# Build the symbolic circuit from the region graph. The three factories defined
# earlier in the notebook supply the input, sum and product layers respectively;
# both the input and the sum layers are given 64 units.
symbolic_circuit = Circuit.from_region_graph(
    region_graph,
    num_input_units=64,
    num_sum_units=64,
    input_factory=gaussian_layer_factory,
    sum_factory=dense_layer_factory,
    prod_factory=hadamard_layer_factory
)

We can retrieve some information about the circuit and its structural properties as follows.

In [10]:
# Report the structural properties and the size attributes exposed by the symbolic circuit
print(f'Smooth: {symbolic_circuit.is_smooth}')
print(f'Decomposable: {symbolic_circuit.is_decomposable}')
print(f'Structured decomposable: {symbolic_circuit.is_structured_decomposable}')
print(f'Number of variables: {symbolic_circuit.num_variables}')
print(f'Number of channels per variable: {symbolic_circuit.num_channels}')

Smooth: True
Decomposable: True
Structured decomposable: True
Number of variables: 43
Number of channels per variable: 1


## Computing the Circuit Square

We now square the symbolic circuit representation and compile it. However, to actually train it, we will only need to compute the integral of the squared circuit.

To do so, we first need to set up a pipeline context and specify a backend. Since the circuit is non-monotonic, we will use the complex log-sum-exp and sum semiring. Note that in the following we also compile the integral of the squared circuit, i.e., its partition function.

In [11]:
from cirkit.pipeline import PipelineContext
from cirkit.pipeline import compile
import cirkit.symbolic.functional as SF

In [12]:
import torch
device = torch.device('cuda')  # The device to use
torch.manual_seed(42)
# Using float32 ...
#torch.set_default_dtype(torch.float64)
torch.cuda.manual_seed(42)
# Extend the default cache limit. In the future, torch will support compilation with 'more dynamic shapes',
# and therefore many recompilations will be avoided
torch._dynamo.config.cache_size_limit = 16

In [13]:
%%time
# Pipeline context holding the compilation settings used inside the `with` block below
ctx = PipelineContext(
    backend='torch',   # Choose the torch compilation backend
    fold=True,         # Fold the circuit, this is a backend-specific compilation flag
    semiring='complex-lse-sum', # Use the (C, +, *) semiring, where + is the complex log-sum-exp and * is the sum
    optimize=True      # Optimize the circuit layers (set this flag to False to disable, which yields slower inference)
)

with ctx:
    # Symbolic square: the product of the circuit with itself
    symbolic_sq_circuit = SF.multiply(symbolic_circuit, symbolic_circuit)
    # Symbolic integral of the squared circuit (the notebook text calls this the partition function)
    symbolic_int_sq_circuit = SF.integrate(symbolic_sq_circuit)
    # Only the integral of the squared circuit is compiled explicitly
    int_sq_circuit = compile(symbolic_int_sq_circuit)
    # Note that compiling the integral squared circuit will also compile the symbolic circuit
    # we started from. We retrieve its compiled module as shown below.
    circuit = ctx.get_compiled_circuit(symbolic_circuit)

CPU times: user 315 ms, sys: 28.7 ms, total: 343 ms
Wall time: 345 ms


Let's print the circuit and the integral squared circuit torch modules.

In [14]:
# Show the module structure of the compiled circuit retrieved from the pipeline context
print(circuit)

TorchCircuit(
  (_nodes): ModuleList(
    (0): TorchGaussianLayer(
      (mean): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchTensorParameter()
        )
      )
      (stddev): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchTensorParameter()
          (1): TorchScaledSigmoidParameter()
        )
      )
    )
    (1): TorchDenseLayer(
      (weight): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchTensorParameter()
        )
      )
    )
    (2-7): 6 x TorchCPLayer(
      (weight): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchTensorParameter()
        )
      )
    )
  )
)


In [15]:
# Show the module structure of the compiled integral of the squared circuit
print(int_sq_circuit)

TorchConstantCircuit(
  (_nodes): ModuleList(
    (0): TorchLogPartitionLayer(
      (value): TorchParameter(
        (_nodes): ModuleList(
          (0-1): 2 x TorchPointerParameter(
            (_parameter): TorchTensorParameter()
          )
          (2): TorchScaledSigmoidParameter()
          (3): TorchGaussianProductLogPartition()
          (4-5): 2 x TorchReduceSumParameter()
        )
      )
    )
    (1-2): 2 x TorchTensorDotLayer(
      (weight): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchPointerParameter(
            (_parameter): TorchTensorParameter()
          )
        )
      )
    )
    (3): TorchHadamardLayer()
    (4-5): 2 x TorchTensorDotLayer(
      (weight): TorchParameter(
        (_nodes): ModuleList(
          (0): TorchPointerParameter(
            (_parameter): TorchTensorParameter()
          )
        )
      )
    )
    (6): TorchHadamardLayer()
    (7-8): 2 x TorchTensorDotLayer(
      (weight): TorchParameter(
        (_nodes): M

## Training and Testing

We are now ready to learn the parameters and do inference.
First, we wrap our data into PyTorch data loaders by specifying the batch size.
Then, we initialize any PyTorch optimizer, e.g., Adam.

Note that the parameters of the integral squared circuit are the same parameters of the circuit itself.

In [16]:
from torch import optim
from torch.utils.data import DataLoader

In [17]:
# Training batches are shuffled; drop_last=True discards the final incomplete batch,
# so every training batch has exactly 256 rows.
train_dataloader = DataLoader(data_train, shuffle=True, batch_size=256, drop_last=True, num_workers=4)
# Test batches keep their order, and the final (possibly smaller) batch is retained
test_dataloader = DataLoader(data_test, shuffle=False, batch_size=256, num_workers=4)
# Only the parameters of `circuit` are passed to Adam; per the text above, the integral
# squared circuit uses the same parameters.
# NOTE(review): the optimizer is created before the circuit is moved to `device` in the next cell.
# torch.optim guidance is to move a module first and build the optimizer afterwards — confirm this ordering is safe here.
optimizer = optim.Adam(circuit.parameters(), lr=1e-3)

In [18]:
# Move both compiled modules (the circuit and the integral of its square) to the selected device
circuit = circuit.to(device)
int_sq_circuit = int_sq_circuit.to(device)

In [19]:
# Maximum-likelihood training of the squared circuit.
#
# The running average NLL is normalised by the number of samples actually seen since the
# last report. Normalising by `500 * len(batch)` instead is only correct when every batch
# has the size of the last one, i.e. it silently relies on the loader using drop_last=True.
num_epochs = 100
log_every = 500        # Number of optimisation steps between two progress reports
step_idx = 0
running_loss = 0.0     # Sum of per-sample NLLs since the last report
running_samples = 0    # Number of samples accumulated in running_loss
for epoch_idx in range(num_epochs):
    for batch in train_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        log_output = circuit(batch)                 # Compute the log output of the circuit
        log_output = 2.0 * log_output.real          # Doubling the real part of the log output squares the output
        log_pf = int_sq_circuit().real              # Log partition function; recomputed because the parameters change at every step
        lls = log_output - log_pf                   # Compute the log-likelihood
        loss = -torch.mean(lls)                     # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # loss is a per-batch mean: weight it by the batch size to accumulate a per-sample sum
        running_loss += loss.detach() * len(batch)
        running_samples += len(batch)
        step_idx += 1
        if step_idx % log_every == 0:
            print(f"Epoch {epoch_idx}, step {step_idx}: Average NLL: {running_loss / running_samples:.3f}")
            running_loss = 0.0
            running_samples = 0

Epoch 4, step 500: Average NLL: 63.504
Epoch 8, step 1000: Average NLL: 55.648
Epoch 13, step 1500: Average NLL: 52.484
Epoch 17, step 2000: Average NLL: 50.936
Epoch 21, step 2500: Average NLL: 49.458
Epoch 26, step 3000: Average NLL: 48.287
Epoch 30, step 3500: Average NLL: 49.778
Epoch 35, step 4000: Average NLL: 49.997
Epoch 39, step 4500: Average NLL: 48.489
Epoch 43, step 5000: Average NLL: 48.381
Epoch 48, step 5500: Average NLL: 48.535
Epoch 52, step 6000: Average NLL: 47.784
Epoch 57, step 6500: Average NLL: 47.131
Epoch 61, step 7000: Average NLL: 47.028
Epoch 65, step 7500: Average NLL: 46.472
Epoch 70, step 8000: Average NLL: 46.241
Epoch 74, step 8500: Average NLL: 46.224
Epoch 78, step 9000: Average NLL: 45.836
Epoch 83, step 9500: Average NLL: 46.752
Epoch 87, step 10000: Average NLL: 46.923
Epoch 92, step 10500: Average NLL: 46.105
Epoch 96, step 11000: Average NLL: 45.679


In [20]:
# Put both compiled modules in evaluation mode
circuit.eval()
int_sq_circuit.eval()

# Sum of the log-likelihoods over the whole test set
test_lls = 0.0
with torch.no_grad():
    # The parameters are fixed during evaluation, so the log partition function is computed once
    log_pf = int_sq_circuit().real
    for batch in test_dataloader:
        batch = batch.to(device).unsqueeze(dim=1)   # Add a channel dimension
        # Doubling the real part of the log output squares the output
        log_output = 2.0 * circuit(batch).real
        lls = log_output - log_pf                   # Per-sample log-likelihoods
        test_lls += lls.sum().item()

# Average over the number of test samples (not the number of batches)
average_ll = test_lls / len(data_test)
print(f"Average test LL: {average_ll:.3f}")

Average test LL: -45.314
