# Train and Visualize a PC on the Moons Dataset

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

Set the random seeds.

In [None]:
# Seed Python's and NumPy's global generators with the same value (42)
# so that repeated runs of the notebook draw the same random numbers.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

Generate the Dataset

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Draw 3000 noisy "two moons" points; `data` is the (features, labels) pair.
X, y = data = make_moons(n_samples=3000, noise=0.1, random_state=0)

# Keep 20% of the points aside as a held-out test split.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

## Instantiate the PC

In [None]:
import cirkit.templates.region_graph as rg

# Region graph over 2 variables (num_vars=2), built by the FullyFactorized algorithm.
region_graph = rg.algorithms.FullyFactorized(num_vars=2)

In [None]:
# Structural sanity checks, one per line so that a failure points at the
# exact property that does not hold.
assert region_graph.is_smooth
assert region_graph.is_decomposable
assert region_graph.is_structured_decomposable
assert region_graph.is_omni_compatible
print(region_graph)

In [None]:
from cirkit.utils.scope import Scope
from cirkit.symbolic.parameters import SoftmaxParameter, Parameter
from cirkit.symbolic.layers import GaussianLayer, DenseLayer, HadamardLayer, MixingLayer
from cirkit.symbolic.initializers import NormalInitializer

In [None]:
def gaussian_layer_factory(scope: Scope, num_units: int, num_channels: int) -> GaussianLayer:
    """Create the input layer used by the circuit.

    The three arguments are forwarded unchanged, in this order, to
    ``GaussianLayer``; every other setting of the layer is left at the
    library default.
    """
    input_layer = GaussianLayer(scope, num_units, num_channels)
    return input_layer

def hadamard_layer_factory(scope: Scope, num_input_units: int, arity: int) -> HadamardLayer:
    """Create the product layer used by the circuit.

    The three arguments are forwarded unchanged, in this order, to
    ``HadamardLayer``.
    """
    product_layer = HadamardLayer(scope, num_input_units, arity)
    return product_layer

def dense_layer_factory(scope: Scope, num_input_units: int, num_output_units: int) -> DenseLayer:
    """Create the sum (dense) layer used by the circuit.

    The layer's parameter is wrapped in a ``SoftmaxParameter`` of the same
    shape and initialised with ``NormalInitializer(0.0, 1e-2)``.
    """

    def softmax_reparam(p):
        # Wrap the raw parameter `p` in a softmax node of matching shape.
        return Parameter.from_unary(SoftmaxParameter(p.shape), p)

    return DenseLayer(
        scope,
        num_input_units,
        num_output_units,
        parameterization=softmax_reparam,
        initializer=NormalInitializer(0.0, 1e-2),
    )


def mixing_layer_factory(scope: Scope, num_units: int, arity: int) -> MixingLayer:
    """Create the mixing layer used by the circuit.

    The layer's parameter is wrapped in a ``SoftmaxParameter`` of the same
    shape and initialised with ``NormalInitializer(0.0, 1e-2)``.
    """

    def softmax_reparam(p):
        # Wrap the raw parameter `p` in a softmax node of matching shape.
        return Parameter.from_unary(SoftmaxParameter(p.shape), p)

    return MixingLayer(
        scope,
        num_units,
        arity,
        parameterization=softmax_reparam,
        initializer=NormalInitializer(0.0, 1e-2),
    )

In [None]:
from cirkit.symbolic.circuit import Circuit

In [None]:
# One builder per layer kind; each is passed to from_region_graph by keyword.
layer_factories = dict(
    input_factory=gaussian_layer_factory,
    sum_factory=dense_layer_factory,
    prod_factory=hadamard_layer_factory,
    mixing_factory=mixing_layer_factory,
)

# Build the symbolic circuit from the region graph, requesting 32 input
# units and 32 sum units.
symbolic_circuit = Circuit.from_region_graph(
    region_graph, num_input_units=32, num_sum_units=32, **layer_factories
)

In [None]:
import torch
# The device to use: the GPU when one is available, otherwise the CPU, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)
if device.type == 'cuda':
    # Seed the CUDA generator as well when running on the GPU.
    torch.cuda.manual_seed(42)

In [None]:
from cirkit.pipeline import PipelineContext

In [None]:
# Compilation settings:
#   backend='torch'     -> compile with the torch backend
#   fold=True           -> fold the circuit (a backend-specific compilation flag)
#   semiring='lse-sum'  -> the (R, +, *) semiring in which + is the
#                          log-sum-exp and * is the sum
ctx = PipelineContext(backend='torch', fold=True, semiring='lse-sum')

# Compile the symbolic circuit, then move the compiled circuit to `device`.
compiled = ctx.compile(symbolic_circuit)
circuit = compiled.to(device)

In [None]:
# Show the printed representation of the compiled circuit.
print(circuit)

Prepare the dataset for PyTorch: wrap the train and test splits in data loaders and create the optimizer.

In [None]:
from torch import optim
from torch.utils.data import DataLoader
# Mini-batch iterators over the raw feature arrays: training batches of 64
# in shuffled order, test batches of 256 in the original order.
train_dataloader = DataLoader(X_train, batch_size=64, shuffle=True)
test_dataloader = DataLoader(X_test, batch_size=256, shuffle=False)

# Adam over all parameters of the compiled circuit.
optimizer = optim.Adam(circuit.parameters(), lr=0.05)

Set up the grid of points on which the density will be plotted.

In [None]:
# Plotting window (min, max) along each axis and grid resolution per axis.
x1_bounds = (-1.5, 2.5)
x2_bounds = (-1, 1.5)
num_samples = 400

# Evenly spaced ticks in [0, 1], rescaled to each axis' bounds.
unit_ticks = np.linspace(0, 1, num_samples)
x1 = unit_ticks * (x1_bounds[1] - x1_bounds[0]) + x1_bounds[0]
x2 = unit_ticks * (x2_bounds[1] - x2_bounds[0]) + x2_bounds[0]

# All (x1, x2) combinations, flattened to one point per row; x1 varies fastest.
x1v, x2v = np.meshgrid(x1, x2)
X_meshgrid_np = np.column_stack((x1v.ravel(), x2v.ravel()))
X_meshgrid = torch.from_numpy(X_meshgrid_np).float()
X_meshgrid_np.shape

In [None]:
# Column-wise minimum and maximum of the grid points (reduction over dim 0),
# displayed as the cell output.
X_meshgrid.min(0).values, X_meshgrid.max(0).values

In [None]:
from matplotlib.pyplot import xlim, ylim

def print_density(title=None, scatter=False):
    """Plot the circuit's output over the plotting grid as an image.

    The circuit is evaluated on every point of ``X_meshgrid`` and the values
    it returns (named ``log_probs`` here) are drawn with ``plt.imshow`` over
    the ``x1_bounds`` x ``x2_bounds`` window.

    Args:
        title: Optional figure title; no title is set when ``None``.
        scatter: When true, overlay the training points ``X_train`` on top
            of the image.
    """
    # Plotting never needs gradients: disable autograd so that evaluating the
    # whole grid does not build (and hold memory for) a computation graph.
    with torch.no_grad():
        log_probs = circuit(X_meshgrid.to(device).float().unsqueeze(dim=1))
    # One value per grid point -> back to the 2D grid layout.
    log_probs = log_probs.reshape(num_samples, num_samples)
    # origin="lower" puts the first grid row (smallest x2) at the bottom.
    plt.imshow(log_probs.cpu().detach().numpy(), extent=(*x1_bounds, *x2_bounds), origin="lower")
    if title is not None:
        plt.title(title)
    if scatter:
        plt.scatter(X_train[:, 0], X_train[:, 1], alpha=0.5)
    plt.show()

In [None]:
# Plot the circuit's output on the grid before any training step has run.
print_density("initial density")

In [None]:
# Scatter plot of the training points, coloured by y_train, with black
# marker edges.
train_x1, train_x2 = X_train[:, 0], X_train[:, 1]
plt.scatter(train_x1, train_x2, c=y_train, edgecolors="k")
plt.title("data samples")
plt.show()

In [None]:
# Per-epoch training losses; the training loop appends one value per epoch.
losses = []

In [None]:
num_epochs = 500
# Plot 5 times and log 100 times over the run. max(1, ...) keeps the modulo
# below well-defined when num_epochs is smaller than 5 (resp. 100).
plot_every = max(1, num_epochs // 5)
log_every = max(1, num_epochs // 100)

for epoch_idx in range(num_epochs):
    running_loss = 0.0
    for batch in train_dataloader:
        batch = batch.to(device).float().unsqueeze(dim=1)  # Add a channel dimension
        log_probs = circuit(batch)
        loss = -torch.mean(log_probs)   # The loss is the negative average log-likelihood
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a detached value: adding `loss` itself would keep
        # autograd history alive across the whole epoch.
        running_loss += loss.detach() * len(batch)

    # Average NLL per training sample for this epoch, as a plain Python float.
    epoch_loss = (running_loss / len(X_train)).item()
    losses.append(epoch_loss)
    if epoch_idx % plot_every == 0:
        print_density(f"Epoch: {epoch_idx}")
    if epoch_idx % log_every == 0:
        print(f"Epoch {epoch_idx}: Average NLL: {epoch_loss:.3f}")
print_density(f"final Epoch: {epoch_idx}", scatter=False)
print_density(f"final Epoch: {epoch_idx} \nwith training data-set", scatter=True)

Evaluate the average log-likelihood on the held-out test set and, for comparison, on the training set.

In [None]:
def _total_nll(dataloader):
    """Return the negative log-likelihood summed over all batches of `dataloader`.

    Each batch contributes its mean NLL times its length, so dividing the
    result by the number of samples gives the average NLL per sample.
    """
    total = 0.0
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device).float().unsqueeze(dim=1)  # Add a channel dimension
            log_probs = circuit(batch)
            loss = -torch.mean(log_probs)   # Negative average log-likelihood of the batch
            total += loss.item() * len(batch)
    return total


test_running_loss = _total_nll(test_dataloader)
train_running_loss = _total_nll(train_dataloader)

# The accumulated quantities are *negative* log-likelihoods; flip the sign so
# that the printed numbers really are average log-likelihoods, as labelled.
test_avg_ll = -test_running_loss / len(X_test)
train_avg_ll = -train_running_loss / len(X_train)
print(f"hold-out avg log-like.: {test_avg_ll:.2f} vs train: {train_avg_ll:.2f}")

A 3D surface plot gives a better view of the resulting density.

In [None]:
from matplotlib import cbook, cm
from matplotlib.colors import LightSource
from matplotlib import colormaps


# Evaluate the circuit on the whole plotting grid. Gradients are not needed
# for plotting, so autograd is disabled to avoid building a graph over all
# grid points.
with torch.no_grad():
    log_probs = circuit(X_meshgrid.to(device).float().unsqueeze(dim=1)).detach().cpu()
# One value per grid point -> 2D array laid out like x1v / x2v.
log_probs = log_probs.reshape(num_samples, num_samples).numpy()
nrows, ncols = log_probs.shape

# Set up plot
fig, ax = plt.subplots(figsize=(15,5), subplot_kw=dict(projection='3d'))
ax.set_title("3D surface plot")
ls = LightSource(270, 25)

# To use a custom hillshading mode, override the built-in shading and pass
# in the rgb colors of the shaded surface calculated from "shade".
rgb = ls.shade(log_probs, cmap=colormaps["magma"], vert_exag=1.0, blend_mode='soft')
ax.view_init(elev=35.)
# shade=False because the face colours already contain the hillshading.
surf = ax.plot_surface(x1v, x2v, log_probs, rstride=1, cstride=1, facecolors=rgb,
                       linewidth=0, antialiased=False, shade=False)

plt.show()

The training loss (average negative log-likelihood) recorded at each epoch:

In [None]:
# Curve of the recorded training losses, one point per entry of `losses`.
plt.plot(losses)
plt.title("loss over iterations")
plt.show()

Our circuit is normalized by construction, which we can verify as follows:

In [None]:
from cirkit.pipeline import integrate
# Call integrate() inside the pipeline context that compiled `circuit`.
# NOTE(review): presumably `pc_pf` evaluates the (log) partition function of
# `circuit` — confirm against the cirkit documentation.
with ctx:
    pc_pf = integrate(circuit)

In [None]:
# Evaluate the integrated circuit and check that its output equals 0
# (up to an absolute tolerance of 1e-6, plus the default relative tolerance).
log_pf = pc_pf()
expected = torch.tensor(0.)
assert torch.isclose(log_pf, expected, atol=1e-6).all()