# Scaling of the errors in the symmetric 2D ALE plot

As the number of samples increases, the discretisation errors decrease, as expected. This shows that the method is working as intended: it produces opposite second-order effects in the two halves of the `b` quantiles.

In [None]:
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import product

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from alepython.ale import (
    _get_centres,
    _get_quantiles,
    first_order_ale_quant,
    second_order_ale_quant,
)

In [None]:
def interaction_predictor(X):
    """Evaluate a toy model whose a-b interaction changes sign at b = 0.5.

    The prediction is ``a * b`` where ``b <= 0.5`` and ``-a * (1 - b)``
    everywhere else.

    Parameters
    ----------
    X : mapping
        Anything indexable with the keys "a" and "b" that yields array-like
        columns of equal length (e.g. a pandas DataFrame).

    Returns
    -------
    numpy.ndarray
        One prediction per row.

    """
    a = np.asarray(X["a"])
    b = np.asarray(X["b"])

    # Let NumPy derive the result dtype from the arithmetic instead of
    # inheriting it from `a`; an integer-valued `a` column would otherwise
    # silently truncate the (generally fractional) products.
    return np.where(b <= 0.5, a * b, -a * (1 - b))


def test_ale_mirror(N=int(1e3), seed=1, nbins=3):
    """Measure how closely the 2nd-order ALE of mirrored data is itself mirrored.

    A dataset is built with `b` evenly spaced on [0, 1] and `a` mirrored in
    sample order (``a[i] == a[N - 1 - i]``). The second-order ALE of
    `interaction_predictor` is then computed, and the lower half of the `b`
    quantile edges is compared with the upper half reversed along both axes.

    Parameters
    ----------
    N : int
        Number of samples. Both even and odd values are supported.
    seed : int
        Seed passed to ``np.random.seed`` before `a` is drawn.
    nbins : int
        Number of bins requested from the ALE routines. Must be odd.

    Returns
    -------
    tuple
        ``(mean, std)`` of the absolute differences between the two halves,
        or ``(None, None)`` if the quantile computation for `b` did not
        report exactly `nbins` bins.

    Raises
    ------
    AssertionError
        If `nbins` is even, or if the computed `b` quantile edges are not
        symmetric about 0.5 (within an absolute tolerance of 1e-2).

    """
    # nbins needs to be an odd number to yield an even number of edges, so
    # that the edges can be split into two halves of equal size.
    assert nbins % 2 == 1, "nbins must be odd"

    np.random.seed(seed)
    b = np.linspace(0, 1, N)

    # NOTE(review): element [1] of the `_get_quantiles` result is presumably
    # the number of bins actually obtained - confirm against alepython.
    n_unique = _get_quantiles(pd.DataFrame({"b": b}), "b", nbins)[1]
    if n_unique != nbins:
        # Likely a floating point error in the quantile calculation.
        return None, None

    # Ensure the data is mirrored around b=0.5. Draw ceil(N / 2) values and
    # mirror the first floor(N / 2) of them: for even N this is the same draw
    # as before, while for odd N the extra value becomes the unpaired middle
    # sample so that `a` and `b` always have the same length.
    half = np.random.random((N + 1) // 2) * 2
    a = np.append(half, half[: N // 2][::-1])
    X = pd.DataFrame({"a": a, "b": b})

    # The third return value is not needed here.
    quantiles_list, ale, _ = second_order_ale_quant(
        interaction_predictor, X, X.columns, nbins
    )

    b_quantiles = quantiles_list[1]
    # Index separating the lower and upper halves of the nbins + 1 edges.
    n_half = (nbins + 1) // 2

    # Sanity check: the `b` edges themselves must be mirrored about 0.5.
    assert np.allclose(
        b_quantiles[:n_half],
        1 - np.array(b_quantiles[n_half:][::-1]),
        atol=1e-2,
    )

    # Compare the lower `b` half with the upper half flipped along both axes.
    diff = np.abs(ale[:, :n_half] - ale[:, n_half:][::-1, ::-1])
    return np.mean(diff), np.std(diff)

In [None]:
# Parameter grid: sample counts, seeds and (odd) bin counts.
Nlist = [int(1e5), int(1e6), int(1e7)]
seed_values = np.arange(20)
nbins_values = (np.linspace(1, 16, 3, dtype=np.int64) * 2) + 1

# Materialise every (N, seed, nbins) combination once so that the parameter
# columns and the submitted jobs are guaranteed to share the same order.
combinations = list(product(Nlist, seed_values, nbins_values))

data = defaultdict(list)
for N, seed, nbins in combinations:
    # Store N as a compact label (scientific notation) for use on plot axes.
    n_label = np.format_float_scientific(
        N, precision=0, unique=False, exp_digits=1, sign=False
    )
    data["N"].append(n_label)
    data["seed"].append(seed)
    data["nbins"].append(nbins)

with ProcessPoolExecutor(max_workers=None) as executor:
    fs = [executor.submit(test_ale_mirror, *combo) for combo in combinations]

    # Drain the completion iterator purely to drive the progress bar; the
    # results are read afterwards, in submission order.
    progress = tqdm(as_completed(fs), total=len(fs), desc="Processing")
    for _ in progress:
        pass

# Finally, append the results themselves to the dictionary, in the same order
# as the parameter columns above.
for f in fs:
    mean, std = f.result()
    data["mean"].append(mean)
    data["std"].append(std)

In [None]:
# Turn the collected columns in `data` into a single table.
df = pd.DataFrame(data)
# Bare expression: shows the table when this is run as a notebook cell.
df

## The mean error decreases with an increasing number of samples per bin

### Thus, for a fixed number of samples, it increases with the number of bins (`nbins`)

In [None]:
# Use a wide default figure so all (N, nbins) groups fit side by side.
mpl.rcParams["figure.figsize"] = (15, 8)

# One box-plot panel per error statistic, grouped by sample and bin count.
axes = df.boxplot(column=["mean", "std"], by=["N", "nbins"])
for panel in axes:
    # Tilt the group labels on the x-axis so they do not overlap.
    panel.tick_params(axis="x", labelrotation=45)

### The errors should show no systematic dependence on the seed, as the exact sequence of pseudorandom numbers should not affect the estimate

In [None]:
# Use a wide default figure so all seed groups fit side by side.
mpl.rcParams["figure.figsize"] = (15, 8)

# One box-plot panel per error statistic, grouped by random seed.
axes = df.boxplot(column=["mean", "std"], by=["seed"])
for panel in axes:
    # Tilt the group labels on the x-axis so they do not overlap.
    panel.tick_params(axis="x", labelrotation=45)