# MMDAgg experiments

This notebook contains the experiments with the updated version of MMDAgg.

The numbering of the experiments corresponds to the one in the file experiments.py.

The results are saved in the "results" directory.

In [None]:
import numpy as np
from seed import generate_seed
from sampling import f_theta_sampler
from mmdagg.np import mmdagg
from pathlib import Path
# Create the output directory so the np.save calls in later cells have a place to write to.
Path("results").mkdir(exist_ok=True, parents=True)
from mnist import load_mnist
# P and Q_list are notebook-level globals that the MNIST experiments read directly.
# NOTE(review): mmdagg_update, mmd_median_test, mmd_split_test, ost and autotst are
# called by the functions in this notebook but are not imported in this cell --
# confirm where they are brought into scope.
P, Q_list = load_mnist()

In [None]:
def sample_and_test_uniform(
    function_type, seed, kernel_type, approx_type, m, n, d, p, s, 
    perturbation_multiplier, alpha, l_minus, l_plus, B1, B2, B3, bandwidth_multipliers, number_bandwidths=10,
):  
    """
    Sample from uniform and perturbed uniform density and run two-sample test.
    inputs: function_type: "uniform", "increasing", "decreasing", "centred", "ost", 
                           "median", "split", "split (doubled sample sizes)", 
                           "mmdagg_update" or "autotst"
            seed: integer random seed
            kernel_type: kernel name forwarded to the selected test
            approx_type: "permutation" (for MMD_a estimate Eq. (3)) 
                         or "wild bootstrap" (for MMD_b estimate Eq. (6));
                         "wild_bootstrap" is also accepted by "mmdagg_update"
            m: non-negative integer (number of rows of X; X is drawn by 
               f_theta_sampler when p != 0 and uniformly on [0,1]^d when p == 0)
            n: non-negative integer (number of rows of Y; Y is always drawn 
               uniformly on [0,1]^d)
            d: non-negative integer (dimension of samples)
            p: non-negative integer (number of perturbations passed to 
               f_theta_sampler; p == 0 means both samples are uniform)
            s: positive number (smoothness parameter of Sobolev ball (Eq. (1))
            perturbation_multiplier: positive number (c_d in Eq. (17)) 
            alpha: real number in (0,1) (level of the test)
            l_minus: integer (for collection of bandwidths Eq. (16) in our paper)
            l_plus: integer (for collection of bandwidths Eq. (16) in our paper)
            B1: number of simulated test statistics to estimate the quantiles
            B2: number of simulated test statistics to estimate the probability in Eq. (13) in our paper
            B3: number of iterations for the bisection method
            bandwidth_multipliers: array such that mmd_split_test function (used for "split" 
                                   and "split (doubled sample sizes)") selects 'optimal' bandwidth from
                                   collection_bandwidths = [c*bandwidth_median for c in bandwidth_multipliers]
            number_bandwidths: number of bandwidths, only used by "mmdagg_update"
    output: result of test (1 for "REJECT H_0" and 0 for "FAIL TO REJECT H_0")
    raises: ValueError if function_type is not one of the values above, or if
            approx_type is not recognised for "mmdagg_update"
    """
    if function_type == "split (doubled sample sizes)":
        m = 2 * m
        n = 2 * n
    rs = np.random.RandomState(seed)
    # The order of the draws from rs is part of the reproducibility contract:
    # X first (when p == 0), then Y.
    if p == 0:
        X = rs.uniform(0, 1, (m, d)) 
        Y = rs.uniform(0, 1, (n, d))         
    else:
        X = f_theta_sampler(seed + 1, seed + 2, m, p, s, perturbation_multiplier, d)
        Y = rs.uniform(0, 1, (n, d))
    if function_type == "median":
        return mmd_median_test(
            seed, X, Y, alpha, kernel_type, approx_type, B1, bandwidth_multiplier=1
        )
    elif function_type in ["split", "split (doubled sample sizes)"]:
        return mmd_split_test(
            seed, X, Y, alpha, kernel_type, approx_type, B1, bandwidth_multipliers
        )
    elif function_type == "ost":
        return ost(seed, X, Y, alpha, kernel_type, l_minus, l_plus)
    elif function_type in ["uniform", "increasing", "decreasing", "centred"]:
        return mmdagg(
            seed, X, Y, alpha, kernel_type, approx_type, 
            function_type, l_minus, l_plus, B1, B2, B3
        )
    elif function_type == "mmdagg_update":
        if approx_type == "permutation":
            permutations_same_sample_size = True
        elif approx_type in ("wild bootstrap", "wild_bootstrap"):
            permutations_same_sample_size = False
        else:
            raise ValueError('approx_type should be "permutation" or "wild bootstrap".')
        # alpha is forwarded explicitly so the requested level is honoured
        # instead of silently falling back to the callee's default.
        # NOTE(review): assumes mmdagg_update accepts an `alpha` keyword, as the
        # public mmdagg package function does -- confirm against the import.
        return mmdagg_update(
            X,
            Y,
            alpha=alpha,
            kernel=kernel_type,
            B1=B1,
            B2=B2,
            B3=B3,
            number_bandwidths=number_bandwidths,
            seed=seed,
            permutations_same_sample_size=permutations_same_sample_size,
        )
    elif function_type == "autotst":
        tst = autotst.AutoTST(X, Y)
        p_value = tst.p_value()
        return int(p_value <= alpha)
    else:
        raise ValueError(
            'Undefined function_type: function_type should be "median", "split", '
            '"split (doubled sample sizes)", "ost", "uniform", "increasing", '
            '"decreasing", "centred", "mmdagg_update" or "autotst".'
        )  

        
def sample_and_test_mnist(
    P, Q, function_type, seed, kernel_type, approx_type, m, n, 
    alpha, l_minus, l_plus, B1, B2, B3, bandwidth_multipliers, number_bandwidths=10,
):  
    """
    Sample from dataset P and dataset Q and run two-sample test.
    inputs: P: dataset of shape (number_points, dimension) from which to sample
            Q: dataset of shape (number_points, dimension) from which to sample
            function_type: "uniform", "increasing", "decreasing", "centred", "ost", 
                           "median", "split", "split (doubled sample sizes)", 
                           "mmdagg_update" or "autotst"
            seed: integer random seed
            kernel_type: kernel name forwarded to the selected test
            approx_type: "permutation" (for MMD_a estimate Eq. (3)) 
                         or "wild bootstrap" (for MMD_b estimate Eq. (6));
                         "wild_bootstrap" is also accepted by "mmdagg_update"
            m: non-negative integer (number of rows drawn from P, with replacement)
            n: non-negative integer (number of rows drawn from Q, with replacement)
            alpha: real number in (0,1) (level of the test)
            l_minus: integer (for collection of bandwidths Eq. (16) in our paper)
            l_plus: integer (for collection of bandwidths Eq. (16) in our paper)
            B1: number of simulated test statistics to estimate the quantiles
            B2: number of simulated test statistics to estimate the probability in Eq. (13) in our paper
            B3: number of iterations for the bisection method
            bandwidth_multipliers: array such that mmd_split_test function (used for "split" 
                                   and "split (doubled sample sizes)") selects 'optimal' bandwidth from
                                   collection_bandwidths = [c*bandwidth for c in bandwidth_multipliers]
            number_bandwidths: number of bandwidths, only used by "mmdagg_update"
    output: result of test (1 for "REJECT H_0" and 0 for "FAIL TO REJECT H_0")
    raises: ValueError if function_type is not one of the values above, or if
            approx_type is not recognised for "mmdagg_update"
    """
    rs = np.random.RandomState(seed)
    if function_type == "split (doubled sample sizes)":
        m = 2 * m
        n = 2 * n 
    # Row indices are drawn with replacement; the X indices are drawn before
    # the Y indices, which matters for reproducibility with a fixed seed.
    idx_X = rs.randint(len(P), size=m)
    X = P[idx_X, :]
    idx_Y = rs.randint(len(Q), size=n)
    Y = Q[idx_Y, :]
    if function_type == "median":
        return mmd_median_test(
            seed, X, Y, alpha, kernel_type, approx_type, B1, bandwidth_multiplier=1
        )
    elif function_type in ["split", "split (doubled sample sizes)"]:
        return mmd_split_test(
            seed, X, Y, alpha, kernel_type, approx_type, B1, bandwidth_multipliers
        )
    elif function_type == "ost":
        return ost(seed, X, Y, alpha, kernel_type, l_minus, l_plus)
    elif function_type in ["uniform", "increasing", "decreasing", "centred"]:
        return mmdagg(
            seed, X, Y, alpha, kernel_type, approx_type, 
            function_type, l_minus, l_plus, B1, B2, B3
        )
    elif function_type == "mmdagg_update":
        if approx_type == "permutation":
            permutations_same_sample_size = True
        elif approx_type in ("wild bootstrap", "wild_bootstrap"):
            permutations_same_sample_size = False
        else:
            raise ValueError('approx_type should be "permutation" or "wild bootstrap".')
        # alpha is forwarded explicitly so the requested level is honoured
        # instead of silently falling back to the callee's default.
        # NOTE(review): assumes mmdagg_update accepts an `alpha` keyword, as the
        # public mmdagg package function does -- confirm against the import.
        return mmdagg_update(
            X,
            Y,
            alpha=alpha,
            kernel=kernel_type,
            B1=B1,
            B2=B2,
            B3=B3,
            number_bandwidths=number_bandwidths,
            seed=seed,
            permutations_same_sample_size=permutations_same_sample_size,
        )
    elif function_type == "autotst":
        tst = autotst.AutoTST(X, Y)
        p_value = tst.p_value()
        return int(p_value <= alpha)
    else:
        raise ValueError(
            'Undefined function_type: function_type should be "median", "split", '
            '"split (doubled sample sizes)", "ost", "uniform", "increasing", '
            '"decreasing", "centred", "mmdagg_update" or "autotst".'
        )

# Experiment 1

Figure 3a, Figure 3b

In [4]:
def experiment1(j, approx_type="wild bootstrap"):
    """
    Estimate the rejection rate of "mmdagg_update" on the perturbed uniform data.

    inputs: j: index into the list of (e, kernel, r) configurations, with e in
               {0, 1} varying slowest; e selects the dimension d = e + 1, the
               sample size, the perturbation multiplier and the number of
               perturbation levels
            approx_type: forwarded unchanged to sample_and_test_uniform
    output: list with one entry per function type (only "mmdagg_update" here);
            each entry is the list of rejection rates over N_epochs repetitions
            for 1, ..., p_num perturbations
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]

    bandwidth_multipliers = None
    sample_sizes = [500, 2000]
    N_epochs = 500
    alpha = 0.05
    delta = 1
    perturbation_multipliers = [2.7, 7.3]
    perturbations = [4, 3]
    R = [(0, 10), ]
    function_types = ["mmdagg_update", ]

    ekr = [
        (e, k, r)
        for e in range(2)
        for k in range(len(kernel_types))
        for r in range(len(R))
    ]
    e, k, r = ekr[j]

    d = e + 1
    p_num = perturbations[e]
    perturbation_multiplier = perturbation_multipliers[e]
    kernel_type = kernel_types[k]
    r_min, r_max = R[r]
    n = m = sample_sizes[e]

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    # The seed deliberately uses a fixed kernel index (1) instead of k so that
    # every kernel is evaluated on the same simulated data.
    seed_kernel_index = 1
    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for p in range(p_num):
            rejections = [
                sample_and_test_uniform(
                    function_type,
                    generate_seed(seed_kernel_index, e, r, w, p, i),
                    kernel_type,
                    approx_type,
                    m, n, d, p + 1, delta, perturbation_multiplier,
                    alpha, r_min, r_max, B1, B2, B3,
                    bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                )
                for i in range(N_epochs)
            ]
            power_w.append(sum(rejections) / N_epochs)
        power.append(power_w)
    print(power)

    return power

In [6]:
# Experiment 1, configuration 0: run with the default approx_type and save power[0] to results/exp1_0.npy.
i = 0
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.988, 0.602, 0.228]]


In [7]:
# Experiment 1, configuration 1: run with the default approx_type and save power[0] to results/exp1_1.npy.
i = 1
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.99, 0.596, 0.216]]


In [8]:
# Experiment 1, configuration 2: run with the default approx_type and save power[0] to results/exp1_2.npy.
i = 2
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.988, 0.602, 0.226]]


In [9]:
# Experiment 1, configuration 3: run with the default approx_type and save power[0] to results/exp1_3.npy.
i = 3
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.988, 0.602, 0.228]]


In [10]:
# Experiment 1, configuration 4: run with the default approx_type and save power[0] to results/exp1_4.npy.
i = 4
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.982, 0.26]]


In [11]:
# Experiment 1, configuration 5: run with the default approx_type and save power[0] to results/exp1_5.npy.
i = 5
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.934, 0.134]]


In [12]:
# Experiment 1, configuration 6: run with the default approx_type and save power[0] to results/exp1_6.npy.
i = 6
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 2000
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.982, 0.266]]


In [13]:
# Experiment 1, configuration 7: run with the default approx_type and save power[0] to results/exp1_7.npy.
i = 7
power = experiment1(i)
np.save("results/exp1_" + str(i) + ".npy", power[0])

sample size 2000
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.978, 0.252]]


# Experiment 1 bis

Figure 5a, Figure 5b

In [14]:
# Experiment 1 bis, configuration 0: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_0.npy.
i = 0
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.988, 0.62, 0.23]]


In [15]:
# Experiment 1 bis, configuration 1: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_1.npy.
i = 1
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.99, 0.606, 0.21]]


In [16]:
# Experiment 1 bis, configuration 2: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_2.npy.
i = 2
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.986, 0.622, 0.23]]


In [17]:
# Experiment 1 bis, configuration 3: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_3.npy.
i = 3
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.986, 0.618, 0.23]]


In [18]:
# Experiment 1 bis, configuration 4: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_4.npy.
i = 4
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.984, 0.262]]


In [19]:
# Experiment 1 bis, configuration 5: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_5.npy.
i = 5
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.942, 0.14]]


In [20]:
# Experiment 1 bis, configuration 6: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_6.npy.
i = 6
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 2000
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.984, 0.264]]


In [21]:
# Experiment 1 bis, configuration 7: run experiment1 with approx_type "permutation" and save power[0] to results/exp1bis_7.npy.
i = 7
power = experiment1(i, "permutation")
np.save("results/exp1bis_" + str(i) + ".npy", power[0])

sample size 2000
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 0.982, 0.25]]


# Experiment 2

Figure 3c

In [14]:
# Load the MNIST data; P and Q_list are notebook-level globals that experiment2 reads directly.
from mnist import load_mnist
P, Q_list = load_mnist()

In [15]:
def experiment2(j, approx_type="wild bootstrap"):
    """
    Estimate the rejection rate of "mmdagg_update" on MNIST, P against each Q in Q_list.

    Reads the notebook-level globals P and Q_list.

    inputs: j: index of the kernel in
               ["laplace_gaussian", "laplace", "gaussian", "all"]
            approx_type: forwarded unchanged to sample_and_test_mnist
    output: list with one entry per function type (only "mmdagg_update" here);
            each entry is the list of rejection rates over N_epochs repetitions,
            one per element of Q_list
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]

    bandwidth_multipliers = None
    n = m = 500
    N_epochs = 500
    alpha = 0.05
    r_num = 1
    function_types = ["mmdagg_update", ]

    kr = [(k, r) for k in range(len(kernel_types)) for r in range(r_num)]
    k, r = kr[j]

    kernel_type = kernel_types[k]
    # Passed as l_minus / l_plus; the "mmdagg_update" branch of
    # sample_and_test_mnist does not read them.
    r_min, r_max = 0, 1

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    # The seed deliberately uses a fixed kernel index (1) instead of k so that
    # every kernel is evaluated on the same sampled data.
    seed_kernel_index = 1
    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for q, Q in enumerate(Q_list):
            rejections = [
                sample_and_test_mnist(
                    P, Q, function_type,
                    generate_seed(seed_kernel_index, 2, r, w, q, i),
                    kernel_type, approx_type, m, n,
                    alpha, r_min, r_max, B1, B2, B3,
                    bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                )
                for i in range(N_epochs)
            ]
            power_w.append(sum(rejections) / N_epochs)
        power.append(power_w)
    print(power)

    return power

In [11]:
# Experiment 2, configuration 0: run with the default approx_type and save power[0] to results/exp2_0.npy.
i = 0
power = experiment2(i)
np.save("results/exp2_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.996, 0.89, 0.248]]


In [9]:
# Experiment 2, configuration 1: run with the default approx_type and save power[0] to results/exp2_1.npy.
i = 1
power = experiment2(i)
np.save("results/exp2_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.898, 0.242]]


In [12]:
# Experiment 2, configuration 2: run with the default approx_type and save power[0] to results/exp2_2.npy.
i = 2
power = experiment2(i)
np.save("results/exp2_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.902, 0.252]]


In [70]:
# Experiment 2, configuration 3: run with the default approx_type and save power[0] to results/exp2_3.npy.
i = 3
power = experiment2(i)
np.save("results/exp2_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50

[[1.0, 1.0, 0.998, 0.884, 0.246]]


# Experiment 2 bis

Figure 5c

In [16]:
# Experiment 2 bis, configuration 0: run experiment2 with approx_type "permutation" and save power[0] to results/exp2bis_0.npy.
i = 0
power = experiment2(i, "permutation")
np.save("results/exp2bis_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.884, 0.244]]


In [17]:
# Experiment 2 bis, configuration 1: run experiment2 with approx_type "permutation" and save power[0] to results/exp2bis_1.npy.
i = 1
power = experiment2(i, "permutation")
np.save("results/exp2bis_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.902, 0.24]]


In [18]:
# Experiment 2 bis, configuration 2: run experiment2 with approx_type "permutation" and save power[0] to results/exp2bis_2.npy.
i = 2
power = experiment2(i, "permutation")
np.save("results/exp2bis_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.892, 0.246]]


In [19]:
# Experiment 2 bis, configuration 3: run experiment2 with approx_type "permutation" and save power[0] to results/exp2bis_3.npy.
i = 3
power = experiment2(i, "permutation")
np.save("results/exp2bis_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[1.0, 1.0, 0.998, 0.878, 0.236]]


# Experiment 5

In [19]:
def experiment5(j, approx_type="wild bootstrap"):
    """
    Estimate the rejection rate of "mmdagg_update" when both samples are uniform.

    sample_and_test_uniform is called with p = 0, so X and Y are both drawn
    uniformly on [0,1]^d.

    inputs: j: index into the list of (e, kernel, r) configurations, with e in
               {0, 1} varying slowest; e selects the dimension d = e + 1 and the
               sample size
            approx_type: forwarded unchanged to sample_and_test_uniform
    output: list with one rejection rate (over N_epochs repetitions) per
            function type (only "mmdagg_update" here)
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]

    bandwidth_multipliers = np.linspace(0.1, 1, 10)
    sample_sizes = [500, 2000]
    N_epochs = 5000
    alpha = 0.05
    delta = 1
    perturbation_multipliers = [2.7, 7.3]
    R = [(-4, -0)]
    function_types = ["mmdagg_update", ]

    ekr = [
        (e, k, r)
        for e in range(2)
        for k in range(len(kernel_types))
        for r in range(len(R))
    ]
    e, k, r = ekr[j]

    d = e + 1
    perturbation_multiplier = perturbation_multipliers[e]
    kernel_type = kernel_types[k]
    r_min, r_max = R[r]
    n = m = sample_sizes[e]

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    # The seed deliberately uses a fixed kernel index (1) instead of k so that
    # every kernel is evaluated on the same simulated data.
    seed_kernel_index = 1
    power = []
    for w, function_type in enumerate(function_types):
        rejections = [
            sample_and_test_uniform(
                function_type,
                generate_seed(seed_kernel_index, e, r, w, 5, i),
                kernel_type,
                approx_type,
                m, n, d, 0, delta, perturbation_multiplier,
                alpha, r_min, r_max, B1, B2, B3,
                bandwidth_multipliers,
                number_bandwidths=number_bandwidths,
            )
            for i in range(N_epochs)
        ]
        power.append(sum(rejections) / N_epochs)
    print(power)

    return power

In [37]:
# Experiment 5, configuration 0: run with the default approx_type and save power to results/exp5_0.npy.
i = 0
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0496]


In [38]:
# Experiment 5, configuration 0 with approx_type "permutation": save power to results/exp5bis_0.npy.
i = 0
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0496]


In [39]:
# Experiment 5, configuration 1: run with the default approx_type and save power to results/exp5_1.npy.
i = 1
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0506]


In [40]:
# Experiment 5, configuration 1 with approx_type "permutation": save power to results/exp5bis_1.npy.
i = 1
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0528]


In [41]:
# Experiment 5, configuration 2: run with the default approx_type and save power to results/exp5_2.npy.
i = 2
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.05]


In [42]:
# Experiment 5, configuration 2 with approx_type "permutation": save power to results/exp5bis_2.npy.
i = 2
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0492]


In [43]:
# Experiment 5, configuration 3: run with the default approx_type and save power to results/exp5_3.npy.
i = 3
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0492]


In [44]:
# Experiment 5, configuration 3 with approx_type "permutation": save power to results/exp5bis_3.npy.
i = 3
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0494]


In [45]:
# Experiment 5, configuration 4: run with the default approx_type and save power to results/exp5_4.npy.
i = 4
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 2000
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0434]


In [46]:
# Experiment 5, configuration 4 with approx_type "permutation": save power to results/exp5bis_4.npy.
i = 4
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 2000
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0438]


In [47]:
# Experiment 5, configuration 5: run with the default approx_type and save power to results/exp5_5.npy.
i = 5
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 2000
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0458]


In [48]:
# Experiment 5, configuration 5 with approx_type "permutation": save power to results/exp5bis_5.npy.
i = 5
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 2000
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0456]


In [49]:
# Experiment 5, configuration 6: run with the default approx_type and save power to results/exp5_6.npy.
i = 6
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 2000
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0428]


In [50]:
# Experiment 5, configuration 6 with approx_type "permutation": save power to results/exp5bis_6.npy.
i = 6
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 2000
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0438]


In [51]:
# Experiment 5, configuration 7: run with the default approx_type and save power to results/exp5_7.npy.
i = 7
power = experiment5(i)
np.save("results/exp5_" + str(i) + ".npy", power)

sample size 2000
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.043]


In [30]:
# Experiment 5, configuration 7 with approx_type "permutation": save power to results/exp5bis_7.npy.
i = 7
power = experiment5(i, "permutation")
np.save("results/exp5bis_" + str(i) + ".npy", power)

sample size 2000
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50

[0.0434]


# Experiment 6

In [36]:
def experiment6(j, approx_type="wild bootstrap"):
    """
    Estimate the rejection rate of "mmdagg_update" when both samples come from P.

    Reads the notebook-level global P and passes it as both datasets to
    sample_and_test_mnist.

    inputs: j: index of the kernel in
               ["laplace_gaussian", "laplace", "gaussian", "all"]
            approx_type: forwarded unchanged to sample_and_test_mnist
    output: list with one rejection rate (over N_epochs repetitions) per
            function type (only "mmdagg_update" here)
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]

    bandwidth_multipliers = [2**i for i in range(10, 21)]
    n = m = 500
    N_epochs = 5000
    alpha = 0.05
    r_num = 1
    function_types = ["mmdagg_update", ]

    kr = [(k, r) for k in range(len(kernel_types)) for r in range(r_num)]
    k, r = kr[j]

    kernel_type = kernel_types[k]
    # Passed as l_minus / l_plus; the "mmdagg_update" branch of
    # sample_and_test_mnist does not read them.
    r_min, r_max = 1, 1

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    # The seed deliberately uses a fixed kernel index (1) instead of k so that
    # every kernel is evaluated on the same sampled data.
    seed_kernel_index = 1
    power = []
    for w, function_type in enumerate(function_types):
        rejections = [
            sample_and_test_mnist(
                P, P, function_type,
                generate_seed(seed_kernel_index, 2, r, w, 5, i),
                kernel_type, approx_type, m, n,
                alpha, r_min, r_max, B1, B2, B3,
                bandwidth_multipliers,
                number_bandwidths=number_bandwidths,
            )
            for i in range(N_epochs)
        ]
        power.append(sum(rejections) / N_epochs)
    print(power)

    return power

In [31]:
# Experiment 6, configuration 0: run with the default approx_type and save power to results/exp6_0.npy.
i = 0
power = experiment6(i)
np.save("results/exp6_" + str(i) + ".npy", power)

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50

[0.0632]


In [32]:
# Experiment 6, configuration 0 with approx_type "permutation": save power to results/exp6bis_0.npy.
i = 0
power = experiment6(i, "permutation")
np.save("results/exp6bis_" + str(i) + ".npy", power)

sample size 500
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50

[0.0628]


In [33]:
# Experiment 6, configuration 1: run with the default approx_type and save power to results/exp6_1.npy.
i = 1
power = experiment6(i)
np.save("results/exp6_" + str(i) + ".npy", power)

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50

[0.0624]


In [34]:
# Experiment 6, configuration 1 with approx_type "permutation": save power to results/exp6bis_1.npy.
i = 1
power = experiment6(i, "permutation")
np.save("results/exp6bis_" + str(i) + ".npy", power)

sample size 500
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50

[0.0612]


In [37]:
# Experiment 6, configuration 2: run with the default approx_type and save power to results/exp6_2.npy.
i = 2
power = experiment6(i)
np.save("results/exp6_" + str(i) + ".npy", power)

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0624]


In [38]:
# Experiment 6, configuration 2 with approx_type "permutation": save power to results/exp6bis_2.npy.
i = 2
power = experiment6(i, "permutation")
np.save("results/exp6bis_" + str(i) + ".npy", power)

sample size 500
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.062]


In [39]:
# Experiment 6, configuration 3: run with the default approx_type and save power to results/exp6_3.npy.
i = 3
power = experiment6(i)
np.save("results/exp6_" + str(i) + ".npy", power)

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0594]


In [41]:
# Experiment 6, configuration 3 with approx_type "permutation": save power to results/exp6bis_3.npy.
i = 3
power = experiment6(i, "permutation")
np.save("results/exp6bis_" + str(i) + ".npy", power)

sample size 500
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[0.0622]


# Experiment 7

In [47]:
def experiment7(j, approx_type="permutation"):
    """
    Estimate the rejection rate of "mmdagg_update" with unequal sample sizes m != n.

    The first sample size m is fixed per configuration and the second sample
    size n ranges over sample_sizes_n.

    inputs: j: index into the list of (e, kernel) configurations, with e in
               {0, 1} varying slowest; e selects the dimension d = e + 1, the
               sample size m, the perturbation multiplier and the number of
               perturbations (p_values[e] + 1)
            approx_type: forwarded unchanged to sample_and_test_uniform
    output: list with one entry per function type (only "mmdagg_update" here);
            each entry is the list of rejection rates over N_epochs repetitions,
            one per value of n
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]

    bandwidth_multipliers = np.linspace(0.1, 1, 10)
    sample_sizes_m = [100, 250]
    sample_sizes_n = [1000, 2000, 3000, 4000, 5000]
    p_values = [2, 1]  # p + 1 perturbations are used: 3 for d = 1, 2 for d = 2
    N_epochs = 500
    alpha = 0.05
    delta = 1
    perturbation_multipliers = [2.7, 7.3]
    function_types = ["mmdagg_update", ]

    ek = [(e, k) for e in range(2) for k in range(len(kernel_types))]
    e, k = ek[j]

    d = e + 1
    perturbation_multiplier = perturbation_multipliers[e]
    kernel_type = kernel_types[k]
    # Passed as l_minus / l_plus; the "mmdagg_update" branch of
    # sample_and_test_uniform does not read them.
    r_min, r_max = 1, 1
    p = p_values[e]
    m = sample_sizes_m[e]

    print("sample size", m)
    print("sample sizes", sample_sizes_n)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print("perturbation", p + 1)
    print("d", d)
    print(" ")

    # The seed deliberately uses a fixed kernel index (1) instead of k so that
    # every kernel is evaluated on the same simulated data.
    seed_kernel_index = 1
    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for s, n in enumerate(sample_sizes_n):
            rejections = [
                sample_and_test_uniform(
                    function_type,
                    generate_seed(seed_kernel_index, e, 3, w, s, i),
                    kernel_type,
                    approx_type,
                    m, n, d, p + 1, delta, perturbation_multiplier,
                    alpha, r_min, r_max, B1, B2, B3,
                    bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                )
                for i in range(N_epochs)
            ]
            power_w.append(sum(rejections) / N_epochs)
        power.append(power_w)
    print(power)

    return power

In [23]:
# Experiment 7, configuration 0: run with the default approx_type and save power[0] to results/exp7_0.npy.
i = 0
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 3
d 1
 
[[0.212, 0.18, 0.19, 0.222, 0.228]]


In [24]:
# Experiment 7, configuration 1: run with the default approx_type and save power[0] to results/exp7_1.npy.
i = 1
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 3
d 1
 
[[0.202, 0.178, 0.178, 0.218, 0.212]]


In [25]:
# Experiment 7, configuration 2: run with the default approx_type and save power[0] to results/exp7_2.npy.
i = 2
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 3
d 1
 
[[0.212, 0.178, 0.19, 0.226, 0.23]]


In [26]:
# Experiment 7, configuration 3: run with the default approx_type and save power[0] to results/exp7_3.npy.
i = 3
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 3
d 1
 
[[0.21, 0.178, 0.192, 0.222, 0.224]]


In [27]:
# Experiment 7, configuration 4: run with the default approx_type and save power[0] to results/exp7_4.npy.
i = 4
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 250
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 2
d 2
 
[[0.176, 0.228, 0.198, 0.234, 0.244]]


In [28]:
# Experiment 7, configuration 5: run with the default approx_type and save power[0] to results/exp7_5.npy.
i = 5
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 250
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 2
d 2
 
[[0.144, 0.174, 0.166, 0.166, 0.182]]


In [29]:
# Experiment 7, configuration 6: run with the default approx_type and save power[0] to results/exp7_6.npy.
i = 6
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 250
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 2
d 2
 
[[0.18, 0.232, 0.206, 0.236, 0.25]]


In [30]:
# Experiment 7, configuration 7: run with the default approx_type and save power[0] to results/exp7_7.npy.
i = 7
power = experiment7(i)
np.save("results/exp7_" + str(i) + ".npy", power[0])

sample size 250
sample sizes [1000, 2000, 3000, 4000, 5000]
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
perturbation 2
d 2
 
[[0.166, 0.224, 0.198, 0.226, 0.238]]


# Experiment 8

In [42]:
def experiment8(j, approx_type="permutation"):
    """
    Run MMDAgg on MNIST with m = 100 fixed and increasing sample sizes n.
    inputs: j: integer index into ["laplace_gaussian", "laplace", "gaussian", "all"]
               selecting the kernel collection
            approx_type: "permutation" or "wild bootstrap",
                         forwarded unchanged to sample_and_test_mnist
    output: list of lists of shape (1, 5): for the single function type "mmdagg_update"
            and each n in [200, 400, 600, 800, 1000], the sum of the 500 values
            returned by sample_and_test_mnist divided by 500 (estimated power)
    """
    number_bandwidths = 10
    B1 = B2 = 2000
    B3 = 50
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]
    function_types = ["mmdagg_update"]

    bandwidth_multipliers = np.linspace(0.1, 1, 10)
    m = 100
    sample_sizes_n = [200, 400, 600, 800, 1000]
    N_epochs = 500
    alpha = 0.05
    r_min, r_max = 1, 1

    kernel_type = kernel_types[j]
    # The seed uses a fixed first component rather than the kernel index j,
    # so the seeds do not depend on the selected kernel collection
    # (presumably so that all kernels are compared on the same samples).
    seed_kernel_index = 1

    print("sample size", m)
    print("sample sizes", sample_sizes_n)
    print("kernel", kernel_type)
    print("number bandwidths", number_bandwidths)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    # NOTE(review): Q is not assigned in this function; it is read from the
    # notebook's global scope, so the results depend on whichever cell last
    # defined Q. Confirm which element of Q_list is intended.
    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for s, n in enumerate(sample_sizes_n):
            outcomes = []
            for i in range(N_epochs):
                seed = generate_seed(seed_kernel_index, 2, 3, w, s, i)
                outcomes.append(sample_and_test_mnist(
                    P, Q, function_type, seed, kernel_type, approx_type, m, n,
                    alpha, r_min, r_max, B1, B2, B3, bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                ))
            power_w.append(sum(outcomes) / N_epochs)
        power.append(power_w)
    print(power)

    return power

In [43]:
# Run experiment8 with index 0 (kernel "laplace_gaussian") and save the
# single row of the returned power list to results/exp8_0.npy.
i = 0
power = experiment8(i)
np.save("results/exp8_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [200, 400, 600, 800, 1000]
kernel laplace_gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[0.3, 0.476, 0.494, 0.506, 0.552]]


In [44]:
# Run experiment8 with index 1 (kernel "laplace") and save the
# single row of the returned power list to results/exp8_1.npy.
i = 1
power = experiment8(i)
np.save("results/exp8_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [200, 400, 600, 800, 1000]
kernel laplace
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[0.312, 0.452, 0.498, 0.486, 0.53]]


In [45]:
# Run experiment8 with index 2 (kernel "gaussian") and save the single row
# of the returned power list to results/exp8_2.npy. Saving power[0] (not the
# nested list) keeps the stored array shape consistent with the other runs.
i = 2
power = experiment8(i)
np.save("results/exp8_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [200, 400, 600, 800, 1000]
kernel gaussian
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[0.32, 0.488, 0.518, 0.524, 0.564]]


In [46]:
# Run experiment8 with index 3 (kernel "all") and save the single row
# of the returned power list to results/exp8_3.npy. Saving power[0] (not the
# nested list) keeps the stored array shape consistent with the other runs.
i = 3
power = experiment8(i)
np.save("results/exp8_" + str(i) + ".npy", power[0])

sample size 100
sample sizes [200, 400, 600, 800, 1000]
kernel all
number bandwidths 10
B1 2000
B2 2000
B3 50
 
[[0.294, 0.45, 0.48, 0.492, 0.536]]


# Experiment 9

In [None]:
def experiment9(j, approx_type="wild bootstrap"):
    """
    Run MMDAgg on uniform vs perturbed uniform samples for several numbers of bandwidths.
    inputs: j: integer index in {0, ..., 7} into the list of (e, k) pairs with
               e in {0, 1} (outer) and k in {0, 1, 2, 3} (inner), where
               e selects d = e + 1, m = n = [500, 2000][e] and the
               perturbation multiplier [2.7, 7.3][e], and k selects the kernel in
               ["laplace_gaussian", "laplace", "gaussian", "all"]
            approx_type: "wild bootstrap" or "permutation",
                         forwarded unchanged to sample_and_test_uniform
    output: list of lists of shape (1, 3): for the single function type "mmdagg_update"
            and each number of bandwidths in [10, 100, 1000], the sum of the 500 values
            returned by sample_and_test_uniform divided by 500 (estimated power)
    """
    B1 = B2 = 500
    B3 = 100
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]
    function_types = ["mmdagg_update"]

    bandwidth_multipliers = None
    sample_sizes = [500, 2000]
    N_epochs = 500
    alpha = 0.05
    delta = 1  # passed in the position of the smoothness parameter s
    p_values = [2, 2]  # p + 1 = 3 perturbations for both d = 1 and d = 2
    perturbation_multipliers = [2.7, 7.3]
    R = [10, 100, 1000]
    r_min, r_max = 1, 1

    settings = [(e, k) for e in range(2) for k in range(len(kernel_types))]
    e, kernel_index = settings[j]

    d = e + 1
    perturbation_multiplier = perturbation_multipliers[e]
    kernel_type = kernel_types[kernel_index]
    n = m = sample_sizes[e]
    p = p_values[e]
    # The seed uses a fixed first component rather than the kernel index,
    # so the seeds do not depend on the selected kernel collection
    # (presumably so that all kernels are compared on the same samples).
    seed_kernel_index = 1

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", R)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for number_bandwidths in R:
            outcomes = []
            for i in range(N_epochs):
                # The seed does not depend on the number of bandwidths.
                seed = generate_seed(seed_kernel_index, e, 1, w, p, i)
                outcomes.append(sample_and_test_uniform(
                    function_type,
                    seed,
                    kernel_type,
                    approx_type,
                    m, n, d, p + 1, delta, perturbation_multiplier,
                    alpha, r_min, r_max, B1, B2, B3,
                    bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                ))
            power_w.append(sum(outcomes) / N_epochs)
        power.append(power_w)
    print(power)

    return power


In [33]:
# Run experiment9 with index 0 (d = 1, m = n = 500, kernel "laplace_gaussian")
# and save the single row of the returned power list to results/exp9_0.npy.
i = 0
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.56, 0.562, 0.56]]


In [34]:
# Run experiment9 with index 1 (d = 1, m = n = 500, kernel "laplace")
# and save the single row of the returned power list to results/exp9_1.npy.
i = 1
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.552, 0.548, 0.548]]


In [35]:
# Run experiment9 with index 2 (d = 1, m = n = 500, kernel "gaussian")
# and save the single row of the returned power list to results/exp9_2.npy.
i = 2
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.558, 0.562, 0.56]]


In [36]:
# Run experiment9 with index 3 (d = 1, m = n = 500, kernel "all")
# and save the single row of the returned power list to results/exp9_3.npy.
i = 3
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.56, 0.558, 0.558]]


In [37]:
# Run experiment9 with index 4 (d = 2, m = n = 2000, kernel "laplace_gaussian")
# and save the single row of the returned power list to results/exp9_4.npy.
i = 4
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace_gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.262, 0.26, 0.262]]


In [38]:
# Run experiment9 with index 5 (d = 2, m = n = 2000, kernel "laplace")
# and save the single row of the returned power list to results/exp9_5.npy.
i = 5
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 2000
kernel laplace
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.116, 0.114, 0.114]]


In [41]:
# Run experiment9 with index 6 (d = 2, m = n = 2000, kernel "gaussian")
# and save the single row of the returned power list to results/exp9_6.npy.
i = 6
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 2000
kernel gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.268, 0.264, 0.264]]


In [42]:
# Run experiment9 with index 7 (d = 2, m = n = 2000, kernel "all")
# and save the single row of the returned power list to results/exp9_7.npy.
i = 7
power = experiment9(i)
np.save("results/exp9_" + str(i) + ".npy", power[0])

sample size 2000
kernel all
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.252, 0.242, 0.244]]


# Experiment 10

In [19]:
def experiment10(j, approx_type="wild bootstrap"):
    """
    Run MMDAgg on MNIST (P against Q_list[3]) for several numbers of bandwidths.
    inputs: j: integer index into ["laplace_gaussian", "laplace", "gaussian", "all"]
               selecting the kernel collection
            approx_type: "wild bootstrap" or "permutation",
                         forwarded unchanged to sample_and_test_mnist
    output: list of lists of shape (1, 10): for the single function type "mmdagg_update"
            and each number of bandwidths in [3, 4, 5, 6, 7, 8, 9, 10, 100, 1000],
            the sum of the 500 values returned by sample_and_test_mnist
            divided by 500 (estimated power)
    """
    B1 = B2 = 500
    B3 = 100
    kernel_types = ["laplace_gaussian", "laplace", "gaussian", "all"]
    function_types = ["mmdagg_update"]

    bandwidth_multipliers = None
    n = m = 500
    N_epochs = 500
    alpha = 0.05
    R = [3, 4, 5, 6, 7, 8, 9, 10, 100, 1000]
    r_min, r_max = 1, 1

    kernel_type = kernel_types[j]
    q = 3
    Q = Q_list[q]  # Q4
    # The seed uses a fixed first component rather than the kernel index j,
    # so the seeds do not depend on the selected kernel collection
    # (presumably so that all kernels are compared on the same samples).
    seed_kernel_index = 1

    print("sample size", m)
    print("kernel", kernel_type)
    print("number bandwidths", R)
    print("B1", B1)
    print("B2", B2)
    print("B3", B3)
    print(" ")

    power = []
    for w, function_type in enumerate(function_types):
        power_w = []
        for number_bandwidths in R:
            outcomes = []
            for i in range(N_epochs):
                # The seed does not depend on the number of bandwidths.
                seed = generate_seed(seed_kernel_index, 2, 1, w, q, i)
                outcomes.append(sample_and_test_mnist(
                    P, Q, function_type, seed, kernel_type, approx_type, m, n,
                    alpha, r_min, r_max, B1, B2, B3, bandwidth_multipliers,
                    number_bandwidths=number_bandwidths,
                ))
            power_w.append(sum(outcomes) / N_epochs)
        power.append(power_w)
    print(power)

    return power

In [21]:
# Run experiment10 with index 0 (kernel "laplace_gaussian") and save the
# single row of the returned power list to results/exp10_0.npy.
# NOTE(review): the recorded output below lists three bandwidth counts
# ([10, 100, 1000]) while R in experiment10 has ten entries; the output
# appears to come from a different version of the function -- rerun to confirm.
i = 0
power = experiment10(i)
np.save("results/exp10_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace_gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.878, 0.878, 0.878]]


In [22]:
# Run experiment10 with index 1 (kernel "laplace") and save the
# single row of the returned power list to results/exp10_1.npy.
i = 1
power = experiment10(i)
np.save("results/exp10_" + str(i) + ".npy", power[0])

sample size 500
kernel laplace
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.876, 0.874, 0.874]]


In [23]:
# Run experiment10 with index 2 (kernel "gaussian") and save the
# single row of the returned power list to results/exp10_2.npy.
i = 2
power = experiment10(i)
np.save("results/exp10_" + str(i) + ".npy", power[0])

sample size 500
kernel gaussian
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.882, 0.88, 0.88]]


In [24]:
# Run experiment10 with index 3 (kernel "all") and save the
# single row of the returned power list to results/exp10_3.npy.
i = 3
power = experiment10(i)
np.save("results/exp10_" + str(i) + ".npy", power[0])

sample size 500
kernel all
number bandwidths [10, 100, 1000]
B1 500
B2 500
B3 100
 
[[0.868, 0.87, 0.87]]
