## Tests for linear case with three Gaussian inputs

In [5]:
import numpy as np
import pandas as pd
import chaospy as cp
import seaborn as sns
from numpy.testing import assert_array_almost_equal as aaae

from econsa_shapley import get_shapley
from econsa_shapley import _r_condmvn
from simulation_of_variance import simulate_variance

In [4]:
def test_get_shapley_linear_three_inputs():
    def linear_model(x):
        beta = np.array([[beta_1], [beta_2], [beta_3]])
        return x.dot(beta)

    def x_all(n):
        return cp.MvNormal(mean, cov).sample(n)

    def x_cond(n, subset_j, subsetj_conditional, xjc):
        if subsetj_conditional is None:
            cov_int = np.array(cov).take(subset_j, axis=1)[subset_j]
            distribution = cp.MvNormal(mean[subset_j], cov_int)
            return distribution.sample(n)
        else:
            return _r_condmvn(
                n,
                mean=mean,
                cov=cov,
                dependent_ind=subset_j,
                given_ind=subsetj_conditional,
                x_given=xjc,
            )

    np.random.seed(123)
    n_inputs = 3
    mean = np.zeros(n_inputs)
    var_1 = 16
    var_2 = 4
    var_3 = 9
    
    # rho is the correlation coefficient, and thus, in range [-1,1]. Correlation between X2 and X3 only, i.e. rho = Corr[X2, X3].
    rho = 0.3
    covariance = rho * np.sqrt(var_2) * np.sqrt(var_3)
    beta_1 = 1.3
    beta_2 = 1.5
    beta_3 = 2.5
    #beta = (beta_1, beta_2, beta)

    component_1 = beta_1 ** 2 * var_1
    component_2 = beta_2 ** 2 * var_2
    component_3 = beta_3 ** 2 * var_3
    var_y = component_1 + component_2 + component_3 + 2 * covariance * beta_2 * beta_3
    share = 0.5 * (rho**2)
    true_shapley_1 = (component_1)/var_y
    true_shapley_2 = (component_2 + covariance * beta_2 * beta_3 + share * (component_3 - component_2))/var_y
    true_shapley_3 = (component_3 + covariance * beta_2 * beta_3 + share * (component_2 - component_3))/var_y

    cov = np.array(
        [[var_1, 0, 0],
        [0, var_2, covariance],
        [0, covariance, var_3]]
        )

    method = "exact"
    n_perms = None
    n_output = 10 ** 7
    n_outer = 10 ** 4
    n_inner = 10 ** 2

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [true_shapley_1, true_shapley_2, true_shapley_3]
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        linear_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,
    )

    aaae(calculated["Shapley effects"], expected["Shapley effects"], 6)
test_get_shapley_linear_three_inputs()

AssertionError: 
Arrays are not almost equal to 6 decimals

Mismatched elements: 3 / 3 (100%)
Max absolute difference: 0.00033723
Max relative difference: 0.00155642
 x: array([0.255526, 0.168716, 0.575758])
 y: array([0.255601, 0.168979, 0.575421])

In [15]:
%%time
# Check whether model variance is precisely estimated.
def linear_model(x):
    beta = np.array([[beta_1], [beta_2], [beta_3]])
    return x.dot(beta)
beta_1 = 1.3
beta_2 = 1.5
beta_3 = 2.5

n_inputs = 3
mean = np.zeros(n_inputs)
var_1 = 16
var_2 = 4
var_3 = 9
rho = 0.3
covariance = rho * np.sqrt(var_2) * np.sqrt(var_3)

cov = np.array(
    [[var_1, 0, 0],
    [0, var_2, covariance],
    [0, covariance, var_3]]
    )

estimated_variance = simulate_variance(model=linear_model,
                                       cov=cov,
                                       mean=mean,
                                       n_sim=10 ** 7)

Wall time: 3.16 s


In [16]:
# Analytical variance.
component_1 = beta_1 ** 2 * var_1
component_2 = beta_2 ** 2 * var_2
component_3 = beta_3 ** 2 * var_3
var_y = component_1 + component_2 + component_3 + 2 * covariance * beta_2 * beta_3
print(estimated_variance, var_y)

105.8114702432287 105.79


In [None]:
def x_all(n):
    return cp.MvNormal(mean, cov).sample(n)

def x_cond(n, subset_j, subsetj_conditional, xjc):
    if subsetj_conditional is None:
        cov_int = np.array(cov).take(subset_j, axis=1)[subset_j]
        distribution = cp.MvNormal(mean[subset_j], cov_int)
        return distribution.sample(n)
    else:
        return _r_condmvn(
            n,
            mean=mean,
            cov=cov,
            dependent_ind=subset_j,
            given_ind=subsetj_conditional,
            x_given=xjc,
        )

    np.random.seed(123)

    var_1 = 16
    var_2 = 4
    var_3 = 9
    
    # rho is the correlation coefficient, and thus, in range [-1,1]. Correlation between X2 and X3 only, i.e. rho = Corr[X2, X3].
    rho = 0.3
    covariance = rho * np.sqrt(var_2) * np.sqrt(var_3)
    beta_1 = 1.3
    beta_2 = 1.5
    beta_3 = 2.5
    #beta = (beta_1, beta_2, beta)

    component_1 = beta_1 ** 2 * var_1
    component_2 = beta_2 ** 2 * var_2
    component_3 = beta_3 ** 2 * var_3
    var_y = component_1 + component_2 + component_3 + 2 * covariance * beta_2 * beta_3
    share = 0.5 * (rho**2)
    true_shapley_1 = (component_1)/var_y
    true_shapley_2 = (component_2 + covariance * beta_2 * beta_3 + share * (component_3 - component_2))/var_y
    true_shapley_3 = (component_3 + covariance * beta_2 * beta_3 + share * (component_2 - component_3))/var_y



    method = "exact"
    n_perms = None
    n_output = 10 ** 7
    n_outer = 10 ** 4
    n_inner = 10 ** 2

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [true_shapley_1, true_shapley_2, true_shapley_3]
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        linear_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,