In [1]:
# Import modules.

import chaospy as cp
import numpy as np
import pandas as pd
from numpy.testing import assert_array_almost_equal as aaae

from shapley_econsa import _r_condmvn
from shapley_econsa import get_shapley


## Tests for linear case with two Gaussian inputs

### Exact

In [109]:
def test_get_shapley_exact_linear():
    def linear_model(x):
        beta = np.array([[beta_1], [beta_2]])
        return x.dot(beta)

    def x_all(n):
        return cp.MvNormal(mean, cov).sample(n)

    def x_cond(n, subset_j, subsetj_conditional, xjc):
        if subsetj_conditional is None:
            cov_int = np.array(cov).take(subset_j, axis=1)[subset_j]
            distribution = cp.MvNormal(mean[subset_j], cov_int)
            return distribution.sample(n)
        else:
            return _r_condmvn(
                n,
                mean=mean,
                cov=cov,
                dependent_ind=subset_j,
                given_ind=subsetj_conditional,
                x_given=xjc,
            )

    np.random.seed(123)
    n_inputs = 2
    mean = np.zeros(n_inputs)
    var_1 = 16
    var_2 = 4
    
    # rho is the correlation coefficient, and thus, in range [-1,1].
    rho = 0.3
    covariance = rho * np.sqrt(var_1) * np.sqrt(var_2)
    beta_1 = 1.3
    beta_2 = 1.5
    beta = (beta_1, beta_2)

    component_1 = beta_1**2 * var_1
    component_2 = beta_2**2 * var_2
    var_y = component_1 + 2 * covariance * beta_1 * beta_2 + component_2
    share = 0.5 * (rho**2)
    true_shapley_1 = (component_1 * (1 - share) + covariance * beta_1 * beta_2 + component_2 * share)/var_y
    true_shapley_2 = (component_2 * (1 - share) + covariance * beta_1 * beta_2 + component_1 * share)/var_y

    cov = np.array(
        [[var_1, covariance],
        [covariance, var_2]]
        )

    method = "exact"
    n_perms = None
    n_output = 10 ** 4
    n_outer = 10 ** 3
    n_inner = 10 ** 2

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [true_shapley_1, true_shapley_2],
            [0, 0],
            [true_shapley_1, true_shapley_2],
            [true_shapley_1, true_shapley_2],
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        linear_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,
    )

    aaae(calculated["Shapley effects"], expected["Shapley effects"], 4)

In [110]:
test_get_shapley_exact_linear()

AssertionError: 
Arrays are not almost equal to 4 decimals

Mismatched elements: 2 / 2 (100%)
Max absolute difference: 0.00073519
Max relative difference: 0.00230321
 x: array([0.6815, 0.3185])
 y: array([0.6808, 0.3192])

### Random

In [None]:
def test_get_shapley_random_linear():
    def linear_model(x):
        beta = np.array([[beta_1], [beta_2]])
        return x.dot(beta)

    def x_all(n):
        return cp.MvNormal(mean, cov).sample(n)

    def x_cond(n, subset_j, subsetj_conditional, xjc):
        if subsetj_conditional is None:
            cov_int = np.array(cov).take(subset_j, axis=1)[subset_j]
            distribution = cp.MvNormal(mean[subset_j], cov_int)
            return distribution.sample(n)
        else:
            return _r_condmvn(
                n,
                mean=mean,
                cov=cov,
                dependent_ind=subset_j,
                given_ind=subsetj_conditional,
                x_given=xjc,
            )

    np.random.seed(123)
    n_inputs = 2
    mean = np.zeros(n_inputs)
    var_1 = 16
    var_2 = 4
    
    # rho is the correlation coefficient, and thus, in range [-1,1].
    rho = 0.3
    covariance = rho * np.sqrt(var_1) * np.sqrt(var_2)
    beta_1 = 1.3
    beta_2 = 1.5
    beta = (beta_1, beta_2)

    component_1 = beta_1**2 * var_1
    component_2 = beta_2**2 * var_2
    var_y = component_1 + 2 * covariance * beta_1 * beta_2 + component_2
    share = 0.5 * (rho**2)
    true_shapley_1 = (component_1 * (1 - share) + covariance * beta_1 * beta_2 + component_2 * share)/var_y
    true_shapley_2 = (component_2 * (1 - share) + covariance * beta_1 * beta_2 + component_1 * share)/var_y

    cov = np.array(
        [[var_1, covariance],
        [covariance, var_2]]
        )

    method = "exact"
    n_perms = None
    n_output = 10 ** 4
    n_outer = 10 ** 3
    n_inner = 10 ** 2

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [true_shapley_1, true_shapley_2],
            [0, 0],
            [true_shapley_1, true_shapley_2],
            [true_shapley_1, true_shapley_2],
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        linear_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,
    )

    aaae(calculated["Shapley effects"], expected["Shapley effects"], 4)

## Test functions coded by Linda M.

In [96]:
def test_get_shapley_exact():
    def gaussian_model(x):
        return np.sum(x, 1)

    def x_all(n):
        distribution = cp.MvNormal(mean, cov)
        return distribution.sample(n)

    def x_cond(n, subset_j, subsetj_conditional, xjc):
        if subsetj_conditional is None:
            cov_int = np.array(cov)
            cov_int = cov_int.take(subset_j, axis=1)
            cov_int = cov_int[subset_j]
            distribution = cp.MvNormal(mean[subset_j], cov_int)
            return distribution.sample(n)
        else:
            return _r_condmvn(
                n,
                mean=mean,
                cov=cov,
                dependent_ind=subset_j,
                given_ind=subsetj_conditional,
                x_given=xjc,
            )

    np.random.seed(123)
    n_inputs = 3
    mean = np.zeros(3)
    cov = np.array([[1.0, 0, 0], [0, 1.0, 1.8], [0, 1.8, 4.0]])
    method = "exact"
    n_perms = None
    n_output = 10 ** 4
    n_outer = 10 ** 3
    n_inner = 10 ** 2

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [0.101309, 0.418989, 0.479701],
            [0.00241549, 0.16297, 0.163071],
            [0.096575, 0.0995681, 0.160083],
            [0.106044, 0.73841, 0.79932],
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        gaussian_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,
    )

    aaae(calculated, expected)


def test_get_shapley_random():
    def gaussian_model(x):
        return np.sum(x, 1)

    def x_all(n):
        distribution = cp.MvNormal(mean, cov)
        return distribution.sample(n)

    def x_cond(n, subset_j, subsetj_conditional, xjc):
        if subsetj_conditional is None:
            cov_int = np.array(cov)
            cov_int = cov_int.take(subset_j, axis=1)
            cov_int = cov_int[subset_j]
            distribution = cp.MvNormal(mean[subset_j], cov_int)
            return distribution.sample(n)
        else:
            return _r_condmvn(
                n,
                mean=mean,
                cov=cov,
                dependent_ind=subset_j,
                given_ind=subsetj_conditional,
                x_given=xjc,
            )

    np.random.seed(123)
    n_inputs = 3
    mean = np.zeros(3)
    cov = np.array([[1.0, 0, 0], [0, 1.0, 1.8], [0, 1.8, 4.0]])
    method = "random"
    n_perms = 30000
    n_output = 10 ** 4
    n_outer = 1
    n_inner = 3

    col = ["X" + str(i) for i in np.arange(n_inputs) + 1]
    names = ["Shapley effects", "std. errors", "CI_min", "CI_max"]

    expected = pd.DataFrame(
        data=[
            [0.107543, 0.414763, 0.477694],
            [0.00307984, 0.0032332, 0.0031896],
            [0.101507, 0.408426, 0.471442],
            [0.11358, 0.4211, 0.483945],
        ],
        index=names,
        columns=col,
    ).T

    calculated = get_shapley(
        method,
        gaussian_model,
        x_all,
        x_cond,
        n_perms,
        n_inputs,
        n_output,
        n_outer,
        n_inner,
    )

    aaae(calculated, expected)


In [97]:
test_get_shapley_exact()

In [98]:
test_get_shapley_random()

## Calculate true Shapley values (as in test function).

In [75]:
    var_1 = 1.5
    var_2 = 2.5
    
    rho = 0.3
    covariance = rho * np.sqrt(var_1) * np.sqrt(var_2)
    beta_1 = 1.3
    beta_2 = 1.5
    beta = (beta_1, beta_2)

    component_1 = beta_1**2 * var_1
    component_2 = beta_2**2 * var_2
    var_y = component_1 + 2 * covariance * beta_1 * beta_2 + component_2
    share = 0.5 * (rho**2)
    true_shapley_1 = (component_1 * (1 - share) + covariance * beta_1 * beta_2 + component_2 * share)/var_y
    true_shapley_2 = (component_2 * (1 - share) + covariance * beta_1 * beta_2 + component_1 * share)/var_y
print(true_shapley_1, '\n', true_shapley_2)

0.3651456842665369 
 0.6348543157334631


## Look for rounding issues.

In [77]:
    var_1 = 1.5
    var_2 = 2.5
    
    rho = 0.3
    #covariance = 
    beta_1 = 1.3
    beta_2 = 1.5
    #beta = (beta_1, beta_2)

    component_1 = (beta_1**2 * var_1)
    component_2 = (beta_2**2 * var_2)
    var_y = ((beta_1**2 * var_1) + 2 * (rho * np.sqrt(var_1) * np.sqrt(var_2)) * beta_1 * beta_2 + (beta_2**2 * var_2))
    share = 0.5 * (rho**2)
    true_shapley_1 = ((beta_1**2 * var_1) * (1 - share) + (rho * np.sqrt(var_1) * np.sqrt(var_2)) * beta_1 * beta_2 + (beta_2**2 * var_2) * share)/((beta_1**2 * var_1) + 2 * (rho * np.sqrt(var_1) * np.sqrt(var_2)) * beta_1 * beta_2 + (beta_2**2 * var_2))
    true_shapley_2 = ((beta_2**2 * var_2) * (1 - share) + (rho * np.sqrt(var_1) * np.sqrt(var_2)) * beta_1 * beta_2 + (beta_1**2 * var_1) * share)/((beta_1**2 * var_1) + 2 * (rho * np.sqrt(var_1) * np.sqrt(var_2)) * beta_1 * beta_2 + (beta_2**2 * var_2))
print(true_shapley_1, '\n', true_shapley_2)

0.3651456842665369 
 0.6348543157334631
