In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Append the repository root to sys.path so the project-local `lib` package can be imported
import os,sys #,inspect
rootname = "pub-2020-exploratory-analysis"
# Script-mode alternative to os.getcwd(), kept for reference (would need `inspect`):
#thispath = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
thispath = os.getcwd()
rootpos = thispath.find(rootname)
if rootpos < 0:
    # str.index would only report "substring not found"; explain the actual problem instead
    raise ValueError("Working directory " + repr(thispath) + " is not inside a "
                     + repr(rootname) + " directory tree; cannot locate the project root")
rootpath = os.path.join(thispath[:rootpos], rootname)
# Re-running this cell must not accumulate duplicate entries on sys.path
if rootpath not in sys.path:
    sys.path.append(rootpath)
print("Appended root directory", rootpath)

import lib.nullmodels.null3D as null3D
import lib.nullmodels.pidtest as pidtest

%load_ext autoreload
%autoreload 2

## Models
### Noisy Redundant Scenario

We want to check if white noise added to a purely redundant scenario results in correct identification of redundancy.

$$X = T + \nu_X$$
$$Y = T + \nu_Y$$
$$Z = T + \nu_Z$$

where $Y$ is the target of $X$ and $Z$, and

$$T \sim \mathcal{N}(0, 1)$$
$$\nu_X, \nu_Y, \nu_Z \sim \mathcal{N}(0, \sigma)$$

and $\sigma$ is a free parameter, denoting the Noise-To-Signal ratio. So the signal should be a mixture of redundant signal and white noise.

Since the signal is continuous, we bin it using different bin counts.

### Noisy Unique Scenario

Same as before, but

$$X = T + \nu_X$$
$$Y = T + \nu_Y$$
$$Z = \nu_Z$$

In [None]:
# Registry of the continuous null-model generators, keyed by a short scenario tag.
# The tag is also used as the file-name prefix when summary figures are saved.
funcDict = dict(
    red=null3D.gen_data_red_noisy,
    unq_xz=null3D.gen_data_unq_noisy,
    xor_z=null3D.gen_data_xor_noisy,
)

### Noisy Redundant Scenario - Discrete Case

It is important to test if false positives are caused by binning, or are an intrinsic property of the noise in the covariate. Here I propose a discretized noisy redundancy model. Instead of added noise, each variable has a random chance to produce the redundant outcome or a purely random outcome.

$$X = A_X \nu_X + (1 - A_X) T $$
$$Y = A_Y \nu_Y + (1 - A_Y) T $$
$$Z = A_Z \nu_Z + (1 - A_Z) T $$

where

$$T, \nu_X, \nu_Y, \nu_Z \sim Ber(0.5) $$
$$A_X \sim Ber(\alpha_X)$$
$$A_Y \sim Ber(\alpha_Y)$$
$$A_Z \sim Ber(\alpha_Z)$$

and $\alpha_X, \alpha_Y, \alpha_Z \in [0, 1]$ are free parameters.

So, $\alpha = 0$ means purely redundant signal, and $\alpha=1$ means purely noisy signal.

In [None]:
def bernoulli(n, p, rng=None):
    """
    Draw n independent Bernoulli(p) samples.

    :param n: number of samples to draw
    :param p: probability of drawing a 1
    :param rng: optional random source exposing ``uniform(low, high, size)``, e.g.
                ``np.random.default_rng(seed)``. When None, the global ``np.random``
                state is used, so ``np.random.seed`` keeps working as before.
    :return: integer array of length n with entries in {0, 1}
    """
    rng = np.random if rng is None else rng
    return (rng.uniform(0, 1, n) < p).astype(int)

def gen_discrete_random(nSample, alphaX=0.5, alphaY=0.5, alphaZ=0.5, rng=None):
    """
    Generate the discrete noisy-redundancy model: each of x, y, z independently copies
    a shared fair coin T, or is replaced by its own private fair coin.

    :param nSample: number of samples per variable
    :param alphaX: probability that a sample of x is private noise instead of T
    :param alphaY: probability that a sample of y is private noise instead of T
    :param alphaZ: probability that a sample of z is private noise instead of T
    :param rng: optional random source forwarded to ``bernoulli``; pass a seeded
                generator to obtain reproducible data
    :return: tuple (x, y, z) of integer arrays of length nSample with entries in {0, 1}
    """
    # Draw order is kept fixed (T, noises, selectors) so seeded runs stay comparable
    T = bernoulli(nSample, 0.5, rng)
    nuX = bernoulli(nSample, 0.5, rng)
    nuY = bernoulli(nSample, 0.5, rng)
    nuZ = bernoulli(nSample, 0.5, rng)
    aX = bernoulli(nSample, alphaX, rng)
    aY = bernoulli(nSample, alphaY, rng)
    aZ = bernoulli(nSample, alphaZ, rng)

    # Selector == 1 picks the private noise, selector == 0 picks the shared signal
    x = np.where(aX.astype(bool), nuX, T)
    y = np.where(aY.astype(bool), nuY, T)
    z = np.where(aZ.astype(bool), nuZ, T)
    return x,y,z

### Testing binning-dependence

In [None]:
# Labels of the PID components handed to every pidtest routine in this notebook
# (presumably: unique to source 1, unique to source 2, shared, synergistic - confirm in pidtest)
decompLabels = [
    'unq_s1',
    'unq_s2',
    'shd_s1_s2',
    'syn_s1_s2',
]

In [None]:
# One parameter triple per task. The triple is splatted into the continuous generators
# after the sample count, and scaled by 0.5 before being used as (alphaX, alphaY, alphaZ)
# of the discrete generator.
taskDict = dict(
    norand=np.array([0, 0, 1]),
    randx=np.array([1, 0, 1]),
    rand=np.array([1, 1, 1]),
)

for taskName, params in taskDict.items():
    print(taskName)
    rezDict = {}

    # Continuous generators: every model is binned with 2..5 bins
    for funcName, func in funcDict.items():
        def sampleContinuous():
            return func(10000, *params)

        for nBins in range(2, 6):
            def pidBinned(x, y, z):
                return pidtest.pid_bin(x, y, z, nBins)

            # Plain run first, then the run with haveShuffle=True (same order as the draws)
            rezDict[(funcName, nBins)] = (
                pidtest.run_tests(sampleContinuous, pidBinned, decompLabels, nTest=100),
                pidtest.run_tests(sampleContinuous, pidBinned, decompLabels, nTest=100, haveShuffle=True),
            )

    # Discrete generator: data is already binary, so it is passed to pidtest.pid unbinned
    def sampleDiscrete():
        return gen_discrete_random(10000, *(0.5*params))

    def pidDiscrete(x, y, z):
        return pidtest.pid(np.array([x,y,z]))

    rezDict[('red_discr', 2)] = (
        pidtest.run_tests(sampleDiscrete, pidDiscrete, decompLabels, nTest=100),
        pidtest.run_tests(sampleDiscrete, pidDiscrete, decompLabels, nTest=100, haveShuffle=True),
    )

    # One summary figure per (model, bin count) pair, saved and then displayed
    for (funcName, nBin), (rezDF, rezDFsh) in rezDict.items():
        print((funcName, nBin))

        pidtest.plot_test_summary(rezDF, rezDFsh, suptitle=funcName, haveEff=False)
        figName = ''.join([funcName, '_pid_nbin', str(nBin), '_summary_', taskName, '.png'])
        plt.savefig(figName, dpi=200)
        plt.show()

### Effect of variance

Continuous

In [None]:
def pid_bin(x, y, z):
    """Run pidtest.pid_bin on (x, y, z) with the bin count fixed at 4."""
    return pidtest.pid_bin(x, y, z, 4)

def pid_discr(x, y, z):
    """Run pidtest.pid on x, y, z stacked into a single array (no binning)."""
    return pidtest.pid(np.array([x, y, z]))

In [None]:
# Do continuous tests
for funcName, func in funcDict.items():
    print(funcName)

    def funcEff(alpha):
        # The swept parameter sets all three sig* arguments at once
        return func(n=1000, sigX=alpha, sigY=alpha, sigZ=alpha)

    pidtest.run_plot_param_effect(
        funcEff, pid_bin, decompLabels,
        nTest=200,
        alphaRange=(0, 2),
    )
    plt.show()

In [None]:
nSample = 10000
for funcName, func in funcDict.items():
    print(funcName)

    def funcEff(alpha):
        # The swept parameter sets all three sig* arguments at once
        return func(n=nSample, sigX=alpha, sigY=alpha, sigZ=alpha)

    pidtest.run_plot_param_effect_test(
        funcEff, pid_bin, decompLabels,
        nStep=10,
        nTest=400,
        alphaRange=(0, 2),
    )

    # File name records the model tag and the sample count
    figName = ''.join([funcName, '_pid_nBin4_vareff_n', str(nSample), '.png'])
    plt.savefig(figName, dpi=200)
    plt.show()

In [None]:
nSample = 10000

def func(alpha):
    # XOR model only, with the same value passed to all three sig* arguments
    return null3D.gen_data_xor_noisy(n=nSample, sigX=alpha, sigY=alpha, sigZ=alpha)

# NOTE(review): the positional 0 is presumably the single parameter value under test - confirm in pidtest
pidtest.run_plot_param_effect_test_single(func, pid_bin, decompLabels, 0, nTest=400)

Discrete

In [None]:
# Do discrete tests
def funcEff(alpha):
    # One shared alpha for all three variables
    return gen_discrete_random(nSample=1000, alphaX=alpha, alphaY=alpha, alphaZ=alpha)

pidtest.run_plot_param_effect(
    funcEff, pid_discr, decompLabels,
    nTest=1000,
    alphaRange=(0, 1),
)

In [None]:
nSample = 10000

def funcEff(alpha):
    # One shared alpha for all three variables
    return gen_discrete_random(nSample=nSample, alphaX=alpha, alphaY=alpha, alphaZ=alpha)

pidtest.run_plot_param_effect_test(
    funcEff, pid_discr, decompLabels,
    nStep=10,
    nTest=400,
    alphaRange=(0, 1),
)

figName = ''.join(['redDiscr_pid_vareff_n', str(nSample), '.png'])
plt.savefig(figName, dpi=200)
plt.show()

### Effect of number of samples
Continuous

In [None]:
sig = 1.0
for funcName, func in funcDict.items():
    print(funcName)

    def funcEff(n):
        # The sig* arguments are held at `sig`; only the sample count varies
        return func(n=n, sigX=sig, sigY=sig, sigZ=sig)

    pidtest.run_plot_data_effect_test(
        funcEff, pid_bin, decompLabels,
        nStep=10,
        nTest=400,
    )

    # File name records the model tag and the fixed sig value
    figName = ''.join([funcName, '_pid_nBin4_nEff_sig', str(sig), '.png'])
    plt.savefig(figName, dpi=200)
    plt.show()

Discrete

In [None]:
alpha = 0.5

def funcEff(n):
    # All three alphas are held at `alpha`; only the sample count varies
    return gen_discrete_random(nSample=n, alphaX=alpha, alphaY=alpha, alphaZ=alpha)

pidtest.run_plot_data_effect_test(
    funcEff, pid_discr, decompLabels,
    nStep=10,
    nTest=400,
)

figName = ''.join(['redDiscr_pid_nEff_alpha', str(alpha), '.png'])
plt.savefig(figName, dpi=200)
plt.show()

### Test relationship of synergy and redundancy for fixed data size

#### 1. Finding max synergy parameters - GridSearch3D

In [None]:
# Repeat the 3D grid search for 'syn_s1_s2' on the continuous redundant model at several data sizes
for nSample in (1000, 3000, 5000, 7000, 10000):
    print(nSample)
    pidtest.run_gridsearch_3D(
        null3D.gen_data_red_noisy, pid_bin, 'syn_s1_s2',
        varLimits=(0, 2),
        nSample=nSample,
        nStep=20,
    )

In [None]:
# Repeat the 3D grid search for 'syn_s1_s2' on the discrete model at several data sizes
for nSample in (1000, 3000, 5000, 7000, 10000):
    print(nSample)
    pidtest.run_gridsearch_3D(
        gen_discrete_random, pid_discr, 'syn_s1_s2',
        varLimits=(0, 1),
        nSample=nSample,
        nStep=20,
    )

#### 2. Finding max synergy parameters - GridSearch1D

The previous analysis found that in all cases maximal synergy is located on the diagonal $\alpha_X = \alpha_Y$.

In [None]:
# 1D scan of 'shd_s1_s2' and 'syn_s1_s2' on the continuous redundant model at several data sizes
for nSample in (1000, 3000, 5000, 7000, 10000):
    print(nSample)
    pidtest.run_plot_1D_scan(
        null3D.gen_data_red_noisy, pid_bin, 'shd_s1_s2', 'syn_s1_s2',
        varLimits=(0, 2),
        nSample=nSample,
        nStep=100,
        nTest=20,
    )

In [None]:
# 1D scan of 'shd_s1_s2' and 'syn_s1_s2' on the discrete model at several data sizes
for nSample in (1000, 3000, 5000, 7000, 10000):
    print(nSample)
    pidtest.run_plot_1D_scan(
        gen_discrete_random, pid_discr, 'shd_s1_s2', 'syn_s1_s2',
        varLimits=(0, 1),
        nSample=nSample,
        nStep=100,
        nTest=20,
    )

#### 3. Determining Synergy-Redundancy Relationship

In [None]:
# Scatter exploration of 'shd_s1_s2' against 'syn_s1_s2' for the continuous redundant model
# NOTE(review): the positional 3 is presumably the number of model parameters scanned - confirm in pidtest
pidtest.run_plot_scatter_explore(
    null3D.gen_data_red_noisy, pid_bin, 'shd_s1_s2', 'syn_s1_s2', 3,
    varLimits=(0, 0.5),
    nSample=1000,
    nTestDim=20,
)

In [None]:
# Scatter exploration of 'shd_s1_s2' against 'syn_s1_s2' for the discrete model
# NOTE(review): the positional 3 is presumably the number of model parameters scanned - confirm in pidtest
pidtest.run_plot_scatter_explore(
    gen_discrete_random, pid_discr, 'shd_s1_s2', 'syn_s1_s2', 3,
    varLimits=(0, 1),
    nSample=1000,
    nTestDim=20,
)

### Test relationship of unique and redundancy for fixed data size

In [None]:
# 1D scan of 'shd_s1_s2' and 'unq_s1' on the continuous redundant model at several data sizes
for nSample in (1000, 3000, 5000, 7000, 10000):
    print(nSample)
    pidtest.run_plot_1D_scan(
        null3D.gen_data_red_noisy, pid_bin, 'shd_s1_s2', 'unq_s1',
        varLimits=(0, 1),
        nSample=nSample,
        nStep=10,
        nTest=200,
    )