# Implementation of Probabilistic Containment for Normal Distributions

In this notebook, we'll implement the probabilistic definition of containment for band depth proposed by Yohei.

In [1]:
import statdepth
from scipy.integrate import quad
import numpy as np
import pandas as pd 
import scipy.stats as stats
from tqdm import tqdm

from numpy import exp
from scipy.special import erf
from scipy.integrate import quad
from scipy.special import binom

import bigfloat
from bigfloat import BigFloat

Now we'll define our probability, $D(f \mid f_1,...,f_n) = \binom{n}{2}^{-1}\sum_{1 \leq i < j \leq n} Pr(f_i \leq f \leq f_j)$. We'll define our probabilistic band depth function to take two $1\times n$ arrays for the mean/std, respectively. Then each $f_k = N(\mu_k, \sigma^2_k)$ for $k=1,...,n$.  

In [2]:
from statdepth.testing import generate_noisy_univariate

df = generate_noisy_univariate(n=2)

We'll use random samples from $N(0, 1)$ to generate our means and stds for now. 

In [49]:
N = 20
means, stds = np.random.randint(1, 15, N), np.random.randint(1, 15, N)

means, stds

(array([ 6,  7, 10, 11,  5,  8,  8,  5, 11,  1, 11,  4,  6, 11,  2, 11,  7,
        13,  4,  3]),
 array([ 2,  2,  1, 14, 11,  6, 11, 12,  5, 13,  2,  1,  2,  9, 12,  7, 13,
         6, 11,  8]))

We'll first write a function that calculates the depth for a single normal distribution with respect to a set of normals

In [63]:
from statdepth.depth.calculations._helper import _subsequences
from scipy.stats import norm

def f_normal(z: float, parameters):
    mu_i, sigma_i, mu_j, sigma_j, mu, sigma = parameters
    
    return exp(-(mu-z)**2 / (2*sigma))*\
            (1+erf((mu_j-z)/(np.sqrt(2)*np.sqrt(sigma_j))))*\
            (1-erf((mu_i-z)/(np.sqrt(2)*np.sqrt(sigma_i))))


def _normal_depth(means, stds, curr):
    n = len(means)
    cols = list(range(n))
    cols.remove(curr)
    S_nj = 0
    subseq = _subsequences(cols, 2)

    for sequence in subseq:
        i, j = sequence
        
        parameters = [
            means[i], stds[i], 
            means[j], stds[j], 
            means[curr], stds[curr]
        ]
        
        integral = quad(lambda x: f_normal(x, parameters), -np.inf, np.inf)[0]
        S_nj += (1/(4*np.sqrt(2*np.pi*stds[i]))) * integral
        
    return S_nj / binom(n, 2)

As a sanity check, we'll numerically approximate the exact triple integral with Monte Carlo simulation to make sure our simplified integral is correct

In [None]:
from scipy.integral import tplquad

def npdf(x, mu, sigma):
    return norm(loc=mu, scale=sigma).pdf(x)

def f_normal_mcmc(x: float, y: float, z: float, parameters: list):
    mu_i, sigma_i, mu_j, sigma_j, mu, sigma = parameters
    
    return npdf(x, mu_i, sigma_i)*npdf(y, mu_j, sigma_j)*npdf(z, mu, sigma)

parameters = [5,4,2,1,2,1]

tplquad(lambda x, y, z: f_normal_mcmc(x, y, z, parameters)) 

And finally our function that calculates depth for all functions in the set

In [53]:
def probabilistic_normal_depth(means, stds):
    if len(means) != len(stds):
        raise ValueError('Error, len(means) must equal len(stds)')

    depths = []
    for k in tqdm(range(len(means))):
        mc = np.delete(means, [k])
        stdsc = np.delete(stds, [k])
        
        depths.append(    
            _normal_depth(means, stds, k)
        )
    
    return pd.Series(index=range(len(means)), data=depths)

We can now test and plot it to see if things look correct at first glance

In [54]:
depths = probabilistic_normal_depth(means, stds)

df = pd.DataFrame()

for d, c in zip([means, stds, 1- depths], ['means', 'stds', 'depths']):
    df[c] = d
    
df

100%|██████████| 20/20 [00:31<00:00,  1.59s/it]


Unnamed: 0,means,stds,depths
0,6,2,0.896211
1,7,2,0.871952
2,10,1,0.902183
3,11,14,0.736546
4,5,11,0.792433
5,8,6,0.768478
6,8,11,0.719982
7,5,12,0.78185
8,11,5,0.823199
9,1,13,0.916028


Let's take a look at the depths and then visualize them

In [61]:
import plotly.graph_objects as go

def plot_data(N, df):
    d = []

    largest = df.nlargest(N, columns='depths')
    
    for i, (mu, sigma) in enumerate(zip(largest['means'], largest['stds'])):
        x = np.linspace(mu - 3*sigma, mu+3*sigma, 100)
        d.append(
            go.Scatter(
                x=x, 
                y=stats.norm.pdf(x, mu, sigma), 
                mode='lines', 
                line=dict(color='red', width=1)
            )
        )
    
    df = df.drop(largest.index)
    for i, (mu, sigma) in enumerate(zip(df['means'], df['stds'])):
        x = np.linspace(mu - 3*sigma, mu+3*sigma, 100)
        d.append(
            go.Scatter(
                x=x, 
                y=stats.norm.pdf(x, mu, sigma), 
                mode='lines', 
                line=dict(color='blue', width=1)
        ))
    
    go.Figure(
        data=d,
        layout=go.Layout(title=f'Normal Distributions ({N} deepest colored in red)', showlegend=False)
    ).show()

In [60]:
plot_data(5, df)

Let's now implement the probabilistic Poisson distribution. Suppose $f_1,...,f_n$ are independent and follow a Poisson distribution with distinct parameters. Consider $f \sim Poisson(\lambda) \neq f_i \sim Poisson(\lambda_i) \neq f_j \sim Poisson(\lambda_j)$.

In [9]:
from scipy.special import gamma, gammaincc, factorial
from numpy import exp

def gammainc(a, x):
    return gamma(a) * gammaincc(a, x)

def f_poisson(lambda_i: float, lambda_f: float, lambda_j: float, lim=100) -> float:
    '''
    Parameters:
    
    lambda_i: 
        mean of f_i (lower function)
    lambda_f: 
        mean of f (function to calculate probabilistic containment)
    lambda_j: 
        mean of f_j (upper function)
    lim=10000: Upper bound of discrete infinite sum
    
    Returns:
    
    float: Probability
    '''
    s = 0
    
    for z in range(1, lim):
        num = BigFloat(lambda_f**z*(gamma(z)-gammainc(z, lambda_j))*gammainc(1+z, lambda_i))
        denom = BigFloat(factorial(z)*gamma(z)*gamma(1+z))
        
        print(f'numerator is {num} \n\n and denominator is {denom}')
        s += BigFloat(num / denom)
        
    return exp(-lambda_f) * s

In [10]:
f_poisson(10, 1, 4)

numerator is 0.000490252411477194271864632479918100216 

 and denominator is 1.00000000000000000000000000000000000
numerator is 0.00503155891236997838966704676977315103 

 and denominator is 4.00000000000000000000000000000000000
numerator is 0.0945000341234263768752654755189723801 

 and denominator is 72.0000000000000000000000000000000000
numerator is 2.38644314714750294115219730883836746 

 and denominator is 3456.00000000000000000000000000000000
numerator is 71.7115149751412701562003348954021931 

 and denominator is 345600.000000000000000000000000000000
numerator is 2416.04093401910768079687841236591339 

 and denominator is 62208000.0000000000000000000000000000
numerator is 88443.6359273410052992403507232666016 

 and denominator is 18289152000.0000000000000000000000000
numerator is 3458331.02567039011046290397644042969 

 and denominator is 8193540096000.00000000000000000000000
numerator is 143137508.938522458076477050781250000 

 and denominator is 5309413982208000.0000000000000

BigFloat.exact('0.00147254315870933713622728428215942855', precision=113)

Finally, let's implement the binomial distribution version

And now a generalized version for pdfs