# Testing Notebook 03

This notebook is where I'm developing the containment methods for multivariate functions in $\mathbb{R}^n$.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

from statdepth.depth._containment import _is_in_simplex
from statdepth.depth._depthcalculations import _subsequences

First, create some multidimensional data: seven functions, each observed at 30 index values with three coordinates (columns `A`, `B`, `C`).

In [11]:
# Seven sample functions; each one is a 30x3 frame of random integers drawn from {0, ..., 4}
data = [
    pd.DataFrame(np.random.randint(low=0, high=5, size=(30, 3)), columns=['A', 'B', 'C'])
    for _ in range(7)
]

In [12]:
from scipy.special import binom
from statdepth.depth._containment import _is_in_simplex
from statdepth.depth._depthcalculations import _subsequences
from typing import List

def _simplex_containment(data: List[pd.DataFrame], curve: pd.DataFrame, J=2, relax=False):
    n = len(data)
    l, d = data[0].shape
    
    # Iterate over our subsequences of functions to form simplex with d+1 vertices
    containment = 0

    # For each time index, check containment 
    for idx in curve.index:
        containment += _is_in_simplex(simplex_points=np.array([df.loc[idx, :] for df in data]), 
                                point=np.array(curve.loc[idx, :]))
    
    # If relaxation, return proportion of containment, else return integer divion so that we 
    # only get 1 if all rows are contained
    return containment / l if relax else containment // l
    

In [13]:
def _simplex_depth(data: list, curve: pd.DataFrame, J=2, relax=False):
    """Simplex depth of `curve` with respect to the functions in `data`.

    Every subsequence of `d + 1` functions (where `d` is the number of columns of
    `data[0]`) is used as the vertex set of a simplex, and the containment of
    `curve` in that simplex is accumulated via `_simplex_containment`. The total
    is normalised by `binom(n, d + 1)`, with `n = len(data)`.

    Parameters
    ----------
    data : list of DataFrames (the reference functions).
    curve : DataFrame whose depth is being computed.
    J : forwarded to `_simplex_containment`.
    relax : forwarded to `_simplex_containment`; selects proportional rather than
        all-or-nothing containment.

    Returns
    -------
    The accumulated containment divided by `binom(n, d + 1)`.
    """
    _, d = data[0].shape
    n = len(data)
    vertex_count = d + 1

    S_nj = 0
    for seq in _subsequences(list(range(n)), vertex_count):
        # Only the functions picked for this subsequence form the simplex.
        # (The previous version built this list but then passed the full `data`.)
        cdata = [data[i] for i in seq]
        S_nj += _simplex_containment(data=cdata, curve=curve, J=J, relax=relax)

    return S_nj / binom(n, vertex_count)

def simplexdepth(data: list, J=2, relax=False):
    """Compute the simplex depth of every function in `data` against all the others.

    Each element of `data` is used in turn as the curve; the reference set is
    every other element (compared by identity, not by value).

    Parameters
    ----------
    data : list of DataFrames.
    J, relax : forwarded unchanged to `_simplex_depth`.

    Returns
    -------
    pd.Series of depths indexed by the position of each function in `data`.
    """
    depths = [
        _simplex_depth(
            data=[other for other in data if other is not current],
            curve=current,
            J=J,
            relax=relax,
        )
        for current in data
    ]
    positions = list(range(len(data)))
    return pd.Series(index=positions, data=depths)

In [14]:
%%timeit
simplexdepth(data)

10 s ± 450 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
def pointwise_depth(data: pd.DataFrame, J=2, containment='simplex'):
    """Simplex-based depth for each row of `data`.

    For each index label, the remaining labels are passed to `_subsequences`
    with length `d + 1` (`d` = number of columns). Each returned subsequence
    selects the rows used as simplex vertices, and `_is_in_simplex` results are
    summed. The sum is divided by `binom(n, d + 1)` with `n` the number of rows.

    Parameters
    ----------
    data : DataFrame with one point per row.
    J, containment : accepted for signature compatibility; not used here.

    Returns
    -------
    pd.Series of depths indexed like `data`.
    """
    num_points, dim = data.shape
    vertex_count = dim + 1
    results = []

    for label in data.index:
        target = data.loc[label, :]
        # Candidate vertices are all rows except the one being scored
        other_labels = list(data.drop(label, axis=0).index)

        hits = sum(
            _is_in_simplex(simplex_points=np.array(data.loc[combo, :]), point=np.array(target))
            for combo in _subsequences(other_labels, vertex_count)
        )
        results.append(hits / binom(num_points, vertex_count))

    return pd.Series(index=data.index, data=results)
        
        

In [42]:
# Draw 12 random points with three coordinates (columns A, B, C) and show them as a 3D scatter plot
df = pd.DataFrame(np.random.rand(12, 3), columns=list('ABC'))
px.scatter_3d(x=df['A'], y=df['B'], z=df['C'])

In [43]:
# depths = pointwise_depth(data=df).sort_values(ascending=False)


In [44]:
def _plot(df, deep_or_outlying: pd.Series) -> None:
    n = len(df.columns)
    cols = df.columns
    select = df.loc[deep_or_outlying.index, :]
    
    if n > 3:
        pass
    elif n == 3:
        fig = go.Figure(data=[
            go.Scatter3d(x=df[cols[0]], y=df[cols[1]], z=df[cols[2]], mode='markers', marker_color='blue', name=''),
            go.Scatter3d(x=select[cols[0]], y=select[cols[1]], z=select[cols[2]], mode='markers', 
                      marker_color='red', name='')

        ])
        
        fig.update_layout(showlegend=False)
        fig.show()
    elif n == 2: 
        fig = go.Figure(data=[
            go.Scatter(x=df[cols[0]], y=df[cols[1]], mode='markers', marker_color='blue', name=''),
            go.Scatter(x=select[cols[0]], y=select[cols[1]], mode='markers', 
                      marker_color='red', name='')
        ])
        
        fig.update_layout(showlegend=False)
        fig.show()
    else: # n = 1
        pass

# _plot(df, depths[0:1])

In [45]:
from statdepth import PointwiseDepth    
df = pd.DataFrame(np.random.rand(20, 3), columns=list('ABC'))


In [46]:
d = PointwiseDepth(df)


In [47]:
d

0     0.000000
1     0.028483
2     0.033024
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     0.003302
9     0.000000
10    0.101135
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.003302
17    0.000000
18    0.000000
19    0.000000
dtype: float64

In [55]:
d.median()

10    0.101135
dtype: float64

In [49]:
from statdepth.depth._containment import _is_in_simplex

In [50]:
s = _is_in_simplex

In [51]:
s == _is_in_simplex

True

In [61]:
df

Unnamed: 0,A,B,C
0,0.275182,0.113696,0.215249
1,0.463616,0.30882,0.267753
2,0.623271,0.214659,0.579178
3,0.582068,0.67739,0.018283
4,0.590587,0.844949,0.828108
5,0.057397,0.659594,0.400673
6,0.574153,0.925712,0.29367
7,0.433343,0.132712,0.783638
8,0.629156,0.172964,0.669006
9,0.025275,0.196187,0.576997


In [69]:
def _pointwisedepth(data: pd.DataFrame, points: pd.Index=None, J=2, containment='simplex'):
    """Compute pointwise depth for n points in R^p, where data is an nxp matrix of points. If points is not None,
    only compute depth for the given points (should be a subset of data.index)"""
    n, d = data.shape
    depths = []
    to_compute = data.index
    if points is not None:
        to_compute = points

    if containment == 'simplex':
        for time in to_compute:
            S_nj = 0
            point = data.loc[time, :]
            
            subseq = _subsequences(list(data.drop(time, axis=0).index), d + 1)
            print(f'len of subseq is {len(subseq)}')
            for seq in subseq:

                S_nj += _is_in_simplex(simplex_points=
                        np.array(data.loc[seq, :]), point=np.array(point))
                
            depths.append(S_nj / binom(n, d + 1))

            
    return pd.Series(index=to_compute, data=depths)

def _samplepointwisedepth(data: pd.DataFrame, points: pd.Index=None, K=2, J=2, containment='simplex'):
    n, d = data.shape 
    to_compute = data.index 
    depths = []
    if points is not None:
        to_compute = points 
    
    # K blocks of points (indices)
    ss = n // K 

    for time in to_compute:
        cd = []
        for _ in range(ss):
            sdata = data.sample(n=ss, axis=0)
            
            # If our current datapoint isnt in the sampled data, just append it since we need to sample it 
            # for _is_in_simplex()
            if not time in sdata.index:
                sdata = sdata.append(data.loc[time, :])
                
            cd.append(_pointwisedepth(data=sdata, points=[time], J=J, containment=containment))
        depths.append(np.mean(cd))
        
    print(f'depths is {depths}')
    return pd.Series(index=to_compute, data=depths)

# Sampled depth estimate for every row of df; with K=2 each subsample has n // 2 rows
ds = _samplepointwisedepth(data=df, K=2)

len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 210
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 126
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq is 210
len of subseq is 126
len of subseq is 210
len of subseq is 210
len of subseq is 126
len of subseq

In [71]:
ds.sort_values(ascending=False)

10    0.097273
2     0.030476
1     0.017576
16    0.002857
8     0.002121
19    0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
9     0.000000
18    0.000000
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
17    0.000000
0     0.000000
dtype: float64

In [75]:
import plotly.graph_objects as go 

# Highlight in red the first three entries of d.ordered()
# NOTE(review): presumably these are the three deepest points — confirm the ordering direction of ordered()
_plot(df, d.ordered()[0:3])