# Testing Notebook

This notebook contains test code and experiments for implementing the centrality (depth) analysis described in the paper *On the Concept of Depth for Functional Data*.

Warning: the work here is very rough.

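All subsequences of length `l` of a list `s` can be produced with `itertools.combinations(s, l)`, which is imported in the cell below. (No `subsequences` function is defined in this notebook itself.)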

In [6]:
from itertools import combinations 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from statdepth import FunctionalDepth

We now test the standard definition of $V(x_{i_1}, \ldots, x_{i_j})$ containment in $\mathbb{R}^2$.

To do so, we first define a synthetic dataset.

In [12]:
# Base shape shared by every synthetic curve: a 6-point pattern repeated three times.
base_curve = [6, 7, 6.5, 6, 6, 8] * 3

# Seeded generator so that Restart Kernel -> Run All reproduces the same dataset.
rng = np.random.default_rng(seed=0)

# Each of the 10 columns (labelled 0..9) is the base curve scaled by its own weight,
# drawn uniformly from [0, 1). Building the frame in one constructor call avoids
# growing it column by column, and the inner variable no longer shadows the column index.
df = pd.DataFrame({
    column: [value * weight for value in base_curve]
    for column, weight in enumerate(rng.random(10))
})

bd = FunctionalDepth([df], J=2)

In [16]:
bd.plot_deepest(n=2)

In [4]:
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5
x_0,1,2,3,6.0,9,8
x_1,2,4,4,7.0,9,8
x_2,3,5,4,6.5,12,10
x_3,2,6,2,6.0,11,10
x_4,1,2,1,7.0,11,9


In [8]:
from statdepth.depth import banddepth, samplebanddepth

banddepth([df], containment='r2', J=2, relax=True).sort_values(ascending=False)

f_3    0.426667
f_1    0.413333
f_2    0.333333
f_5    0.266667
f_0    0.106667
f_4    0.000000
dtype: float64

In [6]:
samplebanddepth([df], K=2)

f_0    0.000000
f_1    0.166667
f_2    0.166667
f_3    0.333333
f_4    0.000000
f_5    0.166667
dtype: float64

We now test the closed-form solution, an explicit formula that is only valid for $J = 2$.

In [None]:
from scipy.special import comb, binom

def banddepthclosed(data: pd.DataFrame, method='MBD'):
    """Closed-form band depth for J = 2, computed from ranks.

    Ranks are taken down each column (axis 0) and the result holds one depth
    value per row of `data`, so rows play the role of curves and columns the
    role of evaluation points.

    Parameters
    ----------
    data : two-dimensional array-like exposing `.shape`
        Values to rank.
    method : {'MBD', 'BD2'}
        'BD2' gives the band depth, 'MBD' the modified band depth.

    Returns
    -------
    One depth value per row of `data`.

    Raises
    ------
    ValueError
        If `method` is neither 'BD2' nor 'MBD'.
    """
    n_rows, n_cols = data.shape

    # Applying argsort twice converts the values of each column into 1-based ranks.
    order = np.argsort(data, axis=0)
    ranks = np.argsort(order, axis=0) + 1

    # The expressions below are an explicit solution for J=2 and cannot be generalized.
    if method == 'BD2':
        # Band depth: driven by the lowest and highest rank a row ever attains.
        below = np.min(ranks, axis=1) - 1
        above = n_rows - np.max(ranks, axis=1)
        return (above * below + n_rows - 1) / comb(n_rows, 2)

    if method == 'MBD':
        # Modified band depth: the same product, averaged over the columns.
        below = ranks - 1
        above = n_rows - ranks
        return ((np.sum(above * below, axis=1) / n_cols) + n_rows - 1) / comb(n_rows, 2)

    raise ValueError("Unknown input value for parameter `method`.")

# `banddepthclosed` ranks down axis 0 and returns one depth per row, so curves must be rows.
# `df` holds one curve per column (as the column-wise band depth functions expect),
# hence the transpose.
banddepthclosed(df.T)

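We now redefine our band depth to use a modified definition of containment: with `relax=True`, containment is the fraction of points at which the curve lies inside the band, rather than an all-or-nothing indicator.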

In [None]:
def _r2_containment(data: pd.DataFrame, curve: pd.Series, relax: bool) -> float:
    # Generate discrete band 
    containment = 0
    
    y_range = []
    
    mins = data.min(axis=1)
    maxs = data.max(axis=1)
    
    intervals = [[i, j] for i, j in zip(mins, maxs)]
    
    # Check if each value in the curve is entirely contained within the band 
    for index, val in enumerate(curve):
        # If any value is not, then break out. This is strict containment!
        if intervals[index][0] <= val <= intervals[index][1]:
            containment += 1
        
    return containment / len(curve) if relax else containment // len(curve)

In [None]:
def _band_depth(data: pd.DataFrame, curve: str, relax: bool, containment='r2', J=2) -> float:    
    # Initialize band depth, n (number of curves)
    band_depth = 0
    n = data.shape[1]
    
    # Select our containment definition if it is in our pre-defined list
    if containment == 'r2':
        cdef = _r2_containment
    elif containment == 'r2_enum':
        cdef = _r2_enum_containment
    elif containment == 'simplex':
        cdef = _simplex_containment
    else:
        # TODO: Allow user to pass in custom definition of containment
        raise ValueError('Error: Unknown or unspecified definition of containment')

    # Grab the data for our curve so numerical slicing is guaranteed to work
    curve_data = data.loc[:, curve]

    # Drop the curve (we don't want it used in defining our band/generalized band -- doesn't make sense)
    data = data.drop(curve, axis=1)

    # Define our index to be the columns of our dataset, excluding the last row (for indexing reasons)
    idx = list(data.columns)
    
    # Compute band depth
    for j in range(2, J + 1):
        
        # Initialize S_n^(j) as defined in the paper
        S_nj = 0

        # Get a list of all possible subsequences of samples (cols)
        subseq = subsequences(idx, j)

        # Get generalized containment for this value of J=j
        for sequence in subseq:
            subseq_df = data.loc[:, list(sequence)]

            S_nj += cdef(data=subseq_df, curve=curve_data, relax=relax)

        band_depth += S_nj / binom(n, j)
    
    return band_depth

def banddepth(data: list, J=2, containment='r2', relax=False):
    """Band depth of every column of a single frame.

    Parameters
    ----------
    data : list
        A list holding exactly one DataFrame (the real-valued case), with one
        curve per column.
    J, containment, relax
        Forwarded unchanged to `_band_depth`.

    Returns
    -------
    pd.Series indexed by the frame's column labels, or None when `data` does
    not contain exactly one frame (that case is not implemented).
    """
    # Only the single-frame case is handled; anything else yields None.
    if len(data) != 1:
        return None

    frame = data[0]
    depths = [
        _band_depth(data=frame, curve=label, containment=containment, J=J, relax=relax)
        for label in frame.columns
    ]
    return pd.Series(index=frame.columns, data=depths)

In [None]:
df

In [None]:
banddepth(data=[df], J=2, relax=True)

In [None]:
plt.plot(df);

In [None]:
# `banddepthclosed` returns one depth per row, whereas `df` holds one curve per column:
# transpose so the result is comparable with `banddepth` above.
banddepthclosed(df.T)

In [None]:
# Small hand-written 5x5 frame: columns x_0..x_4, rows f_1..f_5.
row_labels = ['f_1', 'f_2', 'f_3', 'f_4', 'f_5']
column_values = {
    'x_0': [1, 2, 1, 1, 2],
    'x_1': [2, 3, 4, 2, 2],
    'x_2': [1, 3, 1, 3, 2],
    'x_3': [1, 2, 1, 1, 2],
    'x_4': [2, 3, 4, 2, 1],
}
df = pd.DataFrame(column_values, index=row_labels)

In [None]:
# Random integer test frame: 137 rows by 4 columns (A-D), values in 0..100 inclusive.
# A seeded generator keeps the data identical across Restart Kernel -> Run All.
df = pd.DataFrame(
    np.random.default_rng(seed=0).integers(0, 101, size=(137, 4)),
    columns=list('ABCD'),
)
df

In [None]:
from typing import Callable 

def _r2_containment(data: pd.DataFrame, curve: pd.Series, relax: bool) -> float:
    containment = 0
    
    y_range = []
    
    mins = data.min(axis=1)
    maxs = data.max(axis=1)
    
    intervals = [[i, j] for i, j in zip(mins, maxs)]
    
    # Check if each value in the curve is entirely contained within the band 
    for index, val in enumerate(curve):
        # If any value is not, then break out. This is strict containment!
        if intervals[index][0] <= val <= intervals[index][1]:
            containment += 1
        
    return containment / len(curve) if relax else containment // len(curve)

def _univariate_band_depth(data: pd.DataFrame, curve: int, relax: bool, containment: Callable, J=2) -> float:
    
    # Initialize band depth, n (number of curves)
    band_depth = 0
    n = data.shape[1]
    
    # Grab the data for our curve so numerical slicing is guaranteed to work
    curve_data = data.loc[:, curve]

    # Drop the curve (we don't want it used in defining our band/generalized band -- doesn't make sense)
    data = data.drop(curve, axis=1)

    # Define our index to be the columns of our dataset, excluding the last row (for indexing reasons)
    idx = list(data.columns)
    
    # Compute band depth
    for j in range(2, J + 1):
        
        # Initialize S_n^(j) as defined in the paper
        S_nj = 0

        # Get a list of all possible subsequences of samples (cols)
        subseq = subsequences(idx, j)

        # Get generalized containment for this value of J=j
        for sequence in subseq:
            subseq_df = data.loc[:, list(sequence)]
            S_nj += containment(data=subseq_df, curve=curve_data, relax=relax)

        band_depth += S_nj / binom(n, j)
    
    return band_depth

def banddepth(data: "List[pd.DataFrame]", J=2, containment='r2', relax=False, deep_check=False):
    """Band depth of every column of a single frame, returned as a plain list.

    The `data` annotation is a string so that defining this function does not
    require `typing.List` to be imported beforehand.

    Parameters
    ----------
    data : list of pd.DataFrame
        A list holding exactly one frame (the real-valued case), one curve per column.
    J : int
        Forwarded to `_univariate_band_depth`.
    containment : 'r2' or callable
        'r2' selects `_r2_containment`. A callable is used directly as the
        containment definition. Any other value falls back to `_r2_containment`
        (as before) but now emits a warning instead of being ignored silently.
    relax : bool
        Forwarded to `_univariate_band_depth`.
    deep_check : bool
        Accepted for signature compatibility; not used here.

    Returns
    -------
    A list with one depth per column, in column order, or None when `data` does
    not contain exactly one frame (the multivariate case is not implemented).
    """
    # Select containment definition
    if callable(containment):
        cdef = containment
    else:
        if containment != 'r2':
            import warnings
            warnings.warn(
                f"Unknown containment {containment!r}; falling back to 'r2'.",
                stacklevel=2,
            )
        cdef = _r2_containment

    # Exactly one item in the list is the real-valued case.
    if len(data) == 1:
        frame = data[0]
        return [
            _univariate_band_depth(data=frame, curve=col, relax=relax, containment=cdef, J=J)
            for col in frame.columns
        ]

    # Multivariate case: not implemented yet.
    return None


In [None]:
from typing import List 
import random

def samplebanddepth(data: List[pd.DataFrame], K: int, J=2, containment='r2', relax=False, deep_check=False, random_state=None):
    """Compute band depths on K disjoint random row-blocks of a single frame.

    The rows of the frame are split, without replacement, into K blocks of
    ``n_rows // K`` rows each (leftover rows are not used), and `banddepth` is
    evaluated on every block.

    Parameters
    ----------
    data : list of pd.DataFrame
        A list holding exactly one frame for the univariate case.
    K : int
        Number of blocks. ``K == 0`` raises ZeroDivisionError.
    J, containment, relax, deep_check
        Forwarded unchanged to `banddepth`.
    random_state : int, np.random.RandomState, np.random.Generator or None
        Seed or random generator for the row partition. None (the default) keeps
        the previous behaviour of drawing from pandas' default random source.
        Whether a Generator is accepted depends on the installed pandas version.

    Returns
    -------
    A list with the result of `banddepth` for each block, or an empty list when
    `data` does not contain exactly one frame (multivariate case not implemented).
    """
    band_depth_samples = []

    # Handle common errors
    # TODO: _handle_depth_errors(data=data, J=J, deep_check=deep_check)

    # Univariate case
    if len(data) == 1:
        remaining = data[0]
        block_size = remaining.shape[0] // K

        # Turn an integer seed into one shared RandomState so successive draws advance
        # a single stream instead of re-seeding on every call.
        if isinstance(random_state, (int, np.integer)):
            sampler = np.random.RandomState(random_state)
        else:
            sampler = random_state

        for _ in range(K):
            block = remaining.sample(n=block_size, random_state=sampler)
            # Rows are removed by index label so that later blocks cannot reuse them
            # (assumes index labels are unique -- TODO confirm for real inputs).
            remaining = remaining.drop(block.index)
            band_depth_samples.append(
                banddepth(data=[block], J=J, containment=containment, relax=relax, deep_check=deep_check)
            )

    # Multivariate case -- TODO: partition the list of DataFrames randomly and compute
    # band depth with respect to each part. Not implemented; the result stays empty.

    return band_depth_samples


In [None]:
samples = samplebanddepth([df], K=2, relax=True)

In [None]:
plt.plot(df);

In [None]:
samples

In [None]:
# Use the number of blocks actually returned instead of a second hard-coded copy of K,
# so this cell stays correct if the `samplebanddepth` call is changed.
K = len(samples)

# Mean depth of each curve (by column position) across the K sample blocks.
depths = [
    np.mean([block[position] for block in samples])
    for position in range(df.shape[1])
]

depths

In [None]:
banddepth([df], relax=True)

In [None]:
%pip install ..

In [None]:
from statdepth.depth import banddepth, samplebanddepth

In [None]:
from inspect import signature

def test(arg1, arg2, arg3) -> bool:
    """Three-parameter dummy used to try out `inspect.signature`; always returns True."""
    return True

sig = signature(test)

In [None]:
len(sig.parameters)