### Decide whether a sample is from a discrete or continuous distribution

The code below computes the Normalized Nearest Neighbor Distribution (NNN) and then uses the Wasserstein distance to quantify how far from uniform it lives. Discrete distributions have an NNN that is much farther from uniform than continuous distributions.

It also examines cases where there are multiple repeated values, and tries to decide whether they represent legitimate sampling or whether they result from some non-sampling process, for example clipping, using a numerical value as a placeholder for missing data, or attaching per-image metadata to multiple targets within an image.


In [None]:
import numbers

import numpy as np
from numpy.typing import NDArray
from scipy.stats import wasserstein_distance as emd
import copy
import pandas as pd

from scipy.stats import ks_2samp

# Decision threshold for infer_discrete(): when the Wasserstein distance between a
# feature's NNN sample and a uniform distribution on [0, 1] exceeds this value,
# the feature is reported as discrete.
DISCRETE_MIN_EMD = 0.054

# Minimum number of examples needed before the NNN test is attempted; smaller
# samples always look discrete, so infer_discrete() reports them as discrete
# without testing.
CONTINUOUS_MIN_SAMPLE_SIZE = 20

In [None]:
def _noisy_discrete_distances(xu, repeats, non_repeats):
    """Compare the non-repeated values against "noisy discrete" and "continuous".

    The unique values are binned with boundaries placed midway between adjacent
    repeated values. Each non-repeated value is then mapped to a normalized
    location inside its bin such that the bin's repeated value sits at 0.5 and
    the bin boundaries sit at 0 and 1.

    Returns the pair ``(dist_to_uni, dist_to_discrete)``: the Wasserstein
    distances from those normalized locations to a uniform grid on [0, 1] and
    to a point mass at 0.5, respectively.
    """
    # Interior edges lie halfway between adjacent repeated values.
    inner_edges = (repeats[1:] + repeats[:-1]) / 2

    # Use the average spacing of the unique values to place the two outer edges
    # just beyond the smallest and largest observed values.
    dx_avg = (xu[-1] - xu[0]) / len(xu)
    bin_edges = np.concatenate(([xu[0] - dx_avg], inner_edges, [xu[-1] + dx_avg]))

    # Bins need not have equal width, so convert values to fractional bin
    # indices: integer part = bin number, fractional part = position in bin.
    itable = np.linspace(0, len(bin_edges) - 1, len(bin_edges))
    rindex = np.interp(repeats, bin_edges, itable)
    xindex = np.interp(non_repeats, bin_edges, itable)

    rfrac, _ = np.modf(rindex)  # position of each repeat inside its own bin
    xint = np.floor(xindex).astype(np.intp)
    xfrac = xindex - xint
    rfracx = rfrac[xint]  # position of the repeat sharing each value's bin

    # Scale each side of the repeat separately so that the repeat maps to 0.5
    # and the bin edges map to 0 and 1 (a centered repeat gives a scale of 1).
    denom = np.where(xfrac <= rfracx, 2 * rfracx, 2 * (1 - rfracx))
    xnorm = (xfrac - rfracx) / denom + 0.5

    dist_to_uni = emd(xnorm, np.linspace(0, 1, len(xnorm)))
    dist_to_discrete = emd(xnorm, np.zeros_like(xnorm) + 0.5)
    return dist_to_uni, dist_to_discrete


def _nnn_distance_to_uniform(xu):
    """Return the Wasserstein distance between the NNN sample of *xu* and uniform.

    *xu* must be sorted and hold at least three values. Each interior value is
    expressed as its fractional position between its left and right neighbors.
    """
    left, middle, right = xu[:-2], xu[1:-1], xu[2:]
    span = right - left
    usable = span > 0  # entries whose neighbor span is not positive stay at 0

    nnn = np.zeros(len(xu) - 2)  # end points have no pair of neighbors
    nnn[usable] = (middle - left)[usable] / span[usable]
    return emd(nnn, np.linspace(0, 1, len(nnn)))


def infer_discrete(
    X: NDArray,
    verbose: bool = False,
    *,
    min_emd: "float | None" = None,
    min_sample_size: "int | None" = None,
) -> list[bool]:
    """Test each feature of a sample for discreteness.

    Imagine 1D data representing a sample of a continuous distribution, e.g. event times for emissions
    from a radioactive source. Think about the intervals between consecutive events; they have an exponential
    distribution. The most likely interval is zero, and longer intervals are exponentially less likely;
    the average interval is the reciprocal of the decay rate. This stands in stark contrast with the tick
    times of a clock; the distribution of intervals between clock ticks is extremely sharply peaked; the
    average and most likely intervals are in fact the same. Radioactive decay times and clock ticks
    illustrate the fundamental distinction between continuous and discrete distributions.

    Any 1D sample can be sorted in the way that times naturally are, so we can think about the
    intervals between adjacent points. For a continuous distribution, a point is equally likely
    to lie anywhere in the interval bounded by its two neighbors. We put all "between-neighbor"
    locations on the same scale of 0 to 1 by subtracting the smaller neighbor and dividing out
    the length of the interval. These normalized locations will be much more uniformly distributed
    for continuous data than for discrete, and this gives us a way to distinguish them. Call this
    the Normalized Nearest Neighbor distribution (NNN), defined on the interval [0,1]. Here the NNN
    is computed from the unique values of each feature.

    The Wasserstein distance (scipy.stats.wasserstein_distance) measures how close the NNN is to a
    uniform distribution over [0,1]. We found that as long as a sample has at least 20 points, and
    furthermore at least half as many points as there are discrete values, we can reliably distinguish
    discrete from continuous samples by testing that the Wasserstein distance is greater or less than 0.054,
    respectively.

    In addition, when a feature has at least three duplicated entries alongside some non-repeated
    values, the non-repeated values are checked for looking like noise-perturbed copies of the
    repeated ones; if they do, the feature is also reported as discrete.

    Args:
        X: Sample of shape (n_examples,) or (n_examples, n_features). Anything accepted by
            ``np.asarray`` is allowed.
        verbose: If True, print the distances computed by the noisy-discrete check.
        min_emd: Wasserstein-distance threshold above which a feature is called discrete.
            Defaults to the module-level ``DISCRETE_MIN_EMD``.
        min_sample_size: Samples with fewer examples than this are reported as discrete without
            testing (a message is printed regardless of ``verbose``). Defaults to the module-level
            ``CONTINUOUS_MIN_SAMPLE_SIZE``.

    Returns:
        One Python bool per feature; True means the feature looks discrete. Non-numerical
        features and features with fewer than three unique values are always True.

    Example:

    >>> rng = np.random.default_rng(314159265)
    >>> Xnorm = rng.normal(100, scale=10, size=50)
    >>> print(f'Normal sample is discrete: {infer_discrete(Xnorm)[0]}')
    >>> Xfish = rng.poisson(100, size=50)
    >>> print(f'Poisson sample is discrete: {infer_discrete(Xfish)[0]}')
    >>> ks = ks_2samp(Xnorm, Xfish)
    >>> print(f'KS can distinguish Normal from Poisson: {ks.pvalue < 0.05}')
    Normal sample is discrete: False
    Poisson sample is discrete: True
    KS can distinguish Normal from Poisson: False
    """
    if min_emd is None:
        min_emd = DISCRETE_MIN_EMD
    if min_sample_size is None:
        min_sample_size = CONTINUOUS_MIN_SAMPLE_SIZE

    X = np.asarray(X)  # accept lists and other array-likes, not only ndarrays
    if X.ndim == 1:
        X = np.expand_dims(X, axis=1)
    n_examples, n_features = X.shape

    if n_examples < min_sample_size:
        print(
            f"All samples look discrete with so few data points (< {min_sample_size})"
        )
        return [True] * n_features

    # A shift of *more* than min_emd indicates discrete, so any feature skipped
    # below keeps this initial value and is reported as discrete.
    shift = np.full(n_features, min_emd + 1)
    # Re-checked when there are enough repeated values; may then become True.
    looks_like_noisy_discrete = np.full(n_features, False)

    # With a numeric dtype every element is a number, so the per-element scan
    # is only needed for other dtypes (e.g. object or string arrays).
    numeric_dtype = np.issubdtype(X.dtype, np.number)

    for i in range(n_features):
        column = X[:, i]

        # Skip non-numerical features. NB: np.nan *is* a number in this context.
        # NOTE(review): NaN entries are not filtered out here; their effect on
        # the statistics below has not been verified.
        if not numeric_dtype and not all(
            isinstance(xi, numbers.Number) for xi in column
        ):
            continue

        # Fewer than 3 unique values should definitely be called discrete.
        xu, nu = np.unique(column, return_counts=True, axis=None)
        if len(xu) < 3:
            continue

        repeats = xu[nu > 1]
        non_repeats = xu[nu == 1]

        # With several duplicated entries, check whether the non-repeated values
        # are just noisy versions of the repeated ones (discrete), or look like
        # a continuous sample in their own right. In the latter case the repeats
        # are taken to be genuine copies (e.g. per-image metadata attached to
        # several objects from one image) and do not by themselves make the
        # feature discrete. The smaller of the two distances wins.
        if len(non_repeats) > 0 and n_examples - len(xu) >= 3:
            dist_to_uni, dist_to_discrete = _noisy_discrete_distances(
                xu, repeats, non_repeats
            )
            looks_like_noisy_discrete[i] = dist_to_discrete < dist_to_uni
            if verbose:
                print(f'looks_like_noisy_discrete: {looks_like_noisy_discrete}')
                print(f'\t distance to uniform: {dist_to_uni:.3f}')
                print(f'\t distance to discrete:{dist_to_discrete:.3f}')

        # NNN really only makes sense with unique values, so it is computed
        # from xu; repeated values are not excluded or treated specially here.
        shift[i] = _nnn_distance_to_uniform(xu)

    looks_discrete = np.logical_or(shift > min_emd, looks_like_noisy_discrete)
    return looks_discrete.tolist()  # plain Python bools, matching the annotation


In [None]:

rng = np.random.default_rng(314159265)

# Poisson counts where the first half is perturbed by Gaussian noise and the
# second half is left exactly integer-valued.
poisson_counts = rng.poisson(5, size=100)
jitter = np.concatenate((rng.normal(scale=.11, size=50), np.zeros(50)))  # 11% noise is a lot!
Xrep = poisson_counts + jitter
infer_discrete(Xrep, verbose=True)

In [None]:
rng = np.random.default_rng(314159265)
npts = 800
loc = 100

# Draw a normal and a Poisson sample with matching mean and variance.
Xnorm = rng.normal(loc, scale=np.sqrt(loc), size=npts)
Xfish = rng.poisson(loc, size=npts)

normal_is_discrete = infer_discrete(Xnorm, verbose=True)[0]
print(f'Normal sample is discrete: {normal_is_discrete}')

poisson_is_discrete = infer_discrete(Xfish, verbose=True)[0]
print(f'Poisson sample is discrete: {poisson_is_discrete}')

# A two-sample KS test on the same data, for comparison.
ks = ks_2samp(Xnorm, Xfish)
ks_rejects = ks.pvalue < 0.05  # type: ignore
print(f'KS can distinguish Normal from Poisson: {ks_rejects}')


In [None]:
import matplotlib.pyplot as plt
# Overlay histograms of the two samples and report the KS p-value in the title.
sample_labels = ['normal: continuous', 'Poisson: discrete']
ks_pvalue = ks.pvalue  # type: ignore
plt.hist((Xnorm, Xfish));
plt.legend(sample_labels)
plt.title(f'Similar distributions (p = {ks_pvalue:.3f})');


Of course, the hist function bins both quantities and plots them as discrete. But the normal values are nevertheless from a continuous distribution.

The point is, the NNN and Wasserstein distance together provide a way to reasonably infer that a feature should be handled as discrete, for functions that want to know.


It's becoming apparent that we should deal with repeated values and non-repeated values separately. A sample with enough repeated values will always look discrete, even in cases where the non-repeated values are from a process that generates a continuous distribution, and the repeats are generated by something else entirely. For example, repeats can be caused by clipping, by using a numerical value for missing data, or by dealing the same value out to multiple examples, e.g. per-image metadata for an image with multiple examples detected.

The next set of tests generates discrete samples without repeated values and tests the sensitivity of infer_discrete.


In [None]:
# A permutation of integers is discrete but contains no repeated values;
# compare it against a continuous uniform sample of the same size.
uni_d = np.random.permutation(2000)
uni = np.random.uniform(-0.5, 0.5, size=uni_d.size)

unique_fraction = np.unique(uni_d).size / uni.size
print(unique_fraction)

for sample in (uni_d, uni):
    print(infer_discrete(sample))

In [None]:
top_value = 2000
frac = 0.5
noise = 0.5

# Keep a fixed-length prefix of a random permutation: the selection is still
# random, but the number of values kept is known exactly.
grab_some = int(frac * top_value)
uni_d = np.random.permutation(top_value)[:grab_some]  # no repeats

uni = np.random.uniform(0, top_value, size=uni_d.size)

n_grabbed = len(uni_d)
unique_fraction = len(np.unique(uni_d)) / n_grabbed
print(f'grabbed {n_grabbed} values out of {top_value}')
print(f'What fraction are unique? {unique_fraction}')

discrete_call = infer_discrete(uni_d)
print(f'uni_d looks_discrete: {discrete_call}')
continuous_call = infer_discrete(uni)
print(f'uni looks discrete: {continuous_call}')

In [None]:
top_value = 2000  # samples are drawn from a permutation of the integers 0 .. top_value - 1
fracs = np.linspace(0.1, 0.35, 50)  # fraction of those integers kept and passed to infer_discrete()

ntry = 1000  # number of trials per fraction used to estimate accuracy
c_acc = np.zeros(len(fracs))  # fraction of continuous samples called continuous
d_acc = np.zeros(len(fracs))  # fraction of discrete samples called discrete

for ifrac, frac in enumerate(fracs):
    grab_some = int(frac * top_value)

    udd = np.full(ntry, False)  # discrete sample correctly called discrete
    ud = np.full(ntry, False)  # continuous sample correctly called continuous

    for i in range(ntry):
        # Prefix of a random permutation: random, repeat-free, and of known size.
        uni_d = np.random.permutation(top_value)[:grab_some]
        uni = np.random.uniform(0, top_value, size=len(uni_d))

        # infer_discrete() returns a list of length 1 for 1D input.
        udd[i] = bool(infer_discrete(uni_d)[0])
        ud[i] = not infer_discrete(uni)[0]

    # The mean of a boolean array is the fraction of True entries. This avoids
    # routing the count through float16, which cannot represent every integer
    # above 2048 and would silently distort the accuracy for larger ntry.
    d_acc[ifrac] = udd.mean()
    c_acc[ifrac] = ud.mean()

In [None]:
# Accuracy of the continuous and discrete calls as a function of sample fraction.
for accuracy in (c_acc, d_acc):
    plt.plot(fracs, accuracy)
plt.title('no repeats in sample: acc vs frac')
plt.xlabel('fraction of possible discrete values actually present in sample')
plt.ylabel('fraction of correct calls')
plt.legend(['continuous', 'discrete'])

Following are the tests that Ryan suggested, which got me thinking about repeats.


In [None]:
samples = 4000
# Each image holds 1 or 2 objects (probabilities 0.75 / 0.25).
objects = np.random.choice(2, samples, p=[0.75, 0.25]) + 1
# One continuous metadata value per image.
metadata = np.random.rand(samples) * 50

# Make a lot of metadata repeats by copying each image's value to every object
# in that image. Should they be discrete? I think not!
check = np.repeat(metadata, objects)
num_dups = check.size - np.unique(check).size
print(num_dups)

print(f'repeated metadata are discrete: {infer_discrete(check, verbose=True)}')
print(f'metadata are discrete: {infer_discrete(metadata, verbose=True)}')
print(f'rounding metadata makes them discrete: {infer_discrete(np.round(metadata), verbose=True)}')

In [None]:
ntests = 50
num_samples = [2000, 4000, 10000]

# Column vector holding the sample size used for each row of `results`.
sample_number = np.repeat(num_samples, ntests)[:, np.newaxis]
results = np.zeros((len(num_samples) * ntests, 5))

for i, samples in enumerate(num_samples):
    for j in range(ntests):
        # 94-97% of images hold a single object; the rest hold 3, 4 or 5.
        percent = (np.random.choice(4) + 94) / 100
        leftover = (1 - percent) / 3
        objects = np.random.choice(5, samples, p=[percent, 0.0, leftover, leftover, leftover]) + 1
        num_img_with_objects = np.count_nonzero(objects > 1)

        if (j + 1) % 2 == 0:
            # Every second test: (j+1)% of the metadata values are copies of
            # randomly chosen earlier values.
            metadata = np.random.rand(int(samples * (100 - (j + 1)) / 100)) * 100
            icopy = np.random.choice(metadata.size, int(samples * (j + 1) / 100))
            metadata = np.concatenate([metadata, metadata[icopy]])
        else:
            metadata = np.random.rand(samples) * 100

        # Copy each image's metadata value to every object in that image.
        check = np.repeat(metadata, objects[:metadata.size])
        unique = np.unique(check).size
        num_dups = check.size - unique

        obj_discrete = infer_discrete(check)
        meta_discrete = infer_discrete(metadata)

        results[i * ntests + j, :] = [
            num_img_with_objects / samples * 100,
            num_dups,
            unique / samples * 100,
            obj_discrete[0],
            meta_discrete[0],
        ]

data = np.hstack((sample_number, results))
df = pd.DataFrame(data, columns=['Sample Number','Object Percentage', 'Total Duplicates', 'Percent Unique', 'Object Discrete', 'Meta Discrete'])

# For each (sample size, metadata verdict) pair, show how many tests landed
# there and the percentage of unique values in each.
groups = df.groupby(by=['Sample Number','Meta Discrete'])
for key, frame in groups:
    print(key, len(frame))
    print(np.array(frame['Percent Unique']))