astropy/timeseries/periodograms/lombscargle/_statistics.py

"""
Utilities for computing periodogram statistics.

This is an internal module; users should access this functionality via the
``false_alarm_probability`` and ``false_alarm_level`` methods of the
``astropy.timeseries.LombScargle`` API.
"""

from functools import wraps

import numpy as np

from astropy import units as u


def _weighted_sum(val, dy):
    if dy is not None:
        return (val / dy**2).sum()
    else:
        return val.sum()


def _weighted_mean(val, dy):
    if dy is None:
        return val.mean()
    else:
        return _weighted_sum(val, dy) / _weighted_sum(np.ones(val.shape), dy)


def _weighted_var(val, dy):
    return _weighted_mean(val**2, dy) - _weighted_mean(val, dy) ** 2


def _gamma(N):
    from scipy.special import gammaln

    # Note: this is closely approximated by (1 - 0.75 / N) for large N
    return np.sqrt(2 / N) * np.exp(gammaln(N / 2) - gammaln((N - 1) / 2))


def vectorize_first_argument(func):
    @wraps(func)
    def new_func(x, *args, **kwargs):
        x = np.asarray(x)
        return np.array([func(xi, *args, **kwargs) for xi in x.flat]).reshape(x.shape)

    return new_func


def pdf_single(z, N, normalization, dH=1, dK=3):
    """Probability density function for Lomb-Scargle periodogram.

    Compute the expected probability density function of the periodogram
    for the null hypothesis - i.e. data consisting of Gaussian noise.

    Parameters
    ----------
    z : array-like
        The periodogram value.
    N : int
        The number of data points from which the periodogram was computed.
    normalization : {'standard', 'model', 'log', 'psd'}
        The periodogram normalization.
    dH, dK : int, optional
        The number of parameters in the null hypothesis and the model.

    Returns
    -------
    pdf : np.ndarray
        The expected probability density function.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.
    All expressions used here are adapted from Table 1 of Baluev 2008 [1]_.

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    z = np.asarray(z)
    if dK - dH != 2:
        raise NotImplementedError("Degrees of freedom != 2")
    Nk = N - dK

    if normalization == "psd":
        return np.exp(-z)
    elif normalization == "standard":
        return 0.5 * Nk * (1 - z) ** (0.5 * Nk - 1)
    elif normalization == "model":
        return 0.5 * Nk * (1 + z) ** (-0.5 * Nk - 1)
    elif normalization == "log":
        return 0.5 * Nk * np.exp(-0.5 * Nk * z)
    else:
        raise ValueError(f"normalization='{normalization}' is not recognized")


def fap_single(z, N, normalization, dH=1, dK=3):
    """Single-frequency false alarm probability for the Lomb-Scargle periodogram.

    This is equal to 1 - cdf, where cdf is the cumulative distribution.
    The single-frequency false alarm probability should not be confused with
    the false alarm probability for the largest peak.

    Parameters
    ----------
    z : array-like
        The periodogram value.
    N : int
        The number of data points from which the periodogram was computed.
    normalization : {'standard', 'model', 'log', 'psd'}
        The periodogram normalization.
    dH, dK : int, optional
        The number of parameters in the null hypothesis and the model.

    Returns
    -------
    false_alarm_probability : np.ndarray
        The single-frequency false alarm probability.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.
    All expressions used here are adapted from Table 1 of Baluev 2008 [1]_.

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    z = np.asarray(z)
    if dK - dH != 2:
        raise NotImplementedError("Degrees of freedom != 2")
    Nk = N - dK

    if normalization == "psd":
        return np.exp(-z)
    elif normalization == "standard":
        return (1 - z) ** (0.5 * Nk)
    elif normalization == "model":
        return (1 + z) ** (-0.5 * Nk)
    elif normalization == "log":
        return np.exp(-0.5 * Nk * z)
    else:
        raise ValueError(f"normalization='{normalization}' is not recognized")


def inv_fap_single(fap, N, normalization, dH=1, dK=3):
    """Single-frequency inverse false alarm probability.

    This function computes the periodogram value associated with the specified
    single-frequency false alarm probability. This should not be confused with
    the false alarm level of the largest peak.

    Parameters
    ----------
    fap : array-like
        The false alarm probability.
    N : int
        The number of data points from which the periodogram was computed.
    normalization : {'standard', 'model', 'log', 'psd'}
        The periodogram normalization.
    dH, dK : int, optional
        The number of parameters in the null hypothesis and the model.

    Returns
    -------
    z : np.ndarray
        The periodogram power corresponding to the single-peak false alarm
        probability.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.
    All expressions used here are adapted from Table 1 of Baluev 2008 [1]_.

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    fap = np.asarray(fap)
    if dK - dH != 2:
        raise NotImplementedError("Degrees of freedom != 2")
    Nk = N - dK

    # No warnings for fap = 0; rather, just let it give the right infinity.
    with np.errstate(divide="ignore"):
        if normalization == "psd":
            return -np.log(fap)
        elif normalization == "standard":
            return 1 - fap ** (2 / Nk)
        elif normalization == "model":
            return -1 + fap ** (-2 / Nk)
        elif normalization == "log":
            return -2 / Nk * np.log(fap)
        else:
            raise ValueError(f"normalization='{normalization}' is not recognized")


def cdf_single(z, N, normalization, dH=1, dK=3):
    """Cumulative distribution for the Lomb-Scargle periodogram.

    Compute the expected cumulative distribution of the periodogram
    for the null hypothesis - i.e. data consisting of Gaussian noise.

    Parameters
    ----------
    z : array-like
        The periodogram value.
    N : int
        The number of data points from which the periodogram was computed.
    normalization : {'standard', 'model', 'log', 'psd'}
        The periodogram normalization.
    dH, dK : int, optional
        The number of parameters in the null hypothesis and the model.

    Returns
    -------
    cdf : np.ndarray
        The expected cumulative distribution function.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.
    All expressions used here are adapted from Table 1 of Baluev 2008 [1]_.

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    return 1 - fap_single(z, N, normalization=normalization, dH=dH, dK=dK)


def tau_davies(Z, fmax, t, y, dy, normalization="standard", dH=1, dK=3):
    """tau factor for estimating Davies bound (Baluev 2008, Table 1)."""
    N = len(t)
    NH = N - dH  # DOF for null hypothesis
    NK = N - dK  # DOF for periodic hypothesis
    Dt = _weighted_var(t, dy)
    Teff = np.sqrt(4 * np.pi * Dt)  # Effective baseline
    W = fmax * Teff
    Z = np.asarray(Z)
    if normalization == "psd":
        # 'psd' normalization is same as Baluev's z
        return W * np.exp(-Z) * np.sqrt(Z)
    elif normalization == "standard":
        # 'standard' normalization is Z = 2/NH * z_1
        return _gamma(NH) * W * (1 - Z) ** (0.5 * (NK - 1)) * np.sqrt(0.5 * NH * Z)
    elif normalization == "model":
        # 'model' normalization is Z = 2/NK * z_2
        return _gamma(NK) * W * (1 + Z) ** (-0.5 * NK) * np.sqrt(0.5 * NK * Z)
    elif normalization == "log":
        # 'log' normalization is Z = 2/NK * z_3
        return (
            _gamma(NK)
            * W
            * np.exp(-0.5 * Z * (NK - 0.5))
            * np.sqrt(NK * np.sinh(0.5 * Z))
        )
    else:
        raise NotImplementedError(f"normalization={normalization}")


def fap_naive(Z, fmax, t, y, dy, normalization="standard"):
    """False Alarm Probability based on estimated number of indep frequencies."""
    N = len(t)
    T = max(t) - min(t)
    N_eff = fmax * T
    fap_s = fap_single(Z, N, normalization=normalization)
    # result is 1 - (1 - fap_s) ** N_eff
    # this is much more precise for small Z / large N
    # Ignore divide by zero no np.log1p - fine to let it return -inf.
    with np.errstate(divide="ignore"):
        return -np.expm1(N_eff * np.log1p(-fap_s))


def inv_fap_naive(fap, fmax, t, y, dy, normalization="standard"):
    """Inverse FAP based on estimated number of indep frequencies."""
    fap = np.asarray(fap)
    N = len(t)
    T = max(t) - min(t)
    N_eff = fmax * T
    # fap_s = 1 - (1 - fap) ** (1 / N_eff)
    # Ignore divide by zero no np.log - fine to let it return -inf.
    with np.errstate(divide="ignore"):
        fap_s = -np.expm1(np.log(1 - fap) / N_eff)
    return inv_fap_single(fap_s, N, normalization)


def fap_davies(Z, fmax, t, y, dy, normalization="standard"):
    """Davies upper-bound to the false alarm probability.

    (Eqn 5 of Baluev 2008)
    """
    N = len(t)
    fap_s = fap_single(Z, N, normalization=normalization)
    tau = tau_davies(Z, fmax, t, y, dy, normalization=normalization)
    return fap_s + tau


@vectorize_first_argument
def inv_fap_davies(p, fmax, t, y, dy, normalization="standard"):
    """Inverse of the davies upper-bound."""
    from scipy import optimize

    args = (fmax, t, y, dy, normalization)
    z0 = inv_fap_naive(p, *args)
    func = lambda z, *args: fap_davies(z, *args) - p
    res = optimize.root(func, z0, args=args, method="lm")
    if not res.success:
        raise ValueError(f"inv_fap_baluev did not converge for p={p}")
    return res.x


def fap_baluev(Z, fmax, t, y, dy, normalization="standard"):
    """Alias-free approximation to false alarm probability.

    (Eqn 6 of Baluev 2008)
    """
    fap_s = fap_single(Z, len(t), normalization)
    tau = tau_davies(Z, fmax, t, y, dy, normalization=normalization)
    # result is 1 - (1 - fap_s) * np.exp(-tau)
    # this is much more precise for small numbers
    return -np.expm1(-tau) + fap_s * np.exp(-tau)


@vectorize_first_argument
def inv_fap_baluev(p, fmax, t, y, dy, normalization="standard"):
    """Inverse of the Baluev alias-free approximation."""
    from scipy import optimize

    args = (fmax, t, y, dy, normalization)
    z0 = inv_fap_naive(p, *args)
    func = lambda z, *args: fap_baluev(z, *args) - p
    res = optimize.root(func, z0, args=args, method="lm")
    if not res.success:
        raise ValueError(f"inv_fap_baluev did not converge for p={p}")
    return res.x


def _bootstrap_max(t, y, dy, fmax, normalization, random_seed, n_bootstrap=1000):
    """Generate a sequence of bootstrap estimates of the max."""
    from .core import LombScargle

    rng = np.random.default_rng(random_seed)
    power_max = []
    for _ in range(n_bootstrap):
        s = rng.integers(0, len(y), len(y))  # sample with replacement
        ls_boot = LombScargle(
            t, y[s], dy if dy is None else dy[s], normalization=normalization
        )
        freq, power = ls_boot.autopower(maximum_frequency=fmax)
        power_max.append(power.max())

    power_max = u.Quantity(power_max)
    power_max.sort()

    return power_max


def fap_bootstrap(
    Z, fmax, t, y, dy, normalization="standard", n_bootstraps=1000, random_seed=None
):
    """Bootstrap estimate of the false alarm probability."""
    pmax = _bootstrap_max(t, y, dy, fmax, normalization, random_seed, n_bootstraps)

    return 1 - np.searchsorted(pmax, Z) / len(pmax)


def inv_fap_bootstrap(
    fap, fmax, t, y, dy, normalization="standard", n_bootstraps=1000, random_seed=None
):
    """Bootstrap estimate of the inverse false alarm probability."""
    fap = np.asarray(fap)
    pmax = _bootstrap_max(t, y, dy, fmax, normalization, random_seed, n_bootstraps)

    return pmax[np.clip(np.floor((1 - fap) * len(pmax)).astype(int), 0, len(pmax) - 1)]


METHODS = {
    "single": fap_single,
    "naive": fap_naive,
    "davies": fap_davies,
    "baluev": fap_baluev,
    "bootstrap": fap_bootstrap,
}


def false_alarm_probability(
    Z, fmax, t, y, dy, normalization="standard", method="baluev", method_kwds=None
):
    """Compute the approximate false alarm probability for periodogram peaks Z.

    This gives an estimate of the false alarm probability for the largest value
    in a periodogram, based on the null hypothesis of non-varying data with
    Gaussian noise. The true probability cannot be computed analytically, so
    each method available here is an approximation to the true value.

    Parameters
    ----------
    Z : array-like
        The periodogram value.
    fmax : float
        The maximum frequency of the periodogram.
    t, y, dy : array-like
        The data times, values, and errors.
    normalization : {'standard', 'model', 'log', 'psd'}, optional
        The periodogram normalization.
    method : {'baluev', 'davies', 'naive', 'bootstrap'}, optional
        The approximation method to use.
    method_kwds : dict, optional
        Additional method-specific keywords.

    Returns
    -------
    false_alarm_probability : np.ndarray
        The false alarm probability.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.

    See Also
    --------
    false_alarm_level : compute the periodogram level for a particular fap

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    if method == "single":
        return fap_single(Z, len(t), normalization)
    elif method not in METHODS:
        raise ValueError(f"Unrecognized method: {method}")
    method = METHODS[method]
    method_kwds = method_kwds or {}

    return method(Z, fmax, t, y, dy, normalization, **method_kwds)


INV_METHODS = {
    "single": inv_fap_single,
    "naive": inv_fap_naive,
    "davies": inv_fap_davies,
    "baluev": inv_fap_baluev,
    "bootstrap": inv_fap_bootstrap,
}


def false_alarm_level(
    p, fmax, t, y, dy, normalization, method="baluev", method_kwds=None
):
    """Compute the approximate periodogram level given a false alarm probability.

    This gives an estimate of the periodogram level corresponding to a specified
    false alarm probability for the largest peak, assuming a null hypothesis
    of non-varying data with Gaussian noise. The true level cannot be computed
    analytically, so each method available here is an approximation to the true
    value.

    Parameters
    ----------
    p : array-like
        The false alarm probability (0 < p < 1).
    fmax : float
        The maximum frequency of the periodogram.
    t, y, dy : arrays
        The data times, values, and errors.
    normalization : {'standard', 'model', 'log', 'psd'}, optional
        The periodogram normalization.
    method : {'baluev', 'davies', 'naive', 'bootstrap'}, optional
        The approximation method to use.
    method_kwds : dict, optional
        Additional method-specific keywords.

    Returns
    -------
    z : np.ndarray
        The periodogram level.

    Notes
    -----
    For normalization='psd', the distribution can only be computed for
    periodograms constructed with errors specified.

    See Also
    --------
    false_alarm_probability : compute the fap for a given periodogram level

    References
    ----------
    .. [1] Baluev, R.V. MNRAS 385, 1279 (2008)
    """
    if method == "single":
        return inv_fap_single(p, len(t), normalization)
    elif method not in INV_METHODS:
        raise ValueError(f"Unrecognized method: {method}")
    method = INV_METHODS[method]
    method_kwds = method_kwds or {}

    return method(p, fmax, t, y, dy, normalization, **method_kwds)