# Null tests for high delay detections
This notebook constructs null tests between high-delay signals in power spectra that show systematic features with different prominence. A null test takes the difference between power spectra obtained from different data selections and serves as a consistency check.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import hera_pspec as hp
from pyuvdata import UVData
from scipy.stats import norm
from scipy.optimize import curve_fit
import pyuvdata.utils as uvutils
from scipy import integrate,stats

In [None]:
# Load beam model
# The resulting beam object supplies the Jy -> mK conversion factor and is
# handed to PSpecData further down in the notebook.
# NOTE: the path is relative, so the beamfits file must be in the working directory.
beamfile = 'HERA_NF_dipole_power.beamfits'
cosmo = hp.conversions.Cosmo_Conversions()  # built with the library's default arguments
uvb = hp.pspecbeam.PSpecBeamUV(beamfile, cosmo=cosmo)

In [None]:
# Load data into UVData objects
# NOTE: the path is relative, so the uvh5 file must be in the working directory.
dfile = 'zen.2458101.clean-002.uvh5' 
uvd = UVData()
uvd.read(dfile)  # fills `uvd` in place from the file

In [None]:
# find conversion factor from Jy to mK
# The factor is evaluated at the unique frequencies of the data for the 'xx' polarization.
Jy_to_mK = uvb.Jy_to_mK(np.unique(uvd.freq_array), pol='xx')
# NOTE: this multiplies uvd.data_array in place, so running this cell a second time
# applies the factor twice. Re-run the data-loading cell before repeating this one.
# The [None, None, :, None] index broadcasts the factor along the third axis of a 4-D
# data_array (presumably the frequency axis -- TODO confirm for the installed pyuvdata).
uvd.data_array *= Jy_to_mK[None, None, :, None]

In [None]:
# Build two interleaved time selections: uvd2 uses the integrations that sit
# one sample after those used for uvd1 (index offsets 16,18,... vs 17,19,...).
unique_times = np.unique(uvd.time_array)
even_times = unique_times[16:44:2]
odd_times = unique_times[17:45:2]
uvd1 = uvd.select(times=even_times, inplace=False)
uvd2 = uvd.select(times=odd_times, inplace=False)

In [None]:
# Wrap the two interleaved datasets (no weights) together with the beam model
ds = hp.PSpecData(dsets=[uvd1, uvd2], wgts=[None, None], beam=uvb)
ds.rephase_to_dset(0)

# Power spectra of the 'clean' data: each baseline is paired with itself,
# crossing dataset 0 with dataset 1, for the xx-xx polarization pair.
baselines = [(66, 67), (83, 84), (37, 38)]
pspec_options = dict(
    spw_ranges=[(520, 690)],
    input_data_weight='identity',
    norm='I',
    taper='blackman-harris',
    verbose=False,
)
uvp = ds.pspec(baselines, baselines, (0, 1), [('xx', 'xx')], **pspec_options)

In [None]:
spw = 0
dlys = uvp.get_dlys(spw) * 1e9  # scaled by 1e9; used as delays in ns below

# One (spw, baseline-pair, pol) key per baseline, each baseline paired with itself:
#   key1 / power1 : (66, 67) -- power spectrum with clear cable reflection features
#   key2 / power2 : (83, 84) -- power spectrum without clear cable reflection features
#   key3 / power3 : (37, 38) -- power spectrum without clear cable reflection features
key1, key2, key3 = [(spw, (bl, bl), 'xx') for bl in [(66, 67), (83, 84), (37, 38)]]

# Keep only the real part of each spectrum
power1, power2, power3 = [np.real(uvp.get_data(k)) for k in (key1, key2, key3)]

# `blp` ends up holding the last baseline pair, as in the step-by-step version
blp = key3[1]

In [None]:
# Integer-indexed lookups (0, 1, 2) for the three spectra and their keys
power = dict(enumerate([power1, power2, power3]))
key = dict(enumerate([key1, key2, key3]))

Define the PDFs and CDFs of the $\Delta\mathcal{CNN}$ and $\mathcal{N}$ distributions:

Step 2: Fit Gaussian distribution and the $\Delta\mathcal{CNN}$ distribution with differences 1 & 2.

Step 3: Perform KS goodness-of-fit test for each fit.

In [None]:
# PDF of the delta CNN distribution
def null_pdf(x, s):
    """
    Evaluate the delta CNN probability density.

    Computes (1 / (2 s^2)) * exp(-2 |x| / s) * (s + 2 |x|), which is
    symmetric about x = 0.

    Parameters
    ----------
    x : float or array
        Point(s) at which to evaluate the density.
    s : float
        Scale parameter of the distribution.

    Returns
    -------
    float or array
        Density at `x`, broadcast against `s`.
    """
    abs_x = np.abs(x)
    return (1/(2*s**2))*np.exp(-2*abs_x/(s))*(s+2*abs_x)

# CDF of the delta CNN distribution
def null_cdf(x, sig, range_start, range_end):
    """
    Evaluate the delta CNN CDF, truncated to [range_start, range_end].

    Each value is the numerical integral of `null_pdf` from `range_start`
    to the sample, divided by the integral over the whole range, so the
    result is 0 at `range_start` and 1 at `range_end`.

    Parameters
    ----------
    x : float or 1d array-like
        Point(s) at which to evaluate the CDF. A scalar is treated as a
        one-element array.
    sig : float
        Scale parameter passed to `null_pdf`.
    range_start, range_end : float
        Integration limits defining the truncation interval.

    Returns
    -------
    1d array
        CDF values, one per entry of `x`.
    """
    density = lambda k: null_pdf(k, sig)
    # The normalisation does not depend on the sample, so integrate it once
    # instead of once per point.
    total = integrate.quad(density, range_start, range_end)[0]
    return np.array([integrate.quad(density, range_start, value)[0] / total
                     for value in np.atleast_1d(x)])

# PDF of the N distribution
def gaus_pdf(x, mu, sig):
    """
    Evaluate a Gaussian profile exp(-(x - mu)^2 / (2 sig^2)).

    NOTE: no 1/(sig*sqrt(2*pi)) prefactor is applied, so the peak value is 1
    and the profile does not integrate to 1. `gaus_cdf` normalises by an
    integral ratio, so it is unaffected.

    Parameters
    ----------
    x : float or array
        Point(s) at which to evaluate the profile.
    mu : float
        Mean.
    sig : float
        Standard deviation.

    Returns
    -------
    float or array
        Unnormalised Gaussian evaluated at `x`.
    """
    return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

# CDF of the N distribution
def gaus_cdf(x, mu, sig, range_start, range_end):
    """
    Evaluate the Gaussian CDF, truncated to [range_start, range_end].

    Each value is the numerical integral of `gaus_pdf` from `range_start`
    to the sample, divided by the integral over the whole range.

    Parameters
    ----------
    x : float or 1d array-like
        Point(s) at which to evaluate the CDF. A scalar is treated as a
        one-element array.
    mu, sig : float
        Mean and standard deviation passed to `gaus_pdf`.
    range_start, range_end : float
        Integration limits defining the truncation interval.

    Returns
    -------
    1d array
        CDF values, one per entry of `x`.
    """
    density = lambda k: gaus_pdf(k, mu, sig)
    # The normalisation does not depend on the sample, so integrate it once
    # instead of once per point.
    total = integrate.quad(density, range_start, range_end)[0]
    return np.array([integrate.quad(density, range_start, value)[0] / total
                     for value in np.atleast_1d(x)])

Define KS test:

In [None]:
def ks_test(data, pdf, guess_args):
    """
    Fit `pdf` to `data` and run an approximate KS goodness-of-fit test.

    H0: the data follow a given distribution
    d < critical value (cv)  -->> accept H0

    The empirical CDF is approximated by the cumulative histogram of `data`
    with ``len(data)`` bins. The model CDF is the cumulative sum of the
    fitted `pdf` sampled at the right edges of those bins. Both are
    normalised to end at 1.

    Parameters
    ----------
    data: 1d array
        empirical data 
    pdf: callable
        probability density function, called as ``pdf(x, *params)``
    guess_args: float, 1d array
        pdf parameters guess (passed to ``curve_fit`` as ``p0``)
        
    Returns
    -------
    list ``[d, cv, m]`` with
    d: float
        KS-test stats (largest absolute gap between the two CDFs)
    cv: float
        critical value (Significance level:  α = 0.05), 1.36/sqrt(len(data))
    m: Boolean
        KS-test result: Accept H0 (True); reject H0 (False).
        False whenever ``d < cv`` does not hold, including ``d == cv``.
    """
    # get CDF of empirical data
    counts, bin_edges = np.histogram(data, bins=len(data), density=True)
    cumulative = np.cumsum(counts)
    ecdf = cumulative / cumulative[-1]

    # fit data to the model, using the centres of the 'auto' histogram bins
    y, edges = np.histogram(data, bins='auto', density=True)
    centres = (edges[:-1] + edges[1:]) / 2.0
    popt, pcov = curve_fit(pdf, centres, y, p0=guess_args)

    # compute CDF from the fitted PDF
    cdf_scaled = np.cumsum(pdf(bin_edges[1:], *popt))
    cdf = cdf_scaled / cdf_scaled[-1]

    # perform KS-test
    d = np.max(np.abs(cdf - ecdf))
    cv = 1.36 / np.sqrt(len(data))

    # A single comparison always defines the verdict; two separate `if`s left
    # it unassigned when d == cv or d was NaN.
    m = bool(d < cv)

    return [d, cv, m]

### Null test

Step 1: Construct two null tests:

* null test #1: diff1 = high-delay signals in power2 - high-delay signals in power1
* null test #2: diff2 = high-delay signals in power2 - high-delay signals in power3
    
Step 2: Fit Gaussian distribution and the $\Delta\mathcal{CNN}$ distribution with differences 1 & 2.
    
Step 3: Perform KS goodness-of-fit test for each fit.

In [None]:
# NOTE: this re-defines the ks_test of an earlier cell; once this cell has run,
# this definition is the one in effect.
def ks_test(data, pdf, guess_args):
    """
    Fit `pdf` to `data` and run an approximate KS goodness-of-fit test.

    H0: the data follow a given distribution
    d < critical value (cv)  -->> accept H0

    The empirical CDF is approximated by the cumulative histogram of `data`
    with ``len(data)`` bins. The model CDF is the cumulative sum of the
    fitted `pdf` sampled at the right edges of those bins. Both are
    normalised to end at 1.

    Parameters
    ----------
    data: 1d array
        empirical data 
    pdf: callable
        probability density function, called as ``pdf(x, *params)``
    guess_args: float, 1d array
        pdf parameters guess (passed to ``curve_fit`` as ``p0``)
        
    Returns
    -------
    list ``[d, cv, m]`` with
    d: float
        KS-test stats (largest absolute gap between the two CDFs)
    cv: float
        critical value (Significance level:  α = 0.05), 1.36/sqrt(len(data))
    m: Boolean
        KS-test result: Accept H0 (True); reject H0 (False).
        False whenever ``d < cv`` does not hold, including ``d == cv``.
    """
    # get CDF of empirical data
    counts, bin_edges = np.histogram(data, bins=len(data), density=True)
    cumulative = np.cumsum(counts)
    ecdf = cumulative / cumulative[-1]

    # fit data to the model, using the centres of the 'auto' histogram bins
    y, edges = np.histogram(data, bins='auto', density=True)
    centres = (edges[:-1] + edges[1:]) / 2.0
    popt, pcov = curve_fit(pdf, centres, y, p0=guess_args)

    # compute CDF from the fitted PDF
    cdf_scaled = np.cumsum(pdf(bin_edges[1:], *popt))
    cdf = cdf_scaled / cdf_scaled[-1]

    # perform KS-test
    d = np.max(np.abs(cdf - ecdf))
    cv = 1.36 / np.sqrt(len(data))

    # A single comparison always defines the verdict; two separate `if`s left
    # it unassigned when d == cv or d was NaN.
    m = bool(d < cv)

    return [d, cv, m]

# NOTE: null_pdf and null_cdf are also defined in an earlier cell; the
# definitions below replace them once this cell has run.
def null_pdf(x, s):
    """
    Evaluate the delta CNN probability density.

    Computes (1 / (2 s^2)) * exp(-2 |x| / s) * (s + 2 |x|), which is
    symmetric about x = 0.

    Parameters
    ----------
    x : float or array
        Point(s) at which to evaluate the density.
    s : float
        Scale parameter of the distribution.

    Returns
    -------
    float or array
        Density at `x`, broadcast against `s`.
    """
    abs_x = np.abs(x)
    return (1/(2*s**2))*np.exp(-2*abs_x/(s))*(s+2*abs_x)

def null_cdf(x, sig, range_start, range_end):
    """
    Evaluate the delta CNN CDF, truncated to [range_start, range_end].

    Each value is the numerical integral of `null_pdf` from `range_start`
    to the sample, divided by the integral over the whole range, so the
    result is 0 at `range_start` and 1 at `range_end`.

    Parameters
    ----------
    x : float or 1d array-like
        Point(s) at which to evaluate the CDF. A scalar is treated as a
        one-element array.
    sig : float
        Scale parameter passed to `null_pdf`.
    range_start, range_end : float
        Integration limits defining the truncation interval.

    Returns
    -------
    1d array
        CDF values, one per entry of `x`.
    """
    density = lambda k: null_pdf(k, sig)
    # The normalisation does not depend on the sample, so integrate it once
    # instead of once per point.
    total = integrate.quad(density, range_start, range_end)[0]
    return np.array([integrate.quad(density, range_start, value)[0] / total
                     for value in np.atleast_1d(x)])

def gaussian(x, mu, sig):
    """
    Evaluate a Gaussian profile exp(-(x - mu)^2 / (2 sig^2)).

    NOTE: no 1/(sig*sqrt(2*pi)) prefactor is applied, so the peak value is 1
    and the profile does not integrate to 1. `gaus_cdf` normalises by an
    integral ratio, so it is unaffected.

    Parameters
    ----------
    x : float or array
        Point(s) at which to evaluate the profile.
    mu : float
        Mean.
    sig : float
        Standard deviation.

    Returns
    -------
    float or array
        Unnormalised Gaussian evaluated at `x`.
    """
    return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

# NOTE: gaus_cdf is also defined in an earlier cell (built on gaus_pdf); this
# definition replaces it once this cell has run.
def gaus_cdf(x, mu, sig, range_start, range_end):
    """
    Evaluate the Gaussian CDF, truncated to [range_start, range_end].

    Each value is the numerical integral of `gaussian` from `range_start`
    to the sample, divided by the integral over the whole range.

    Parameters
    ----------
    x : float or 1d array-like
        Point(s) at which to evaluate the CDF. A scalar is treated as a
        one-element array.
    mu, sig : float
        Mean and standard deviation passed to `gaussian`.
    range_start, range_end : float
        Integration limits defining the truncation interval.

    Returns
    -------
    1d array
        CDF values, one per entry of `x`.
    """
    density = lambda k: gaussian(k, mu, sig)
    # The normalisation does not depend on the sample, so integrate it once
    # instead of once per point.
    total = integrate.quad(density, range_start, range_end)[0]
    return np.array([integrate.quad(density, range_start, value)[0] / total
                     for value in np.atleast_1d(x)])

In [None]:
def _empirical_cdf(sample):
    """Return (bin_edges, cdf) of the 'auto'-binned cumulative histogram of `sample`, with cdf[-1] == 1."""
    counts, bin_edges = np.histogram(sample, bins='auto', density=True)
    cumulative = np.cumsum(counts)
    return bin_edges, cumulative / cumulative[-1]


def null_test(dly1, dly2, avg='no'):
    """
    Perform null tests and KS tests. Plot CDFs of the data and the 
    fitted distribution. Plot KS test statistics of fittings.

    Two differences are formed inside the selected delay window, using the
    notebook-level `power` dictionary:
        diff[0] = power[1] - power[0]   (labelled P_blp1 - P_blp2)
        diff[1] = power[1] - power[2]   (labelled P_blp1 - P_blp3)
    Each difference is fitted with a Gaussian and with the delta CNN
    distribution, and a KS statistic is computed for every fit against a CDF
    truncated to the range of the difference sample.

    Relies on the notebook globals `dlys`, `power`, `key`, `null_pdf`,
    `null_cdf` and `gaus_cdf`. Saves the panel figure to 'null_test_sub.png'.
    
    Parameters
    ----------
    dly1 : float
        Initial delay (ns).
    dly2 : float
        Final delay (ns).
    avg : string, optional
        How the delay window is reduced before differencing. The window is
        ``spectrum[:, i1:i2]``, whose second axis is the delay axis.
        'no' (default) : no averaging, every sample in the window is used
        'time' : mean over axis 1, one value per row of the window
        'delay' : mean over axis 0, one value per delay bin in the window
        (axis 0 is presumably the time axis -- confirm against hera_pspec.)

    Raises
    ------
    ValueError
        If `avg` is not one of the supported options, or if the two delays
        map to an empty window of `dlys`.
    """
    if avg not in ('no', 'delay', 'time'):
        raise ValueError("avg must be 'no', 'time' or 'delay', got {!r}".format(avg))

    # get index of selected delay range in dlys array
    dly_modes = [dly1, dly2] # ns
    dly_idx = [(np.abs(dlys - dly)).argmin() for dly in dly_modes]
    if dly_idx[0] >= dly_idx[1]:
        raise ValueError("delay range [{}, {}] ns selects no delay bins".format(dly1, dly2))

    def select(spectrum):
        # restrict to the delay window, then reduce according to `avg`
        window = spectrum[:, dly_idx[0]:dly_idx[1]]
        if avg == 'delay':
            return np.mean(window, axis=0)
        if avg == 'time':
            return np.mean(window, axis=1)
        return window.flatten()

    # Perform null test: power2 - power1 and power2 - power3
    reference = select(power[1])
    diff = [reference - select(power[0]), reference - select(power[2])]
    diff_labels = ['$P_{blp1}-P_{blp2}$', '$P_{blp1}-P_{blp3}$']

    # Fit the differences with gaussian distribution
    norm_popt = [norm.fit(sample) for sample in diff]
    # Perform KS-test to see if the null test results are Gaussian distributed.
    # The stored fit parameters are reused rather than re-fitting inside the CDF.
    norm_dn = []
    for sample, (mu, sigma) in zip(diff, norm_popt):
        lo, hi = min(sample), max(sample)
        norm_dn.append(stats.kstest(sample, lambda x: gaus_cdf(x, mu, sigma, lo, hi))[0])

    # Fit the differences with complex double gaussian difference distribution
    s = []
    for sample in diff:
        y, edges = np.histogram(sample, bins='auto', density=True)
        centres = (edges[:-1] + edges[1:]) / 2.0
        popt, pcov = curve_fit(null_pdf, centres, y, p0=np.std(sample))
        # keep a plain scalar so the quad integrand in null_cdf returns a
        # scalar instead of a one-element array
        s.append(popt[0])

    # Perform KS-test to see if the null test results are Delta_{CNN} distributed
    dn = []
    for sample, scale in zip(diff, s):
        lo, hi = min(sample), max(sample)
        dn.append(stats.kstest(sample, lambda x: null_cdf(x, scale, lo, hi))[0])

    # Compute the critical value at the 5% level
    cv = 1.36/np.sqrt(len(diff[0]))

    # Plot null-test panels. The top two rows are in delay (ns) and the bottom
    # two rows are in Delta P units, so the x-axes must NOT be shared.
    fig, axes = plt.subplots(4, 2, figsize=(12, 14))
    plt.subplots_adjust(hspace=.5, wspace=.7)
    panels = axes.flatten()

    # Rows 1-2: power spectra, with a box marking the tested delay range.
    # (index into `power`/`key`, baseline-pair number shown in the title)
    box = [3e6, 3e12]
    spectra_layout = [(1, 1), (1, 1), (0, 2), (2, 3)]
    for ax, (idx, blp_number) in zip(panels[:4], spectra_layout):
        ax.plot(dlys, np.abs(power[idx].T))
        ax.set_yscale('log')
        ax.grid()
        ax.vlines(dly_modes[0], box[0], box[1])
        ax.vlines(dly_modes[1], box[0], box[1])
        ax.hlines(box[0], dly_modes[0], dly_modes[1])
        ax.hlines(box[1], dly_modes[0], dly_modes[1])
        ax.set_xlabel("$τ$ [ns]", fontsize=14)
        ax.set_ylabel(r"$P(k)\ \rm [mK^2\ h^{-3}\ Mpc^3]$", fontsize=14)
        ct = [key[idx][0], blp_number, key[idx][1], key[idx][2]]
        ax.set_title("spw : {}, blp{} : {}, pol : {}".format(*ct), fontsize=14)

    # Rows 3-4: empirical CDF of each difference against the fitted CDFs.
    # Raw strings keep the LaTeX backslashes without invalid-escape warnings.
    delta_xlabel = r"Re{$\Delta P(k)\}\ \rm [mK^2\ h^{-3}\ Mpc^3]$"
    for col, tag in enumerate(["(a)", "(b)"]):
        sample = diff[col]
        bin_edges, ecdf = _empirical_cdf(sample)

        # row 3: fitted normal distribution
        ax = axes[2, col]
        ax.plot(bin_edges[1:], ecdf, label=diff_labels[col])
        ordered = np.sort(sample)
        ax.plot(ordered, norm.cdf(ordered, *norm_popt[col]), '--', label='Fit')
        ax.grid()
        ax.legend(loc='upper left', fontsize=12)
        ax.set_xlabel(delta_xlabel, fontsize=14)
        ax.set_ylabel(r"$P_\mathcal{N}$ (Re{$P(k)$})", fontsize=14)
        ax.set_title(r"Fit $\mathcal{N}$ to differences", fontsize=14)

        # row 4: fitted delta CNN distribution, accumulated over bin centres
        ax = axes[3, col]
        ax.plot(bin_edges[1:], ecdf, label=diff_labels[col])
        centres = (bin_edges[:-1] + bin_edges[1:]) / 2.0
        cdf_fit = np.cumsum(null_pdf(centres, s[col]))
        ax.plot(bin_edges[1:], cdf_fit/cdf_fit[-1], '--', label='Fit')
        ax.grid()
        ax.legend(loc='upper left', fontsize=12)
        ax.set_xlabel(delta_xlabel + "\n\n" + tag, fontsize=14)
        ax.set_ylabel(r"$P_{\Delta\mathcal{CNN}}$ (Re{$P(k)$})", fontsize=14)
        ax.set_title(r"Fit $\Delta\mathcal{CNN}$ to differences", fontsize=14)

    fig.savefig('null_test_sub.png')
    plt.show()

    # plotting KS test statistics
    plt.figure(figsize=[6.4,3.4])
    plt.plot([1,2], [norm_dn[0], norm_dn[1]], 'x', label=r'$\mathcal{N}$ fit')
    plt.plot([1,2], [dn[0], dn[1]], 'x', label=r'$\Delta\mathcal{CNN}$ fit')
    plt.plot([0.5, 2.5], [cv, cv], 'k--', label='critical value ($α$=5%)')
    plt.legend(loc=0, fontsize=12)
    plt.grid()
    plt.xticks([1, 2], diff_labels, fontsize=14)
    plt.ylabel('test statistic', fontsize=14)
    plt.title('KS test', fontsize=14)

In [None]:
# Run the null test on the 2000-4000 ns delay window, without averaging (avg='no' default)
null_test(2000, 4000)