# Averaging Kolmogorov-Smirnov test statistics by baselines

This notebook computes the averaged Kolmogorov-Smirnov test (KS test) statistics and plots the averaged KS stats as a function of sample size. 

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from pyuvdata import UVData
import hera_pspec as hp
from scipy import stats
from scipy.optimize import curve_fit
import operator
from scipy import integrate, stats
from scipy.stats import norm

In [None]:
# Load data into UVData objects
# `dfile` is a bare file name, so it is resolved relative to the notebook's
# working directory.
dfile = 'zen.2458101.clean-002.uvh5'
uvd = UVData()
# Populate `uvd` from the file; later cells read visibilities, flags, times
# and frequencies from this object.
uvd.read(dfile)

In [None]:
# Build an index -> (ant1, ant2, 'xx') lookup for every antenna pair in the data.
# get_antpairs() is evaluated once up front instead of being re-evaluated in
# the loop condition and twice more per iteration.
antpairs = uvd.get_antpairs()
bls = {i: (pair[0], pair[1], 'xx') for i, pair in enumerate(antpairs)}

In [None]:
# Spectral window given as [first channel, one-past-last channel]
chan = [520, 690]

# The same time/frequency selection is applied to the data and to the flags
time_sel = slice(16, 46)
freq_sel = slice(chan[0], chan[1])

# Visibility dictionary keyed by baseline for this frequency range; each
# sample is multiplied by the inverted flag array, so flagged samples become 0
# (assuming boolean flags).
vis = {}
for i in range(len(bls)):
    bl = bls[i]
    selected_data = uvd.get_data([bl])[time_sel, freq_sel]
    selected_unflagged = ~uvd.get_flags([bl])[time_sel, freq_sel]
    vis[bl] = selected_data * selected_unflagged

In [None]:
# Keep only the baselines whose first element (v[0][0]) is non-zero.
# NOTE(review): only that single sample is inspected, not the whole array.
vis = {bl: v for bl, v in vis.items() if v[0][0] != 0}
# Baseline list matching the filtered dictionary
bls = list(vis)

# Baselines of the cross-correlation visibilities: skip every key whose two
# antenna numbers are equal (auto-correlations).
non_auto_bls = [bl for bl in bls if bl[0] != bl[1]]

In [None]:
# Load beam model
# `beamfile` is a bare file name, resolved relative to the working directory.
beamfile = 'HERA_NF_dipole_power.beamfits'
cosmo = hp.conversions.Cosmo_Conversions()
uvb = hp.pspecbeam.PSpecBeamUV(beamfile, cosmo=cosmo)

# find conversion factor from Jy to mK
# One factor per unique frequency in the data, for the 'xx' polarization.
Jy_to_mK = uvb.Jy_to_mK(np.unique(uvd.freq_array), pol='xx')
# WARNING: this scales uvd.data_array in place, so re-running this cell without
# reloading `uvd` applies the factor again. The factor is broadcast along the
# third axis of data_array (presumably the frequency axis -- confirm).
# The `vis` dictionary built in an earlier cell is not rescaled by this line.
uvd.data_array *= Jy_to_mK[None, None, :, None]

In [None]:
# Build two interleaved data sets offset by one integration: every second
# unique time starting at index 16 (uvd1) and at index 17 (uvd2).
unique_times = np.unique(uvd.time_array)
uvd1 = uvd.select(times=unique_times[16:44:2], inplace=False)
uvd2 = uvd.select(times=unique_times[17:45:2], inplace=False)

In [None]:
# Create a new PSpecData object
# It holds the two interleaved time selections (no weights supplied) together
# with the beam model loaded above.
ds = hp.PSpecData(dsets=[uvd1, uvd2], wgts=[None, None], beam=uvb)
# NOTE(review): presumably rephases the second data set onto the first one's
# phase centre so the two can be cross-multiplied -- confirm against the
# hera_pspec documentation.
ds.rephase_to_dset(0)

The following cell creates arrays of indices into the `non_auto_bls` array in order to select baselines with prominent cable-reflection features in their power spectra. The indices were identified manually (by eye) from all cross-correlation auto-baseline power spectra in the data set 'zen.2458101.clean-002.uvh5'. 

In [None]:
# Indices into non_auto_bls of baselines with clear sys. features in their pspec
idx_ref = [33, 35, 36, 37, 47, 48, 49, 72, 73, 74, 75]

# idx_ctr ('index_control') is a control group of baselines without clear sys.
# features in their pspec: the lowest-numbered indices that are not in idx_ref,
# as many as there are entries in idx_ref.
# The set difference is sorted before truncation because set iteration order
# is not guaranteed; without sorting, which baselines land in the control
# group would depend on an implementation detail.
idx_ctr = sorted(set(range(len(non_auto_bls))) - set(idx_ref))[:len(idx_ref)]

In [None]:
# Baseline lists for the cross-correlation visibilities, picked out of
# non_auto_bls by index.
#   ref = reflection -> with clear sys. features in pspec
#   ctr = control    -> without clear sys. features in pspec
c_bls_ref = [non_auto_bls[idx] for idx in idx_ref]
c_bls_ctr = [non_auto_bls[idx] for idx in idx_ctr]

In [None]:
# Estimate one power spectrum per baseline in c_bls_ref (the baselines with
# clear sys. features). power_ref[i] is the real part of the estimated spectrum
# and key_ref[i] the key used to fetch it from the pspec result.
power_ref = {}
key_ref = {}
for i, bl in enumerate(c_bls_ref):
    antpair = bl[:2]
    # the baseline is paired with itself; (0, 1) selects the two data sets in `ds`
    uvp = ds.pspec([antpair], [antpair], (0, 1), [('xx', 'xx')],
                   spw_ranges=[(520, 690)],
                   input_data_weight='identity',
                   norm='I', taper='blackman-harris', verbose=False)
    # presumably (spectral window index, baseline pair, polarization) -- confirm
    spec_key = (0, (antpair, antpair), 'xx')
    key_ref[i] = spec_key
    power_ref[i] = np.real(uvp.get_data(spec_key))

In [None]:
# Estimate one power spectrum per baseline in c_bls_ctr (the control baselines,
# i.e. WITHOUT clear sys. features). power_ctr[i] is the real part of the
# estimated spectrum and key_ctr[i] the key used to fetch it from the pspec result.
power_ctr = {}
key_ctr = {}
for i, bl in enumerate(c_bls_ctr):
    antpair = bl[:2]
    # the baseline is paired with itself; (0, 1) selects the two data sets in `ds`
    uvp = ds.pspec([antpair], [antpair], (0, 1), [('xx', 'xx')],
                   spw_ranges=[(520, 690)],
                   input_data_weight='identity',
                   norm='I', taper='blackman-harris', verbose=False)
    # presumably (spectral window index, baseline pair, polarization) -- confirm
    spec_key = (0, (antpair, antpair), 'xx')
    key_ctr[i] = spec_key
    power_ctr[i] = np.real(uvp.get_data(spec_key))

In [None]:
# get delay modes
# `uvp` is whatever the most recently executed pspec loop left behind, so this
# cell must run after the power-spectrum cells. The result is scaled by 1e9
# (presumably seconds -> nanoseconds; the KS-test helpers below treat `dlys`
# as ns -- confirm against hera_pspec).
dlys = uvp.get_dlys(0) * 1e9

In [None]:
# Visual check: one small semilog figure per entry of power_ref showing the
# absolute value of the time-averaged (axis 0) spectrum against delay, titled
# with the entry's index.
for idx in range(len(power_ref)):
    plt.figure(figsize=(4, 3))
    time_avg = np.mean(power_ref[idx], axis=0)
    plt.semilogy(dlys, np.abs(time_avg))
    plt.title(str(idx))
    plt.show()

In [None]:
# Group the per-baseline results by sample:
#   0 -> control baselines (no clear sys. features), 1 -> reflection baselines
power = {0:power_ctr, 1:power_ref}
key = {0:key_ctr, 1:key_ref}

Define PDF and CDF of models ($\mathcal{CNN}$, $\Delta\mathcal{CNN}$ and $\mathcal{N}$)

In [None]:
def real_pdf(z, s):
    """Evaluate the model density (1/s) * exp(-2|z|/s).

    Parameters
    ----------
    z : float or ndarray
        Point(s) at which to evaluate the density.
    s : float
        Scale parameter.

    Returns
    -------
    float or ndarray
        Density value(s), same shape as `z`.
    """
    a = 1/(s)
    b = (-np.abs(2*z))/(s)
    return a*np.exp(b)

def real_cdf(x, s, range_start, range_end):
    """CDF of `real_pdf` truncated (renormalised) to [range_start, range_end].

    Each value is the numerical integral of the density from `range_start` to
    that value, divided by the integral over the whole range.

    Parameters
    ----------
    x : iterable of float
        Points at which to evaluate the CDF.
    s : float
        Scale parameter forwarded to `real_pdf`.
    range_start, range_end : float
        Integration limits defining the truncated support.

    Returns
    -------
    ndarray
        One CDF value per element of `x`.
    """
    def pdf(k):
        return real_pdf(k, s)

    # The normalisation does not depend on the evaluation point, so it is
    # integrated once instead of once per sample.
    total = integrate.quad(pdf, range_start, range_end)[0]
    return np.array([integrate.quad(pdf, range_start, value)[0] / total
                     for value in x])

def null_pdf(x, s):
    """Evaluate the model density (1/(2 s^2)) * exp(-2|x|/s) * (s + 2|x|).

    Parameters
    ----------
    x : float or ndarray
        Point(s) at which to evaluate the density.
    s : float
        Scale parameter.

    Returns
    -------
    float or ndarray
        Density value(s), same shape as `x`.
    """
    return (1/(2*s**2))*np.exp(-2*np.abs(x)/(s))*(s+2*np.abs(x))

def null_cdf(x, s, range_start, range_end):
    """CDF of `null_pdf` truncated (renormalised) to [range_start, range_end].

    Each value is the numerical integral of the density from `range_start` to
    that value, divided by the integral over the whole range.

    Parameters
    ----------
    x : iterable of float
        Points at which to evaluate the CDF.
    s : float
        Scale parameter forwarded to `null_pdf`.
    range_start, range_end : float
        Integration limits defining the truncated support.

    Returns
    -------
    ndarray
        One CDF value per element of `x`.
    """
    def pdf(k):
        return null_pdf(k, s)

    # The normalisation does not depend on the evaluation point, so it is
    # integrated once instead of once per sample.
    total = integrate.quad(pdf, range_start, range_end)[0]
    return np.array([integrate.quad(pdf, range_start, value)[0] / total
                     for value in x])

def gaussian(x, mu, sig):
    """Evaluate the unnormalised Gaussian exp(-(x - mu)^2 / (2 sig^2)).

    Parameters
    ----------
    x : float or ndarray
        Point(s) at which to evaluate the curve.
    mu : float
        Centre of the Gaussian.
    sig : float
        Standard deviation of the Gaussian.

    Returns
    -------
    float or ndarray
        Curve value(s); the peak value is 1 (no 1/(sig*sqrt(2*pi)) prefactor).
    """
    return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

def gaus_cdf(x, mu, sig, range_start, range_end):
    """CDF of `gaussian` truncated (renormalised) to [range_start, range_end].

    Each value is the numerical integral of the curve from `range_start` to
    that value, divided by the integral over the whole range, so the missing
    prefactor of `gaussian` cancels.

    Parameters
    ----------
    x : iterable of float
        Points at which to evaluate the CDF.
    mu, sig : float
        Gaussian parameters forwarded to `gaussian`.
    range_start, range_end : float
        Integration limits defining the truncated support.

    Returns
    -------
    ndarray
        One CDF value per element of `x`.
    """
    def pdf(k):
        return gaussian(k, mu, sig)

    # The normalisation does not depend on the evaluation point, so it is
    # integrated once instead of once per sample.
    total = integrate.quad(pdf, range_start, range_end)[0]
    return np.array([integrate.quad(pdf, range_start, value)[0] / total
                     for value in x])

In [None]:
def ks_test(dly1, dly2, p, avg='no', fit_norm=False):
    """KS goodness-of-fit test of delay-selected power against the CNN model.

    The selected samples are histogrammed (15 bins), `real_pdf` is fitted to
    the histogram with `curve_fit`, and the samples are KS-tested against the
    fitted `real_cdf` truncated to the sample range. Optionally the same is
    done for a Gaussian fitted with `norm.fit`.

    Reads the notebook-level `dlys` array to translate delays into indices.

    Parameters
    ----------
    dly1, dly2 : float
        Lower and upper delay, in the units of `dlys`. Each is snapped to the
        nearest entry of `dlys`; the upper index is excluded from the slice.
    p : 2-D ndarray
        Power values indexed as p[:, delay_index].
    avg : {'no', 'time', 'delay'}
        'no' flattens the selection, 'time' averages over axis 0 and 'delay'
        averages over axis 1.
    fit_norm : bool
        Also fit and test a Gaussian when True.

    Returns
    -------
    list
        [cnn_dn, cv] or, when `fit_norm` is True, [cnn_dn, cv, norm_dn], where
        the *_dn entries are KS statistics and cv is the critical value.

    Raises
    ------
    ValueError
        If `avg` is not one of the supported options or the delay range
        selects no samples.
    """
    # get delay mode index
    lo = np.abs(dlys - dly1).argmin()
    hi = np.abs(dlys - dly2).argmin()
    window = p[:, lo:hi]

    # get high-delay power
    if avg == 'no':
        data = window.flatten()
    elif avg == 'time':
        data = np.mean(window, axis=0)
    elif avg == 'delay':
        data = np.mean(window, axis=1)
    else:
        # previously an unknown option surfaced later as a NameError on `data`
        raise ValueError("avg must be 'no', 'time' or 'delay', got %r" % (avg,))

    if data.size == 0:
        raise ValueError("delay range [%r, %r) selects no samples" % (dly1, dly2))

    # support of the truncated model CDFs, computed once
    data_min, data_max = data.min(), data.max()

    # KS test (norm)
    if fit_norm:
        mu, sigma = norm.fit(data)
        norm_dn = stats.kstest(
            data, lambda v: gaus_cdf(v, mu, sigma, data_min, data_max))[0]

    # KS test (cnn): fit the model density to the histogram bin centres
    density, edges = np.histogram(data, bins=15, density=True)
    centers = (edges[:-1] + edges[1:]) / 2.0
    popt = curve_fit(real_pdf, centers, density, p0=np.std(data))[0]
    # curve_fit returns a length-1 array; pass the scalar so the integrand
    # handed to quad returns a plain float rather than a 1-element array
    scale = popt[0]
    cnn_dn = stats.kstest(
        data, lambda v: real_cdf(v, scale, data_min, data_max))[0]

    # get KS test critical value at the 5% level of significance
    # (1.36/sqrt(n) is the large-sample approximation)
    cv = 1.36/np.sqrt(len(data))

    if fit_norm:
        return [cnn_dn, cv, norm_dn]
    return [cnn_dn, cv]

def null_ks_test(dly1, dly2, p1, p2, avg='no', fit_norm=False):
    """KS goodness-of-fit test of a null test (p1 - p2) against the delta-CNN model.

    The same delay selection is applied to `p1` and `p2`, the two selections
    are differenced, the differences are histogrammed (15 bins), `null_pdf` is
    fitted to the histogram with `curve_fit`, and the differences are KS-tested
    against the fitted `null_cdf` truncated to the sample range. Optionally
    the same is done for a Gaussian fitted with `norm.fit`.

    Reads the notebook-level `dlys` array to translate delays into indices.

    Parameters
    ----------
    dly1, dly2 : float
        Lower and upper delay, in the units of `dlys`. Each is snapped to the
        nearest entry of `dlys`; the upper index is excluded from the slice.
    p1, p2 : 2-D ndarray
        Power values indexed as p[:, delay_index]; the test uses p1 - p2.
    avg : {'no', 'time', 'delay'}
        'no' flattens the selection, 'time' averages over axis 0 and 'delay'
        averages over axis 1.
    fit_norm : bool
        Also fit and test a Gaussian when True.

    Returns
    -------
    list
        [dcnn_dn, cv] or, when `fit_norm` is True, [dcnn_dn, cv, norm_dn],
        where the *_dn entries are KS statistics and cv is the critical value.

    Raises
    ------
    ValueError
        If `avg` is not one of the supported options or the delay range
        selects no samples.
    """
    # get delay mode index
    lo = np.abs(dlys - dly1).argmin()
    hi = np.abs(dlys - dly2).argmin()
    window1 = p1[:, lo:hi]
    window2 = p2[:, lo:hi]

    # get high-delay power
    if avg == 'no':
        data1 = window1.flatten()
        data2 = window2.flatten()
    elif avg == 'time':
        data1 = np.mean(window1, axis=0)
        data2 = np.mean(window2, axis=0)
    elif avg == 'delay':
        data1 = np.mean(window1, axis=1)
        data2 = np.mean(window2, axis=1)
    else:
        # previously an unknown option surfaced later as a NameError on `data1`
        raise ValueError("avg must be 'no', 'time' or 'delay', got %r" % (avg,))

    # null test
    data = data1 - data2

    if data.size == 0:
        raise ValueError("delay range [%r, %r) selects no samples" % (dly1, dly2))

    # support of the truncated model CDFs, computed once
    data_min, data_max = data.min(), data.max()

    # KS test (norm)
    if fit_norm:
        mu, sigma = norm.fit(data)
        norm_dn = stats.kstest(
            data, lambda v: gaus_cdf(v, mu, sigma, data_min, data_max))[0]

    # KS test (delta cnn): fit the model density to the histogram bin centres
    density, edges = np.histogram(data, bins=15, density=True)
    centers = (edges[:-1] + edges[1:]) / 2.0
    popt = curve_fit(null_pdf, centers, density, p0=np.std(data))[0]
    # curve_fit returns a length-1 array; pass the scalar so the integrand
    # handed to quad returns a plain float rather than a 1-element array
    scale = popt[0]
    dcnn_dn = stats.kstest(
        data, lambda v: null_cdf(v, scale, data_min, data_max))[0]

    # get KS test critical value at the 5% level of significance
    # (1.36/sqrt(n) is the large-sample approximation)
    cv = 1.36/np.sqrt(len(data))

    if fit_norm:
        return [dcnn_dn, cv, norm_dn]
    return [dcnn_dn, cv]

### Averaging Kolmogorov-Smirnov test statistics for $\mathcal{CNN}$ fitting with fixed data size

Perform KS test for each power spectrum

In [None]:
# KS test (CNN and Gaussian fits) on the 2000-4000 delay window of every
# control power spectrum, keyed by the spectrum's index.
ks_stats_ctr = {
    i: ks_test(2000, 4000, power_ctr[i], fit_norm=True)
    for i in range(len(power_ctr))
}

In [None]:
# KS test (CNN and Gaussian fits) on the 2000-4000 delay window of every
# reflection power spectrum, keyed by the spectrum's index.
ks_stats_ref = {
    i: ks_test(2000, 4000, power_ref[i], fit_norm=True)
    for i in range(len(power_ref))
}

Compute the average value of 11$\times$2 KS stats and its standard error

In [None]:
# Stack the per-baseline ks_test results into (n_baselines, 3) arrays whose
# columns are [CNN KS statistic, critical value, Gaussian KS statistic].
# Column 2 exists because ks_test was called with fit_norm=True.
ks_arr_ctr = np.asarray(list(ks_stats_ctr.values()))
ks_arr_ref = np.asarray(list(ks_stats_ref.values()))

# The standard error of the mean is std / sqrt(N) (not std / N).

# for data from pspec without clear sys. features fitted with Gaussian
avg_norm_dn_ctr = np.mean(ks_arr_ctr[:, 2])
err_norm_dn_ctr = np.std(ks_arr_ctr[:, 2])/np.sqrt(len(ks_stats_ctr))

# for data from pspec with clear sys. features fitted with Gaussian
avg_norm_dn_ref = np.mean(ks_arr_ref[:, 2])
err_norm_dn_ref = np.std(ks_arr_ref[:, 2])/np.sqrt(len(ks_stats_ref))

# for data from pspec without clear sys. features fitted with CNN
avg_cnn_dn_ctr = np.mean(ks_arr_ctr[:, 0])
err_cnn_dn_ctr = np.std(ks_arr_ctr[:, 0])/np.sqrt(len(ks_stats_ctr))

# for data from pspec with clear sys. features fitted with CNN
avg_cnn_dn_ref = np.mean(ks_arr_ref[:, 0])
err_cnn_dn_ref = np.std(ks_arr_ref[:, 0])/np.sqrt(len(ks_stats_ref))

# critical value (alpha = 5%), taken from the first control baseline
cv = ks_arr_ctr[0, 1]

Plot the results

In [None]:
# Averaged KS statistics for the Gaussian and CNN fits, reflection group at
# x=1 and control group at x=2, with the 5% critical value as a dashed line.
plt.figure(figsize=(8, 4))
plt.plot([0.5, 2.5], [cv, cv], '--', label = 'critical value ($α$=5%)')
plt.errorbar([1, 2], [avg_norm_dn_ref, avg_norm_dn_ctr], [err_norm_dn_ref, err_norm_dn_ctr], 
            color = 'green', fmt = '.',markersize = '4', ecolor = 'green', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = r'$\mathcal{N}$ fit')
# the CNN points carry the CNN standard errors (not the Gaussian ones)
plt.errorbar([1, 2], [avg_cnn_dn_ref, avg_cnn_dn_ctr], [err_cnn_dn_ref, err_cnn_dn_ctr], 
            color = 'red', fmt = '.',markersize = '4', ecolor = 'red', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = r'$\mathcal{CNN}$ fit')
plt.xticks([1, 2], ['with clear sys. features', 'without clear sys. features'], fontsize=14, rotation=0)
plt.grid(linestyle='dotted')
plt.legend(loc=5, fontsize=10)
plt.ylabel('Avg. KS test stats $D$', fontsize=14)
plt.title('KS tests for multiple pairs of baselines', fontsize=14)
plt.show()

### Averaging Kolmogorov-Smirnov test statistics for $\Delta\mathcal{CNN}$ fitting with fixed data size

We have:

* Group 1: 11 pspec with clear cable reflection features
* Group 2: 11 pspec without clear cable reflection features
* Group 3: the 11 pspec of Group 2, shifted by one index -- for example, if group2 = `[ps1, ps2, ps3]`, then group3 = `[ps2, ps3, ps1]`

Step 1: construct null tests by:

* the nth pspec (2000-3000ns) in Group 1 - the nth pspec (2000-3000ns) in Group 2 = differences 1
* the nth pspec (2000-3000ns) in Group 2 - the nth pspec (2000-3000ns) in Group 3 = differences 2

Step 2: Fit the Gaussian distribution and the $\Delta\mathcal{CNN}$ distribution with differences 1 & 2.

Step 3: Perform KS goodness-of-fit test for each fit.

Step 4: Average KS stats in each group and compute standard errors.

In [None]:
# create Group 3 by shifting the elements in the Group 2
from itertools import cycle, islice
def shift(d, n):
    """Return a dict with d's keys (in order) and d's values rotated by n.

    The values are cycled endlessly and the first `n` are skipped, so the
    i-th key receives the value originally at position (i + n) modulo len(d).
    """
    rotated_values = islice(cycle(d.values()), n, None)
    return {k: v for k, v in zip(d, rotated_values)}

# Group 3: the control spectra with their values rotated by one position, so
# power_ctr2[i] holds the spectrum that power_ctr stores under the next key
# (wrapping around at the end).
power_ctr2 = shift(power_ctr, 1)

In [None]:
# Null + KS tests between Group 2 and Group 3 (control minus shifted control):
# power_ctr[i] - power_ctr2[i] over the 2000-3000 delay window.
null_ks_ctr = {
    i: null_ks_test(2000, 3000, power_ctr[i], power_ctr2[i], avg='no', fit_norm=True)
    for i in range(len(power_ctr))
}

In [None]:
# Null + KS tests between Group 1 and Group 2 (reflection vs control):
# power_ctr[i] - power_ref[i] over the 2000-3000 delay window.
null_ks_ref = {
    i: null_ks_test(2000, 3000, power_ctr[i], power_ref[i], avg='no', fit_norm=True)
    for i in range(len(power_ref))
}

In [None]:
# Stack the per-baseline null_ks_test results into (n_baselines, 3) arrays
# whose columns are [delta-CNN KS statistic, critical value, Gaussian KS
# statistic]. Column 2 exists because null_ks_test was called with fit_norm=True.
#   null_ks_ctr: control minus shifted control   (Group 2 & 3)
#   null_ks_ref: control minus reflection        (Group 1 & 2)
null_arr_ctr = np.asarray(list(null_ks_ctr.values()))
null_arr_ref = np.asarray(list(null_ks_ref.values()))

# The standard error of the mean is std / sqrt(N) (not std / N).

# for null test between Group 2 & 3 fitted with Gaussian
avg_norm_null_ctr = np.mean(null_arr_ctr[:, 2])
err_norm_null_ctr = np.std(null_arr_ctr[:, 2])/np.sqrt(len(null_ks_ctr))

# for null test between Group 1 & 2 fitted with Gaussian
avg_norm_null_ref = np.mean(null_arr_ref[:, 2])
err_norm_null_ref = np.std(null_arr_ref[:, 2])/np.sqrt(len(null_ks_ref))

# for null test between Group 2 & 3 fitted with delta CNN
avg_cnn_null_ctr = np.mean(null_arr_ctr[:, 0])
err_cnn_null_ctr = np.std(null_arr_ctr[:, 0])/np.sqrt(len(null_ks_ctr))

# for null test between Group 1 & 2 fitted with delta CNN
avg_cnn_null_ref = np.mean(null_arr_ref[:, 0])
err_cnn_null_ref = np.std(null_arr_ref[:, 0])/np.sqrt(len(null_ks_ref))

# critical value (alpha = 5%), taken from the first control null test
cv = null_arr_ctr[0, 1]

In [None]:
# plot the results: averaged KS statistics of the null tests for the Gaussian
# and delta-CNN fits, with the 5% critical value as a dashed line.
# Mathtext labels are raw strings so that '\m' and '\D' are not parsed as
# (invalid) string escape sequences.
plt.figure(figsize=(8, 3))
plt.plot([0.5, 2.5], [cv, cv], '--', label = 'critical value ($α$=5%)')
plt.errorbar([1, 2], [avg_norm_null_ref, avg_norm_null_ctr], [err_norm_null_ref, err_norm_null_ctr], 
            color = 'green', fmt = '.',markersize = '4', ecolor = 'green', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = r'$\mathcal{N}$ fit')
plt.errorbar([1, 2], [avg_cnn_null_ref, avg_cnn_null_ctr], [err_cnn_null_ref, err_cnn_null_ctr], 
            color = 'red', fmt = '.',markersize = '4', ecolor = 'red', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = r'$\Delta\mathcal{CNN}$ fit')
plt.xticks([1, 2], ['with clear sys. features', 'without clear sys. features'], fontsize=14, rotation=0)
plt.grid(linestyle='dotted')
plt.legend(loc=5, fontsize=10)
plt.ylabel('Avg. KS test stats $D$', fontsize=14)
plt.title('Null tests + KS tests for multiple pairs of baselines', fontsize=14)
plt.show()

### Averaging Kolmogorov-Smirnov test statistics for $\mathcal{CNN}$ fitting for different data sizes

Repeat the above process for data selected from delay ranges of increasing width (from 2000 ns up to a varying upper delay).

In [None]:
# For a sequence of growing delay windows, run the CNN KS test on every control
# and reflection spectrum and record, per window: the window width, the mean
# KS statistic of each group, its standard error, and the critical value.
num_dlys = []
avg_ctr = []
err_ctr = []
avg_ref = []
err_ref = []
cv = []

# the window always starts at delay index 109; only its end moves
d_start = 109

for i in range(30):
    # get dlys range (corresponding to the sample size): the end index grows
    # by two delay channels per step
    d_end = 111+2*i
    num_dlys.append(dlys[d_end]-dlys[d_start])

    ks_stats_ctr = {j: ks_test(dlys[d_start], dlys[d_end], power_ctr[j])
                    for j in range(len(power_ctr))}
    ks_stats_ref = {j: ks_test(dlys[d_start], dlys[d_end], power_ref[j])
                    for j in range(len(power_ref))}

    # columns: [CNN KS statistic, critical value]
    ctr_arr = np.asarray(list(ks_stats_ctr.values()))
    ref_arr = np.asarray(list(ks_stats_ref.values()))

    # standard error of the mean is std / sqrt(N) (not std / N)
    avg_ctr.append(np.mean(ctr_arr[:, 0]))
    err_ctr.append(np.std(ctr_arr[:, 0])/np.sqrt(len(ks_stats_ctr)))

    avg_ref.append(np.mean(ref_arr[:, 0]))
    err_ref.append(np.std(ref_arr[:, 0])/np.sqrt(len(ks_stats_ref)))

    # critical value for this window, taken from the first control baseline
    cv.append(ctr_arr[0, 1])

In [None]:
# Averaged KS statistic versus delay-window width for both groups, on a log
# y-axis, with the critical value drawn as a dashed line.
errbar_style = dict(fmt = '.', markersize = '4', capsize = 2,
                    elinewidth = 0.5, markeredgewidth = 0.5)
series = (
    (avg_ctr, err_ctr, 'green', 'without clear sys. features'),
    (avg_ref, err_ref, 'red', 'with clear sys. features'),
)

plt.figure(figsize=(8, 4))
plt.plot(num_dlys, cv, '--', label = 'critical value ($α$=5%)')
for series_avg, series_err, series_color, series_label in series:
    plt.errorbar(num_dlys, series_avg, series_err, color = series_color,
                 ecolor = series_color, label = series_label, **errbar_style)
plt.grid(linestyle='dotted')
plt.legend(fontsize=10)
plt.yscale("log")
plt.xlabel('Delay width [ns]', fontsize=14)
plt.ylabel('Avg. KS test stats $D$', fontsize=14)
plt.title('KS tests for multiple pairs of baselines', fontsize=14)
plt.show()

### Averaging Kolmogorov-Smirnov test statistics for $\Delta\mathcal{CNN}$ fitting for different data sizes

In [None]:
# For a sequence of growing delay windows, run the null + delta-CNN KS test on
# every (control, shifted control) and (control, reflection) pair and record,
# per window: the window width, the mean KS statistic of each group, its
# standard error, and the critical value.
null_num_dlys = []
null_avg_ctr = []
null_err_ctr = []
null_avg_ref = []
null_err_ref = []
null_cv = []

# the window always starts at delay index 109; only its end moves
d_start = 109

for i in range(30):
    # get dlys range (corresponding to the sample size): the end index grows
    # by two delay channels per step
    d_end = 111+2*i
    null_num_dlys.append(dlys[d_end]-dlys[d_start])

    null_ks_ctr = {j: null_ks_test(dlys[d_start], dlys[d_end], power_ctr[j], power_ctr2[j])
                   for j in range(len(power_ctr))}
    null_ks_ref = {j: null_ks_test(dlys[d_start], dlys[d_end], power_ctr[j], power_ref[j])
                   for j in range(len(power_ref))}

    # columns: [delta-CNN KS statistic, critical value]
    null_ctr_arr = np.asarray(list(null_ks_ctr.values()))
    null_ref_arr = np.asarray(list(null_ks_ref.values()))

    # standard error of the mean is std / sqrt(N) (not std / N)
    null_avg_ctr.append(np.mean(null_ctr_arr[:, 0]))
    null_err_ctr.append(np.std(null_ctr_arr[:, 0])/np.sqrt(len(null_ks_ctr)))

    null_avg_ref.append(np.mean(null_ref_arr[:, 0]))
    null_err_ref.append(np.std(null_ref_arr[:, 0])/np.sqrt(len(null_ks_ref)))

    # critical value for this window, taken from the first control null test
    null_cv.append(null_ctr_arr[0, 1])

In [None]:
# Averaged null-test KS statistic versus delay-window width for both groups,
# on a log y-axis, with the critical value drawn as a dashed line.
plt.figure(figsize=(8, 4))
plt.plot(null_num_dlys, null_cv, '--', label = 'critical value ($α$=5%)')
plt.errorbar(null_num_dlys, null_avg_ctr, null_err_ctr, 
            color = 'green', fmt = '.',markersize = '4', ecolor = 'green', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = 'without clear sys. features')
plt.errorbar(null_num_dlys, null_avg_ref, null_err_ref, 
            color = 'red', fmt = '.',markersize = '4', ecolor = 'red', capsize = 2, 
            elinewidth = 0.5, markeredgewidth = 0.5, label = 'with clear sys. features')
plt.grid(linestyle='dotted')
plt.legend(fontsize=10)
plt.yscale("log")
plt.xlabel('Delay width [ns]', fontsize=14)
plt.ylabel('Avg. KS test stats $D$', fontsize=14)
# this figure shows the null-test results, so it is titled like the fixed-size
# null-test figure rather than like the plain KS-test figure
plt.title('Null tests + KS tests for multiple pairs of baselines', fontsize=14)
plt.show()