In [455]:
import numpy as np
from matplotlib import pyplot as plt
import logging
logging.basicConfig(level=logging.INFO)
import dask

In [478]:
def gen_params(lr=17, lb=19, nMonte=1) :
    """
    Generating experiment parameters
    
    
    """
    rr=np.concatenate([np.array([0.0]), np.linspace(0.1, 3, lr)]) 
    bb=np.linspace(0.45, 0.99, lb)
    
    N = 1e4
    nn = np.array([1e3, 1e4])
    ee = np.round(N ** (-bb),6)
    mm = np.round(np.sqrt(2*np.log(N) * rr),3)
    xx = np.array([0, .5])
    for itr in range(nMonte) :
        for n in nn :
            for eps in ee :
                for mu in mm :
                    for xi in xx :
                        yield {'itr' : itr, 'n' : n, 'N': N, 'ep' : eps, 'mu' : mu, 'xi' : xi} 


In [479]:
def sample_from_mixture(lmd0, lmd1, eps) :
    N = len(lmd0)
    idcs = np.random.rand(N) < eps
    #idcs = np.random.choice(np.arange(N), k)
    lmd = np.array(lmd0.copy())
    lmd[idcs] = np.array(lmd1)[idcs]
    return np.random.poisson(lam=lmd)

def power_law(n, xi) :
    p = np.arange(1.,n+1) ** (-xi)
    return p / p.sum()


In [484]:
from TwoSampleHC import two_sample_pvals, HC
import pandas as pd

def evaluate_iteration(n = 10, N = 10, ep = .1, mu = 1, xi = 0, metric = 'Hellinger') :
    logging.info(f"n={n}, N={N}, ep={ep}, mu={mu}, xi={xi}")
    P = power_law(N, xi)
    print(metric)
    
    if metric == 'Hellinger' :
      QP = (np.sqrt(P) + np.sqrt(mu))**2

    if metric == 'ChiSq' :
      QP = P + 2 * np.sqrt(P * mu)

    if metric == 'proportional' :
      QP = P *( 1 + r * np.log(N))

    if metric == 'power' :
      QP = P * (np.log(N) ** r)

    smp1 = sample_from_mixture(n*P, n*QP, ep)
    smp2 = sample_from_mixture(n*P, n*QP, ep)

    min_cnt = 0
    stbl = False
    gamma = 0.25

    pv = two_sample_pvals(smp1, smp2, randomize=True, sym=True)
    pv = pv[(smp1 == 0) | (smp2 == 0)]

    if len(pv) > 0 :
        hc, _ = HC(pv[pv < 1], stbl=stbl).HC(gamma=gamma)
        MinPv = -np.log(pv.min())
    else :
        print("empty")
        hc = np.nan
        MinPv = np.nan

    pv_NR = two_sample_pvals(smp1, smp2, randomize=False)
    pv_NR = pv_NR[(smp1 == 0) | (smp2 == 0)]
    
    if len(pv_NR) > 0 :
        hc_NR, _ = HC(pv_NR[pv_NR < 1], stbl=stbl).HC(gamma=gamma)
        MinPvNR = -np.log(pv_NR.min())
    else :
        print("empty")
        hc_NR = np.nan
        MinPvNR = np.nan

    return {'HC_NR' : hc_NR, 'minPv_NR' : MinPvNR,
            'HC' : hc, 'minPv' : MinPv}

In [485]:
#
df_sm = pd.DataFrame(gen_params(4, 3))
res_sm = df_sm[.apply(lambda row : evaluate_iteration(n=row['n'], N=row['N'], ep=row['ep'],
                                                      mu=row['mu'], xi=row['xi'], metric='Hellinger'
                                                      ), axis=1)

0.0


UnboundLocalError: local variable 'QP' referenced before assignment

In [465]:
import dask.dataframe as dd
from dask.distributed import Client
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 56522 instead


In [466]:
df = pd.DataFrame(gen_params(27, 29))
ddf = dd.from_pandas(df, npartitions=4)

In [467]:
# compute
x = ddf.apply(lambda row : evaluate_iteration(*row), axis=1, meta=dict)
y = x.compute()

In [468]:
results = pd.concat([df, pd.json_normalize(y)], axis=1)
results

Unnamed: 0,n,N,ep,mu,xi,HC_NR,minPv_NR,HC,minPv
0,1000.0,10000.0,0.015849,0.000,0.0,-12.605035,1.442691,2.037217,10.086182
1,1000.0,10000.0,0.015849,0.000,0.5,-3.304817,1.384825,2.204664,9.097858
2,1000.0,10000.0,0.015849,1.357,0.0,9404.084837,inf,29830.028342,inf
3,1000.0,10000.0,0.015849,1.357,0.5,1820.347374,inf,28394.783600,inf
4,1000.0,10000.0,0.015849,1.974,0.0,9766.580978,inf,29040.433870,inf
...,...,...,...,...,...,...,...,...,...
3243,10000.0,10000.0,0.000110,7.152,0.5,49.376108,inf,81.424812,inf
3244,10000.0,10000.0,0.000110,7.294,0.0,1248.249170,19.067122,18.501538,10.401861
3245,10000.0,10000.0,0.000110,7.294,0.5,49.839743,inf,81.559794,inf
3246,10000.0,10000.0,0.000110,7.434,0.0,44.011362,inf,77.801028,inf
