# RnD-2: сравнение стратегий агрегации в стиле Spark для A/A-теста

In [1]:
import random, time, statistics
# Seed the module-level generator once so that the pseudo-random draws made
# through the `random` module repeat from run to run.
SEED=7
random.seed(SEED)

In [2]:
def make_data(n, k):
    """Generate a synthetic two-group data set from the global ``random`` state.

    Args:
        n: Number of observations (rows).
        k: Number of metric values (columns) per observation.

    Returns:
        A ``(g, x)`` pair. ``g`` is a list of ``n`` group labels, each 0 or 1
        (``random.randint(0, 1)``). ``x`` is a list of ``n`` rows, each a list
        of ``k`` draws from ``random.gauss(0, 1)``.
    """
    # All labels are drawn before any metric value; keeping this order keeps
    # the generated data identical for a given seed.
    labels = []
    for _ in range(n):
        labels.append(random.randint(0, 1))

    rows = []
    for _ in range(n):
        rows.append([random.gauss(0, 1) for _ in range(k)])

    return labels, rows

def naive(g, x, k):
    """Compute per-metric mean differences, rescanning the data for each metric.

    For every one of the first ``k`` columns the (row, label) pairs are
    scanned twice: once collecting the values labelled 0 and once collecting
    the values labelled 1. Rows with any other label are ignored.

    Args:
        g: Group labels, one per row of ``x``.
        x: Rows of metric values, indexable by column number.
        k: Number of leading columns to process.

    Returns:
        ``(diffs, passes)``: ``diffs[j]`` is the mean of column ``j`` over
        label-1 rows minus the mean over label-0 rows; ``passes`` is the
        number of scans performed (two per processed column).

    Raises:
        statistics.StatisticsError: If ``k > 0`` and either group is empty.
    """
    def column_of(label, col):
        # One full scan over the data for a single group and column.
        return [values[col] for values, tag in zip(x, g) if tag == label]

    diffs = []
    scans = 0
    for col in range(k):
        control = column_of(0, col)
        treatment = column_of(1, col)
        diffs.append(statistics.mean(treatment) - statistics.mean(control))
        scans += 2
    return diffs, scans

def batch(g, x, k):
    """Compute per-metric mean differences for all metrics in one scan.

    Each row is visited once and its values are added to the running
    per-column sums of its group. A label equal to 0 selects group 0; any
    other label is counted as group 1.

    Args:
        g: Group labels, one per row of ``x``.
        x: Rows of metric values; a row may not be longer than ``k``.
        k: Number of columns to report.

    Returns:
        ``(diffs, 1)``: ``diffs[j]`` is the group-1 mean minus the group-0
        mean of column ``j``; the constant 1 is the number of scans.

    Raises:
        ZeroDivisionError: If ``k > 0`` and either group has no rows.
        IndexError: If a row holds more than ``k`` values.
    """
    totals = ([0.0] * k, [0.0] * k)  # totals[group][column]
    counts = [0, 0]                  # rows seen per group

    for values, label in zip(x, g):
        group = 0 if label == 0 else 1
        counts[group] += 1
        acc = totals[group]
        for col, val in enumerate(values):
            acc[col] += val

    diffs = []
    for col in range(k):
        diffs.append(totals[1][col] / counts[1] - totals[0][col] / counts[0])
    return diffs, 1

def partitioned(g, x, k, parts=12):
    """Compute per-metric mean differences by scanning the data chunk by chunk.

    The rows are split into consecutive chunks and every chunk adds its
    values to shared per-group column sums. A label equal to 0 selects
    group 0; any other label is counted as group 1.

    Args:
        g: Group labels, one per row of ``x``.
        x: Rows of metric values; a row may not be longer than ``k``.
        k: Number of columns to report.
        parts: Upper bound on the number of chunks (default 12).

    Returns:
        ``(diffs, passes)``: ``diffs[j]`` is the group-1 mean minus the
        group-0 mean of column ``j``; ``passes`` is the number of chunks that
        were actually scanned. It equals ``parts`` whenever ``len(g)`` is a
        positive multiple of ``parts`` and is smaller when there are too few
        rows to fill ``parts`` chunks.

    Raises:
        ZeroDivisionError: If ``parts`` is 0, or if ``k > 0`` and either
            group has no rows.
        IndexError: If a row holds more than ``k`` values.
    """
    n = len(g)
    # Ceiling division: with floor division a remainder would spill into
    # extra chunks, so more than ``parts`` chunks could be scanned.
    size = max(1, -(-n // parts))
    sum0 = [0.0] * k
    sum1 = [0.0] * k
    c0 = 0
    c1 = 0
    chunks = 0
    for start in range(0, n, size):
        stop = start + size
        chunks += 1
        for row, t in zip(x[start:stop], g[start:stop]):
            if t == 0:
                c0 += 1
                target = sum0
            else:
                c1 += 1
                target = sum1
            for j, v in enumerate(row):
                target[j] += v
    out = [sum1[j] / c1 - sum0[j] / c0 for j in range(k)]
    # Report the chunks really processed instead of echoing ``parts`` back.
    return out, chunks

In [3]:
# Experiment 1: time every strategy once on each (N, K) grid point.
summary = []
strategies = [('naive', naive), ('batch', batch), ('partitioned', partitioned)]
for n in (3000, 6000, 9000):
    for k in (8, 12, 16):
        # One data set per grid point, shared by all strategies.
        g, x = make_data(n, k)
        for name, fn in strategies:
            t0 = time.perf_counter()
            _, passes = fn(g, x, k)
            dt = time.perf_counter() - t0
            summary.append({
                'N': n,
                'K': k,
                'method': name,
                'time_sec': round(dt, 4),
                'passes': passes,
            })

print('Результаты эксперимента 1 (первые 9 строк):')
for row in summary[:9]:
    print(row)

Результаты эксперимента 1 (первые 9 строк):
{'N': 3000, 'K': 8, 'method': 'naive', 'time_sec': 0.016, 'passes': 16}
{'N': 3000, 'K': 8, 'method': 'batch', 'time_sec': 0.0018, 'passes': 1}
{'N': 3000, 'K': 8, 'method': 'partitioned', 'time_sec': 0.0017, 'passes': 12}
{'N': 3000, 'K': 12, 'method': 'naive', 'time_sec': 0.0246, 'passes': 24}
{'N': 3000, 'K': 12, 'method': 'batch', 'time_sec': 0.0023, 'passes': 1}
{'N': 3000, 'K': 12, 'method': 'partitioned', 'time_sec': 0.0023, 'passes': 12}
{'N': 3000, 'K': 16, 'method': 'naive', 'time_sec': 0.0346, 'passes': 32}
{'N': 3000, 'K': 16, 'method': 'batch', 'time_sec': 0.0028, 'passes': 1}
{'N': 3000, 'K': 16, 'method': 'partitioned', 'time_sec': 0.0031, 'passes': 12}


In [4]:
# Experiment 2: average time and pass count per method over all grid points.
methods = ['naive', 'batch', 'partitioned']
for m in methods:
    rows = [rec for rec in summary if rec['method'] == m]
    total_time = sum(rec['time_sec'] for rec in rows)
    total_passes = sum(rec['passes'] for rec in rows)
    n_rows = len(rows)
    avg_t = round(total_time / n_rows, 4)
    avg_p = round(total_passes / n_rows, 2)
    print(m, 'avg_time=', avg_t, 'avg_passes=', avg_p)

naive avg_time= 0.0488 avg_passes= 24.0
batch avg_time= 0.0045 avg_passes= 1.0
partitioned avg_time= 0.0047 avg_passes= 12.0
