## TODO

- vectorize `tconfint()`
- vectorize `passToThread()`
  - generate 7300xn1 and 7300xn2 for the matrices of data
  - 7300 x n1 -> 7300 x 1 (`lower`)
  - 7300 x n2 -> 7300 x 1 (`upper`)
  - 7300 x 1 boolean vector, i-th entry is True iff `lower[i] <= delta_true <= upper[i]`
  - output p-value which is proportion of boolean vector that is True

---

In [213]:
import numpy as np
import statsmodels.stats.api as sms
import scipy.stats as stats
import math
from itertools import combinations
import threading
#from tqdm import tqdm

In [214]:
from src.search import search
from src.bootstrap import bootstrap_ci
from src.perm_test import ttest_ind_vectorized, pval_vectorized

In [215]:
def get_partitions(n1, n2):
    """Enumerate every split of the indices 0..n1+n2-1 into two groups.

    Each row of the result is a permutation of ``range(n1 + n2)``: the first
    ``n1`` entries are one choice of group-1 indices (ascending) and the
    remaining ``n2`` entries are the indices left over (also ascending).
    Rows appear in the order ``itertools.combinations`` yields the group-1
    choices.

    Returns a NumPy array with one row per combination.
    """
    universe = range(n1 + n2)

    def with_complement(chosen):
        # Indices not picked for the first group, kept in ascending order.
        picked = set(chosen)
        leftover = [k for k in universe if k not in picked]
        return [*chosen, *leftover]

    return np.array([with_complement(c) for c in combinations(universe, n1)])

In [216]:
def tconfint(alpha, pooled, x1, x2):
    """Return statsmodels' t confidence interval for the difference in means of x1 and x2.

    alpha  -- significance level passed straight to ``tconfint_diff``
    pooled -- truthy selects ``usevar="pooled"``, otherwise ``usevar="unequal"``
    x1, x2 -- the two samples, each wrapped in ``DescrStatsW``

    The return value is whatever ``CompareMeans.tconfint_diff`` returns
    (callers here index it as ``[0]`` for the lower and ``[1]`` for the upper bound).
    """
    variance_mode = "pooled" if pooled else "unequal"
    first_group = sms.DescrStatsW(x1)
    second_group = sms.DescrStatsW(x2)
    comparison = sms.CompareMeans(first_group, second_group)
    return comparison.tconfint_diff(alpha, usevar=variance_mode)

In [217]:
# Serializes updates to the shared `intervals` / `n_captured` tallies, because
# passToThread is run concurrently from several threading.Thread workers.
_tally_lock = threading.Lock()


def passToThread(n_samples):
    """Simulate `n_samples` confidence intervals and record how many cover `delta_true`.

    For each iteration, two gamma samples of sizes `n1` and `n2` are drawn
    (parameters from the globals `gamma1` / `gamma2`), an interval
    `(lower, upper)` is located with `search`, the interval is appended to the
    global list `intervals`, and the global counter `n_captured` is bumped
    when `lower <= delta_true <= upper`.

    Reads globals: gamma1, gamma2, n1, n2, pooled, partitions, delta_true.
    Writes globals: intervals, n_captured (both under `_tally_lock`).

    Iterations for which `search` raises AssertionError are skipped, so fewer
    than `n_samples` intervals may be recorded.
    """
    global n_captured

    for _ in range(n_samples):
        x1 = np.random.gamma(gamma1[0], gamma1[1], n1)
        x2 = np.random.gamma(gamma2[0], gamma2[1], n2)

        # TODO vectorize
        # Despite the names, alpha=0.001 gives a 99.9% interval and alpha=0.20
        # an 80% interval; their endpoints are handed to `search` below
        # (presumably as the bracket for each bound -- confirm in src.search).
        t99 = tconfint(0.001, pooled, x1, x2)
        t90 = tconfint(0.20, pooled, x1, x2)

        try:
            lower = search(x1, x2, partitions, t99[0], t90[0])
            upper = search(x1, x2, partitions, t90[1], t99[1])
        except AssertionError:
            # Deliberate best-effort: drop draws that `search` rejects rather
            # than aborting the whole worker.
            continue

        # Evaluate the coverage flag outside the critical section.
        captured = (lower <= delta_true) * (delta_true <= upper)

        # `n_captured += ...` is a read-modify-write on a shared global and is
        # not atomic across threads; without the lock, concurrent workers can
        # lose increments. Holding it for the append too keeps the pair
        # (len(intervals), n_captured) consistent.
        with _tally_lock:
            intervals.append((lower, upper))
            n_captured += captured

In [218]:
# Simulation settings shared (as globals) by passToThread and the cells below.
# NOTE(review): `alpha` and `alternative` are not read by any cell visible in
# this notebook section -- confirm whether they are still needed.
alpha = 0.05
alternative = "less"
pooled = True  # forwarded to tconfint: True -> usevar="pooled", False -> "unequal"

# Gamma parameters for the two populations; a gamma mean is shape * scale.
gamma1 = (2, 5)  # shape k, scale theta
gamma2 = (4, 3)
delta_true = (gamma1[0] * gamma1[1]) - (gamma2[0] * gamma2[1])  # true mean difference

# Sample sizes, and one row of indices per way of choosing n1 of the n1 + n2 positions.
n1, n2 = 12, 8
partitions = get_partitions(n1, n2)

# Tallies filled in by passToThread: recorded (lower, upper) pairs and the
# number of them that contain delta_true.
intervals = []
n_captured = 0

In [219]:
%%time
# time how long it takes to find one confidence interval
passToThread(1)

CPU times: user 342 ms, sys: 26.8 ms, total: 369 ms
Wall time: 368 ms


In [220]:
delta_true

-2

In [221]:
n_captured, len(intervals)

(1, 1)

In [222]:
# Work split for the threaded run.
thread_count = 8
remaining = 128  # total number of samples to simulate
batch_size = remaining // thread_count  # per-thread share; the last thread takes what is left

# Placeholders, replaced by Thread objects when the workers are launched.
threads = [0 for _ in range(thread_count)]

# Start the tallies from scratch so the earlier single-sample run is not counted.
intervals, n_captured = [], 0

In [223]:
%%time

# Plan the per-thread workloads up front instead of decrementing `remaining`
# inside the loop. Mutating `remaining` made this cell non-idempotent: on a
# second run it was already 0, so the last thread received a negative count
# and the run silently produced fewer samples than requested.
batch_sizes = [batch_size] * (thread_count - 1)
batch_sizes.append(remaining - sum(batch_sizes))  # last thread absorbs the remainder

threads = [
    threading.Thread(target=passToThread, args=(n_samples,))
    for n_samples in batch_sizes
]
for thread in threads:
    thread.start()

# Block until every worker has finished so the tallies are complete.
for thread in threads:
    thread.join()

CPU times: user 1min 5s, sys: 1.4 s, total: 1min 7s
Wall time: 9.64 s


In [224]:
n_captured, len(intervals)

(125, 128)

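With 8 threads, the amortized wall-clock cost drops to \~75 ms per confidence interval (9.64 s for 128 intervals), compared with \~370 ms for the single interval computed above. This is a throughput gain, not a faster individual computation: total CPU time per interval (\~67 s / 128 ≈ 520 ms) is no lower than in the single-threaded run.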

## Scrap Code

In [157]:
# https://www.statsmodels.org/stable/generated/statsmodels.stats.weightstats.CompareMeans.html
# Fixed seed so the five sample pairs below are the same on every run.
np.random.seed(124)
n1, n2 = 12, 8
# Five independent draws per group, one per row, using the gamma (shape, scale) pairs.
x1 = np.random.gamma(shape=gamma1[0], scale=gamma1[1], size=(5, n1))
x2 = np.random.gamma(shape=gamma2[0], scale=gamma2[1], size=(5, n2))

In [158]:
from scipy.stats import ttest_ind as ttest
from scipy.stats import t as t

In [159]:
ttest(x1, x2, axis=1)

Ttest_indResult(statistic=array([-1.41652211, -0.2703295 , -1.2391753 , -0.88257159, -1.42359231]), pvalue=array([0.17369686, 0.78998195, 0.2311958 , 0.3891026 , 0.1716681 ]))

In [163]:
# Reference interval from statsmodels (alpha=0.05) for the first sample pair
# (row 0 of x1 / x2); variance handling follows the global `pooled` flag.
cm = sms.CompareMeans(sms.DescrStatsW(x1[0]), sms.DescrStatsW(x2[0]))
print(cm.tconfint_diff(0.05, usevar="pooled" if pooled else "unequal"))

(-9.036252057023685, 1.758211350664376)


In [164]:
# TODO: one-sided intervals (an `alternative` option) are not implemented yet.
def interval_vectorized(x1s, x2s, alpha=0.05, pooled=True):
    """Two-sided (1 - alpha) t confidence interval for mean(x1) - mean(x2).

    Works on a single pair of samples (1-D inputs) or on many pairs at once:
    observations lie along the last axis, and any leading axes index
    independent sample pairs.

    Parameters
    ----------
    x1s, x2s : array_like
        Samples for the two groups; the last axis holds the observations.
    alpha : float
        Significance level; the interval has nominal coverage ``1 - alpha``.
    pooled : bool
        True  -> pooled-variance interval with ``n1 + n2 - 2`` degrees of freedom.
        False -> unequal-variance (Welch) interval with Satterthwaite degrees
        of freedom.

    Returns
    -------
    (lower, upper)
        Bounds with the shape of the inputs minus the last axis
        (scalars for 1-D inputs).
    """
    x1s = np.asarray(x1s, dtype=float)
    x2s = np.asarray(x2s, dtype=float)
    n1, n2 = x1s.shape[-1], x2s.shape[-1]

    mean_diff = x1s.mean(axis=-1) - x2s.mean(axis=-1)

    # Variance from deviations about the mean. The shortcut
    # sum(x**2) - n*mean**2 cancels catastrophically when the data sit far
    # from zero relative to their spread.
    var1 = x1s.var(axis=-1, ddof=1)
    var2 = x2s.var(axis=-1, ddof=1)

    if pooled:
        df = n1 + n2 - 2
        pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / df
        std_err = np.sqrt(pooled_var * (1 / n1 + 1 / n2))
    else:
        # Welch: per-group variance of the mean, Satterthwaite df (one per sample pair).
        q1, q2 = var1 / n1, var2 / n2
        std_err = np.sqrt(q1 + q2)
        df = (q1 + q2) ** 2 / (q1 ** 2 / (n1 - 1) + q2 ** 2 / (n2 - 1))

    margin = t.ppf(q=1 - alpha / 2, df=df) * std_err
    return mean_diff - margin, mean_diff + margin

In [165]:
interval_vectorized(x1[0], x2[0])

(-9.036252057023685, 1.7582113506643742)

In [166]:
interval_vectorized(x1, x2)

(array([-9.03625206, -5.73727455, -7.35169269, -6.57976376, -9.46387131]),
 array([1.75821135, 4.42914293, 1.89674028, 2.68693657, 1.81873608]))

In [193]:
# 99% (alpha=0.01) and 90% (alpha=0.1) t-intervals for all five sample pairs;
# each result is a (lower_array, upper_array) tuple.
t_wide = interval_vectorized(x1, x2, alpha=0.01)
t_narrow = interval_vectorized(x1, x2, alpha=0.1)

In [194]:
t_wide

(array([-11.03368321,  -7.61849071,  -9.06304298,  -8.29449428,
        -11.5516297 ]),
 array([3.7556425 , 6.31035908, 3.60809058, 4.4016671 , 3.90649447]))

In [195]:
t_narrow

(array([-8.0937992 , -4.84965571, -6.54422207, -5.77069824, -8.47879914]),
 array([0.8157585 , 3.54152408, 1.08926967, 1.87787105, 0.83366391]))

In [196]:
lower = search(x1[0], x2[0], partitions, t_wide[0][0], t_narrow[0][0])

KeyboardInterrupt: 

In [None]:
    # NOTE(review): this cell was never executed (In [None]) and repeats the
    # body of passToThread without its `def` line. As written it is indented
    # top-level code that references an undefined `n_samples`, so it cannot
    # run on its own -- looks like a leftover paste; consider deleting it.
    global n_captured
    
    for _ in range(n_samples):
        x1 = np.random.gamma(gamma1[0], gamma1[1], n1)
        x2 = np.random.gamma(gamma2[0], gamma2[1], n2)

        # TODO vectorize
        t99 = tconfint(0.001, pooled, x1, x2)
        t90 = tconfint(0.20, pooled, x1, x2)

        try:
            lower = search(x1, x2, partitions, t99[0], t90[0])
            upper = search(x1, x2, partitions, t90[1], t99[1])
        except AssertionError:
            continue

        intervals.append((lower, upper))
        n_captured += (lower <= delta_true) * (delta_true <= upper)

In [197]:
t_wide[0][1], t_wide[1][1]

(-7.618490712386961, 6.310359084631237)

In [206]:
# statsmodels intervals for sample pair 1 at the two levels passToThread uses
# (alpha=0.001 and alpha=0.20); variance handling follows the global `pooled` flag.
cm = sms.CompareMeans(sms.DescrStatsW(x1[1]), sms.DescrStatsW(x2[1]))
print(cm.tconfint_diff(0.001, usevar="pooled" if pooled else "unequal"))
print(cm.tconfint_diff(0.20, usevar="pooled" if pooled else "unequal"))

(-10.142539885476689, 8.83440825772096)
(-3.8729643459064005, 2.5648327181506727)


In [211]:
pval_vectorized(x1[1], x2[1], partitions, delta=-2)

1.0

In [207]:
search(x1[1], x2[1], partitions, -10.142539885476689, -3.8729643459064005)

KeyboardInterrupt: 

In [203]:
t_wide

(array([-11.03368321,  -7.61849071,  -9.06304298,  -8.29449428,
        -11.5516297 ]),
 array([3.7556425 , 6.31035908, 3.60809058, 4.4016671 , 3.90649447]))