In [2]:
from typing import Dict

import numpy as np
from sklearn.linear_model import LinearRegression
from pickle import load, dump, HIGHEST_PROTOCOL
from joblib import Parallel, delayed

# Number of permuted refits drawn per token inside test1 (one run_ols call each).
BOOTSTRAP_SIZE = 1000
# Root seed for the SeedSequence from which the per-token generators are spawned.
RANDOM_SEED = 375

In [3]:
# Load the pickled object that the rest of the notebook slices and iterates over
# (presumably the output of an earlier processing step -- confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open("alc_stepone.pickle", "rb") as handle:
    alc_stepone = load(handle)

In [4]:
# Keep only the first 30 entries; this subset is what the parallel run iterates over.
test_range = alc_stepone[:30]

In [5]:
def run_ols(X: np.array, Y: np.array) -> np.array:
    """Regress Y on X without an intercept and return the coefficient norms.

    Args:
        X: Regressor array; its first dimension must match that of ``Y``.
        Y: Response array with the same number of rows as ``X``.

    Returns:
        The norm of the fitted ``coef_`` taken along axis 0. scikit-learn
        documents ``coef_`` as ``(n_targets, n_features)`` for a 2-D ``Y``,
        in which case this is one norm per column of ``X``.

    Raises:
        AssertionError: If ``X`` and ``Y`` have different numbers of rows.
    """
    n_rows_x, n_rows_y = X.shape[0], Y.shape[0]
    assert n_rows_x == n_rows_y
    # Plain LinearRegression applies no penalty, so this is ordinary least
    # squares; the intercept term is explicitly disabled.
    model = LinearRegression(fit_intercept=False)
    model.fit(X, Y)
    return np.linalg.norm(model.coef_, axis=0)

In [9]:
def test1(
    token: Dict[str, np.array], rng: np.random.Generator, confidence: float = 90
) -> Dict[str, np.array]:
    def sample():
        Y_permuted = rng.permutation(Y)
        return run_ols(X, Y_permuted)

    X, Y = token["X"], token["Y"]
    assert X.shape[0] == Y.shape[0]
    normed_betas = [sample() for _ in range(BOOTSTRAP_SIZE)]
    print("done samples")

    # Getting confidence interval
    offset = (100 - confidence) / 2
    ci_normed_betas = np.percentile(normed_betas, [offset, confidence + offset], axis=0)

    # Conducting permutation test: p-value is %values "more extreme" than ground_truth
    ground_truth = run_ols(X, Y)
    empirical_pvalue = np.apply_along_axis(
        lambda a: sum(1 if x > ground_truth else 0 for x in a) / len(a),
        axis=0,
        arr=normed_betas,
    )

    return {
        "token": token["token"],
        "ground truth": ground_truth,
        "CI": ci_normed_betas,
        "p-value": empirical_pvalue,
    }

In [11]:
# Derive one child seed per entry of test_range from a single root seed and
# build a separate Generator from each, so every parallel task is handed its
# own random stream rather than sharing one.
ss = np.random.SeedSequence(RANDOM_SEED)
child_seeds = ss.spawn(len(test_range))
streams = [np.random.default_rng(seed) for seed in child_seeds]

# Run test1 once per (token, generator) pair using joblib with n_jobs=24.
# NOTE(review): the returned list is not bound to a name, so the results are
# only available as this cell's output.
Parallel(n_jobs=24, verbose=1)(
    delayed(test1)(token, rng) for token, rng in zip(test_range, streams)
)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.


done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples


[Parallel(n_jobs=24)]: Done  14 out of  30 | elapsed:   13.7s remaining:   15.7s


done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples
done samples


KeyboardInterrupt: 