In [8]:
from collections import defaultdict
from pickle import load, dump, HIGHEST_PROTOCOL
from typing import List, TypeVar

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [2]:
from final import get_corpus, get_counts, get_table, process_corpus

%load_ext autoreload
%autoreload 2

In [3]:
# First seed of the bootstrap: replicate i is seeded with RANDOM_SEED + i.
RANDOM_SEED = 375
# Number of bootstrap replicates that are computed.
BOOTSTRAP_SIZE = 10
# Name of the table entry the significance analysis is run on.
STATISTIC = "PPMI"
# Worker count handed to joblib.Parallel.
N_JOBS = 16
# Type variable annotating values that get compared with `>=`.
Comparable = TypeVar("Comparable")

In [4]:
def get_table_from_corpus(processed_corpus: List[dict]) -> defaultdict:
    """Build the statistics table for a processed corpus.

    The corpus is passed through ``get_counts`` and the resulting counts are
    handed to ``get_table``; whatever ``get_table`` produces is returned.
    """
    return get_table(get_counts(processed_corpus))


def bootstrap_corpus(
    rng: np.random.Generator, processed_corpus: List[dict]
) -> List[dict]:
    """Draw a bootstrap resample of the corpus.

    Picks ``len(processed_corpus)`` documents uniformly at random, with
    replacement.

    Args:
        rng: NumPy random generator that supplies the randomness.
        processed_corpus: Documents to resample from.

    Returns:
        A new list with as many entries as ``processed_corpus``. The entries
        are the original document objects, not copies. An empty corpus yields
        an empty list.
    """
    corpus_size = len(processed_corpus)
    if corpus_size == 0:
        return []

    # Sample positions rather than the documents themselves: handing the list
    # to ``rng.choice`` makes NumPy coerce it into an array first and returns
    # an ndarray instead of the annotated ``List[dict]``.
    positions = rng.choice(corpus_size, size=corpus_size, replace=True)
    return [processed_corpus[position] for position in positions]


def bootstrap_table(seed: int, processed_corpus: List[dict]) -> defaultdict:
    """Compute the table of one seeded bootstrap resample of the corpus.

    A fresh generator is created from ``seed``, so the same seed and corpus
    always produce the same resample.
    """
    generator = np.random.default_rng(seed)
    resampled_corpus = bootstrap_corpus(generator, processed_corpus)
    table = get_table_from_corpus(resampled_corpus)
    return table

In [5]:
def bootstrap(
    ground_truth: List[dict],
) -> List[defaultdict]:
    """Compute ``BOOTSTRAP_SIZE`` bootstrap tables with joblib.

    One ``bootstrap_table`` job is scheduled per seed in
    ``RANDOM_SEED .. RANDOM_SEED + BOOTSTRAP_SIZE - 1`` and the jobs are run
    on ``N_JOBS`` workers. The result of the ``Parallel`` call is returned
    as is.
    """
    seeds = range(RANDOM_SEED, RANDOM_SEED + BOOTSTRAP_SIZE)
    run_in_parallel = Parallel(n_jobs=N_JOBS, verbose=100)
    jobs = (delayed(bootstrap_table)(seed, ground_truth) for seed in seeds)
    return run_in_parallel(jobs)


def get_bootstrap_statistic(
    results: List[defaultdict], tokens: List[str], statistic: str
) -> defaultdict:
    """Collect one statistic per token across all bootstrap tables.

    Args:
        results: Bootstrap tables, each mapping a token to its statistics.
        tokens: Tokens to collect values for.
        statistic: Key of the statistic to read from each token's entry.

    Returns:
        A plain dict mapping every token to the list of its ``statistic``
        values, in table order. Tables that do not contain the token are
        skipped, so a list may be shorter than ``results`` or even empty.
    """
    collected = {}
    for token in tokens:
        values = []
        for table in results:
            if token in table:
                values.append(table[token][statistic])
        collected[token] = values
    return collected

In [6]:
def get_p_values(
    results: defaultdict, ground_truth: defaultdict, statistic: str
) -> defaultdict:
    """Compute a p-value per token from its bootstrap values.

    Args:
        results: Maps each token to the list of its bootstrap values.
        ground_truth: Table of the original corpus; the observed value is
            read from ``ground_truth[token][statistic]``.
        statistic: Key of the statistic being tested.

    Returns:
        A ``defaultdict(float)`` mapping token to p-value. Tokens whose
        observed value is ``None`` and tokens without any bootstrap values
        are left out. Because of the ``float`` default, looking up a token
        that was left out yields ``0.0`` rather than raising.
    """
    p_values = defaultdict(float)
    for token, bootstrap_values in results.items():
        observed = ground_truth[token][statistic]
        if observed is None:
            continue

        # A token can be absent from every bootstrap table; with nothing to
        # compare against, the test would divide by zero.
        if len(bootstrap_values) == 0:
            continue

        p_values[token] = significance_test(bootstrap_values, observed)
    return p_values


def significance_test(results: List[Comparable], ground_truth: Comparable) -> float:
    """Return the share of bootstrap values that reach the observed value.

    Args:
        results: Bootstrap values to compare.
        ground_truth: Observed value they are compared against.

    Returns:
        The fraction of ``results`` that are ``>= ground_truth``, or NaN when
        ``results`` is empty (the fraction is undefined in that case).
    """
    if len(results) == 0:
        return float("nan")

    at_least_observed = sum(1 for value in results if value >= ground_truth)
    return at_least_observed / len(results)

In [32]:
# NOTE: pickle.load can execute arbitrary code while unpickling, so only ever
# point this at a pickle file produced by this project.
with open("processed_corpus_list.pickle", "rb") as handle:
    ground_truth_corpus = load(handle)

ground_truth_table = get_table_from_corpus(ground_truth_corpus)
# Snapshot the tokens as a list (the type get_bootstrap_statistic is annotated
# with) instead of keeping a live view onto the table's keys.
ground_truth_tokens = list(ground_truth_table.keys())

results = bootstrap(ground_truth_corpus)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   1 tasks      | elapsed:    7.1s
[Parallel(n_jobs=16)]: Done   2 out of  10 | elapsed:    7.2s remaining:   28.8s
[Parallel(n_jobs=16)]: Done   3 out of  10 | elapsed:    7.3s remaining:   16.9s
[Parallel(n_jobs=16)]: Done   4 out of  10 | elapsed:    7.6s remaining:   11.4s
[Parallel(n_jobs=16)]: Done   5 out of  10 | elapsed:    7.7s remaining:    7.7s
[Parallel(n_jobs=16)]: Done   6 out of  10 | elapsed:    8.0s remaining:    5.3s
[Parallel(n_jobs=16)]: Done   7 out of  10 | elapsed:    8.1s remaining:    3.5s
[Parallel(n_jobs=16)]: Done   8 out of  10 | elapsed:    9.1s remaining:    2.3s
[Parallel(n_jobs=16)]: Done  10 out of  10 | elapsed:   10.1s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  10 out of  10 | elapsed:   10.1s finished


In [9]:
# One DataFrame (tokens as rows) for each of the first three bootstrap tables.
df0, df1, df2 = (
    pd.DataFrame.from_dict(results[i], orient='index') for i in range(3)
)

In [24]:
# Put the tokens in reverse alphabetical order and show the first ten rows.
df0 = df0.sort_index(ascending=False)
df0.head(10)

Unnamed: 0,M count,F count,total,M ratio,F ratio,F - M,PPMI
zwickler,1,0,1,1.0,0.0,-1.0,
zubik,4,0,4,1.0,0.0,-1.0,
zoning,10,0,10,1.0,0.0,-1.0,
zoned,2,0,2,1.0,0.0,-1.0,
zone-of-interests,1,0,1,1.0,0.0,-1.0,
zone-of-interest,4,0,4,1.0,0.0,-1.0,
zone,31,1,32,0.96875,0.03125,-0.9375,0.0
zivot,7,0,7,1.0,0.0,-1.0,
zipes,3,0,3,1.0,0.0,-1.0,
zinke,1,0,1,1.0,0.0,-1.0,


In [25]:
# Put the tokens in reverse alphabetical order and show the first ten rows.
df1 = df1.sort_index(ascending=False)
df1.head(10)

Unnamed: 0,M count,F count,total,M ratio,F ratio,F - M,PPMI
zwickler,1,0,1,1.0,0.0,-1.0,
zurko,1,0,1,1.0,0.0,-1.0,
zubik,4,0,4,1.0,0.0,-1.0,
zuber,2,0,2,1.0,0.0,-1.0,
zoning,5,2,7,0.714286,0.285714,-0.428571,1.049567
zoned,1,0,1,1.0,0.0,-1.0,
zone-of-interests,1,0,1,1.0,0.0,-1.0,
zone-of-interest,11,0,11,1.0,0.0,-1.0,
zone,23,1,24,0.958333,0.041667,-0.916667,0.0
zivot,10,0,10,1.0,0.0,-1.0,


In [33]:
# Collect, per token, the STATISTIC value from every bootstrap table. The
# outcome gets its own name instead of overwriting `results`, so `results`
# keeps holding the bootstrap tables and this cell can safely be re-run.
bootstrap_statistics = get_bootstrap_statistic(
    results, tokens=ground_truth_tokens, statistic=STATISTIC
)

# We quickly filter out any token where a bootstrap iteration leads to a
# missing (None) value for STATISTIC, as well as tokens that showed up in no
# bootstrap table at all (an empty list cannot be tested).
complete_statistics = {
    token: values
    for token, values in bootstrap_statistics.items()
    if len(values) > 0 and None not in values
}

p_values = get_p_values(complete_statistics, ground_truth_table, STATISTIC)
p_values_df = pd.DataFrame.from_dict(p_values, orient='index')

Unnamed: 0,0
zero-sum,0.5
zero,1.0
zarda,0.6
youth,0.6
yourself,0.5
your,0.5
you,1.0
york,1.0
ymca,0.555556
yet,1.0


In [39]:
# Order the tokens alphabetically, keep those with a p-value below 0.1 and
# show the first thirty of them.
p_values_df = p_values_df.sort_index(ascending=True)
below_threshold = p_values_df[0] < 0.1
p_values_df = p_values_df.loc[below_threshold]
p_values_df.head(30)

Unnamed: 0,0
elevating,0.0
irretrievably,0.0
morality-free,0.0
remnant,0.0
scrub,0.0
sponsorship,0.0
subsumes,0.0
uproots,0.0
vicious,0.0
