In [2]:
from typing import List

import pandas as pd
import numpy as np
from convokit import Corpus, Utterance

In [3]:
from final import *

%load_ext autoreload
%autoreload 2

In [4]:
RANDOM_SEED = 375
BOOTSTRAP_SIZE = 10
STATISTIC = "PPMI"

In [5]:
def get_table_from_corpus(corpus: Corpus) -> pd.DataFrame:
    # We turn into pandas df to quickly access column data later
    counts = get_counts(corpus)
    table = get_table(counts)
    df = pd.DataFrame.from_dict(table, orient="index")
    return df


def bootstrap_corpus(rng: np.random.Generator, corpus: Corpus) -> Corpus:
    # Miraculously no type casting needed! :)
    utts = [utt for utt in corpus.iter_utterances()]
    chosen_utts = rng.choice(utts, size=len(utts), replace=True)
    new_corpus = Corpus(utterances=chosen_utts)
    return new_corpus

In [6]:
ground_truth_corpus = get_corpus()
rng = np.random.default_rng(RANDOM_SEED)

In [7]:
# Testing bootstrap_corpus
bootstrap_corpus(rng, ground_truth_corpus)

<convokit.model.corpus.Corpus at 0x7f36a129dd00>

In [12]:
def bootstrap(statistic: str, size: int = BOOTSTRAP_SIZE) -> np.array:
    results = []
    for _ in range(BOOTSTRAP_SIZE):
        corpus = bootstrap_corpus(rng, ground_truth_corpus)
        table = get_table_from_corpus(corpus)
        column = np.asarray(table[statistic].array)
        results.append(column)
    # Shape: (v * b), where v is size of vocabulary and b is bootstrap iterations
    return results

In [13]:
results = bootstrap(statistic=STATISTIC)

In [14]:
print(results)

[<PandasArray>
[                0.0, 0.04322326657129941,  0.3542707989050328,
 0.02539740600682824,  0.0165649653785753,    0.32473881582695,
 0.44212182565831654,                 0.0,                 0.0,
 0.02843036049709437,
 ...
                 nan,    2.77404621719054,    2.77404621719054,
    2.77404621719054,                 nan,                 nan,
                 nan,                 nan,                 nan,
                 nan]
Length: 16205, dtype: float64, <PandasArray>
[                 0.0,   0.3283469101916376, 0.022136203327381344,
                  0.0, 0.019737569373768353,    0.536092555563684,
                  0.0,   0.8172993361401019,  0.05418149726115731,
                  0.0,
 ...
                  nan,                  nan,                  nan,
                  nan,                  nan,                  nan,
                  nan,                  nan,                  nan,
   2.7731317528645336]
Length: 16183, dtype: float64, <PandasArray>
[0.017645

In [None]:
def get_p_values(results: np.array, ground_truth: np.array) -> np.array:
    """This notion of an empirical p-value borrowed from final project"""
    empirical_pvalues = np.divide(results, ground_truth)
    empirical_pvalues = np.apply_along_axis(
        lambda a: sum(1 if x > 1 else 0 for x in a) / len(a), 0, empirical_pvalues
    )
    # Shape: (v * 1), where v is the size of the vocabulary
    return empirical_pvalues

In [None]:
ground_truth_statistic = get_table_from_corpus(ground_truth_corpus)[STATISTIC].to_numpy()
p_values = get_p_values(results, ground_truth_statistic)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.