# PiPP frequency estimates

In [1]:
__author__ = "Christopher Potts"

## Set-up

In [2]:
import numpy as np
from scipy import stats
import pandas as pd

## Utilities

In [3]:
def get_classical_ci(vals):
    """Return the margins of a 95% t-based confidence interval for the mean.

    Parameters
    ----------
    vals : sequence of numbers
        The observations to summarise.

    Returns
    -------
    tuple
        `(lower_margin, upper_margin)`: the distance from the mean of `vals`
        down to the lower bound of the interval, and from the mean up to the
        upper bound. The interval itself is
        `(mean - lower_margin, mean + upper_margin)`.
    """
    # When every value is identical there is no spread, so the interval
    # collapses onto the mean and both margins are zero. Returning the value
    # itself here would be inconsistent with the margins returned below.
    if len(set(vals)) == 1:
        return (0.0, 0.0)
    loc = np.mean(vals)
    # NOTE: `np.std` defaults to ddof=0 (population standard deviation).
    scale = np.std(vals) / np.sqrt(len(vals))
    lower, upper = stats.t.interval(0.95, len(vals) - 1, loc=loc, scale=scale)
    return loc - lower, upper - loc

In [4]:
def experiment(df, total_candidates, n_experiments=10000, n_samples=100, cats=("AS", "ASAS", "THOUGH"), random_state=None):
    """Estimate how many of `total_candidates` belong to `cats` by resampling.

    Each experiment draws `n_samples` rows from `df` (without replacement,
    the `sample` default), computes the fraction whose `PiPP` value is in
    `cats`, and scales that fraction by `total_candidates`.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotated sample; must have a `PiPP` column.
    total_candidates : int
        Number the per-sample match rate is scaled up to.
    n_experiments : int
        Number of resampling rounds.
    n_samples : int
        Rows drawn per round.
    cats : tuple of str
        `PiPP` labels that count as a match.
    random_state : int, numpy random state/generator, or None
        Source of randomness forwarded to `sample`. An integer seed is
        turned into a single `RandomState` up front so successive draws
        differ; `None` keeps pandas' default behaviour.

    Returns
    -------
    dict
        `"data"`: list with one scaled estimate per experiment;
        `"ci"`: result of `get_classical_ci` on those estimates;
        `"mu"`: their mean.
    """
    # Passing the same integer to every `sample` call would yield the same
    # draw each time, so build one stateful generator from it instead.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Whether a row matches does not depend on the draw, so compute it once
    # and resample the boolean column rather than the whole frame.
    is_match = df.PiPP.isin(cats)

    data = []
    for _ in range(n_experiments):
        drawn = is_match.sample(n=n_samples, random_state=random_state)
        n_matches = int(drawn.sum())
        data.append((n_matches / n_samples) * total_candidates)

    ci = get_classical_ci(data)
    mu = np.mean(data)
    return {"data": data, "ci": ci, "mu": mu}

In [5]:
def analysis(df, total_candidates, total_examples):
    """Print estimated counts and frequencies for several category sets.

    For each of four category sets (all three labels together, then
    `ASAS`, `AS` and `THOUGH` individually) this runs `experiment` and
    prints: the rounded mean estimate with the rounded first element of the
    returned `ci`, the estimate as a percentage of `total_examples`, and a
    "1 in N sentences" figure (`total_examples` divided by the estimate).

    Parameters
    ----------
    df : pandas.DataFrame
        Annotated sample passed through to `experiment`.
    total_candidates : int
        Passed through to `experiment`.
    total_examples : int
        Denominator for the percentage and "1 in N" figures.
    """
    conds = (("AS", "ASAS", "THOUGH"), ("ASAS",), ("AS",), ("THOUGH",))
    for cond in conds:
        print("=" * 50)
        print(cond)
        results = experiment(df, total_candidates, cats=cond)
        estimate = results['mu']
        mu = int(estimate.round(0))
        ci = int(round(results['ci'][0]))
        # The backslash is doubled because "\p" is not a valid escape
        # sequence; the printed text is still a literal "\pm".
        print(f"{mu:,} \\pm {ci:,}")
        per = (estimate / total_examples) * 100
        print(f"Freq: {per}")
        if estimate > 0:
            n_sent = f"{int(round(total_examples / estimate)):,}"
        else:
            # A zero estimate would mean dividing by zero.
            n_sent = "inf"
        print(f"1 in {n_sent} sentences")

## BookCorpusOpen (OpenBooks)

In [6]:
# Load the annotated sample for this corpus; the first CSV column becomes the row index.
book_ann_df = pd.read_csv("annotated/pipp-sample-bookcorpusopen-annotated.csv", index_col=0)

In [7]:
# Number of annotated rows carrying each `PiPP` label.
dist = book_ann_df.PiPP.value_counts()

In [8]:
dist

F       995
AS        3
ASAS      2
Name: PiPP, dtype: int64

In [9]:
# The CSV read below is not included with this notebook. Create it by
# running the notebook `books.ipynb`. Its row count is used as the total
# number of candidates.

books_total_candidates = pd.read_csv("samples/pipp-bookcorpusopen.csv").shape[0]

In [10]:
books_total_candidates

5814960

In [11]:
# Point estimate: 5 of the 1000 annotated rows are labelled AS or ASAS
# (3 + 2 in the counts displayed above), scaled up to all candidates.
round((5 / 1000) * books_total_candidates)

29075

In [12]:
# Value copied from the notebook `books.ipynb`. `analysis` uses it as the
# denominator for its percentage and "1 in N sentences" figures.

books_total_examples = 90739117

In [13]:
# Resample the annotated rows and print estimated counts and frequencies
# for each category set. Results vary between runs because sampling is random.
analysis(book_ann_df, books_total_candidates, books_total_examples)

('AS', 'ASAS', 'THOUGH')
29,249 \pm 761
Freq: 0.032234442836819754
1 in 3,102 sentences
('ASAS',)
12,054 \pm 494
Freq: 0.013284691849051164
1 in 7,527 sentences
('AS',)
17,137 \pm 584
Freq: 0.018885666608371335
1 in 5,295 sentences
('THOUGH',)
0 \pm 0
Freq: 0.0
1 in inf sentences


## C4

In [14]:
# Load the annotated sample for this corpus.
# NOTE(review): unlike the BookCorpusOpen load, no `index_col=0` is passed
# here -- confirm this CSV has no leading index column.
c4_df = pd.read_csv("annotated/pipp-sample-c4-annotated.csv")

In [15]:
# Number of annotated rows carrying each `PiPP` label.
c4_df.PiPP.value_counts()

F       996
AS        2
ASAS      2
Name: PiPP, dtype: int64

In [16]:
# Value copied from the notebook `c4.ipynb`. It is the total number of
# candidates that sample match rates are scaled up to.

c4_total_candidates = 540516902

In [17]:
# Point estimate: 4 of the 1000 annotated rows are labelled AS or ASAS
# (2 + 2 in the counts displayed above), scaled up to all candidates.
round((4/1000) * c4_total_candidates)

2162068

In [18]:
# Value copied from the notebook `c4.ipynb`. `analysis` uses it as the
# denominator for its percentage and "1 in N sentences" figures.

c4_total_examples = 7546154665

In [19]:
# Resample the annotated rows and print estimated counts and frequencies
# for each category set. Results vary between runs because sampling is random.
analysis(c4_df, c4_total_candidates, c4_total_examples)

('AS', 'ASAS', 'THOUGH')
2,108,556 \pm 63,370
Freq: 0.027942131168895146
1 in 3,579 sentences
('ASAS',)
1,067,521 \pm 44,738
Freq: 0.01414655448822556
1 in 7,069 sentences
('AS',)
1,086,979 \pm 45,133
Freq: 0.01440441573459322
1 in 6,942 sentences
('THOUGH',)
0 \pm 0
Freq: 0.0
1 in inf sentences
