In [1]:
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import modin.pandas as pd
import numpy as np
from modin.config import ProgressBar
from tqdm.auto import tqdm

# Turn on Modin's progress bar for the operations run in the cells below.
ProgressBar.enable()

In [2]:
# Quick annotation progress check.
# Only the "status" column is counted, so restrict the read to that column
# instead of loading the whole annotation file.
pd.read_parquet("wforms-ann.parquet", columns=["status"])["status"].value_counts(
    dropna=False
)


    import ray
    ray.init()

2023-07-30 09:30:14,267	INFO worker.py:1636 -- Started a local Ray instance.


Estimated completion of line 2:   0%           Elapsed time: 00:00, estimated remaining time: ?

status
-1.0    11089
 1.0      359
 NaN       76
Name: count, dtype: int64

In [3]:
# Start from the batch table and left-join the annotations and the two "rho"
# columns onto it, so every row of the batch table is kept.
wforms = (
    pd.read_parquet("wforms-bat.parquet")
    .join(pd.read_parquet("wforms-ann.parquet"), how="left")
    .join(
        pd.read_parquet("wforms-occ.parquet", columns=["rho"]).add_suffix("_occ"),
        how="left",
    )
    .join(
        pd.read_parquet("wforms-usr.parquet", columns=["rho"]).add_suffix("_usr"),
        how="left",
    )
)

# Flag index entries that start with "#".
wforms["is_hashtag"] = wforms.index.str.startswith("#")

# NOTE(review): this list is not read by any cell visible below; a later cell
# rebinds `stats` to a DataFrame before using it. Kept so the name stays bound.
stats = [
    [wforms.shape[0], "total"],
    [wforms["fst_batch"].sum(), "1st batch"],
    [wforms["snd_batch"].sum(), "2nd batch"],
    [(wforms["fst_batch"] & wforms["snd_batch"]).sum(), "overlap"],
]

# We mark previously attested forms and hashtags as special.
# The hashtag assignment runs last, so a row that is both attested and a
# hashtag ends up with -inf.
wforms.loc[wforms["tt_att"], "status"] = +np.inf
wforms.loc[wforms["is_hashtag"], "status"] = -np.inf

# Just in case: no row may carry status 0.
# `not` is required here: applying bitwise `~` to a plain Python bool gives
# -1 or -2, both truthy, so `assert ~flag` could never fail.
assert not wforms["status"].eq(0).any(), "unexpected status value 0 in wforms"

Estimated completion of line 2:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 3:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 5:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 9:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 15:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 19:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 20:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 21:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 25:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 26:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 28:   0%           Elapsed time: 00:00, estimated remaining time: ?

In [4]:
# Short aliases for the boolean row selectors used throughout the notebook.
fll = np.ones(len(wforms.index), dtype=bool)  # selects every row
att = wforms["tt_att"]
hsh = wforms["is_hashtag"]
fst = wforms["fst_batch"]
snd = wforms["snd_batch"]
rho = wforms["rho_occ"] > 0.2

# Building blocks shared by several subsets below.
not_att = ~att
not_hsh = ~hsh
either = fst | snd
both = fst & snd
plain = not_att & not_hsh  # neither attested nor a hashtag

# Named subsets. Key suffixes: "-A" excludes attested rows, "-#" excludes
# hashtags. Insertion order determines the column order of the tables below.
masks = {
    # no exclusion
    "*": fll,
    "g": rho,
    "g+": wforms["rho_occ"].abs() > 0.2,
    "1": fst,
    "2": snd,
    "1∪2": either,
    "1∩2": both,
    # without attested rows
    "*-A": not_att,
    "g-A": not_att & rho,
    "1-A": not_att & fst,
    "2-A": not_att & snd,
    "1∪2-A": not_att & either,
    "1∩2-A": not_att & both,
    # without hashtags
    "*-#": not_hsh,
    "g-#": not_hsh & rho,
    "1-#": not_hsh & fst,
    "2-#": not_hsh & snd,
    "1∪2-#": not_hsh & either,
    "1∩2-#": not_hsh & both,
    # without attested rows and without hashtags
    "*-A-#": plain,
    "g-A-#": plain & rho,
    "1-A-#": plain & fst,
    "2-A-#": plain & snd,
    "1∪2-A-#": plain & either,
    "1∩2-A-#": plain & both,
}

Estimated completion of line 7:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 13:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 16:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 17:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 18:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 19:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 20:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 21:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 22:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 23:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 24:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 25:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 26:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 27:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 28:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 29:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 30:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 31:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 32:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 33:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 34:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 35:   0%           Elapsed time: 00:00, estimated remaining time: ?

In [5]:
# Absolute status counts: one column per subset in `masks`, one row per
# status value (NaN is kept as its own row).
stats = pd.DataFrame(
    {
        label: wforms[selector]["status"].value_counts(dropna=False)
        for label, selector in masks.items()
    }
)

# Append the per-column sums as an extra "TOT" row.
_with_total = pd.concat([stats.T, stats.sum().rename("TOT")], axis=1).T
# Swap the numeric status codes for one-letter tags, then fix the row order.
_tagged = _with_total.rename(index={-np.inf: "#", -1.0: "N", 1.0: "Y", np.inf: "A"})
_ordered = _tagged.loc[["A", "#", "N", "Y", np.nan, "TOT"]]

# Status values absent from a subset show up as NaN; display them as 0.
display(_ordered.fillna(0).astype(int))

del stats, _with_total, _tagged, _ordered

Estimated completion of line 2:   0%           Elapsed time: 00:00, estimated remaining time: ?



Estimated completion of line 1:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 6:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 7:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 9:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 10:   0%           Elapsed time: 00:00, estimated remaining time: ?

Unnamed: 0,*,g,g+,1,2,1∪2,1∩2,*-A,g-A,1-A,...,1-#,2-#,1∪2-#,1∩2-#,*-A-#,g-A-#,1-A-#,2-A-#,1∪2-A-#,1∩2-A-#
A,157319,1214,1921,2093,12256,14138,211,0,0,0,...,2093,12256,14138,211,0,0,0,0,0,0
#,249076,1617,2382,2422,2621,4619,424,225673,1352,2074,...,0,0,0,0,0,0,0,0,0,0
N,7697,1182,2020,2118,5907,7696,329,7697,1182,2118,...,2118,5907,7696,329,7697,1182,2118,5907,7696,329
Y,359,70,91,92,281,359,14,359,70,92,...,92,281,359,14,359,70,92,281,359,14
,511392,7,12,12,67,78,1,511392,7,12,...,12,67,78,1,511392,7,12,67,78,1
TOT,925843,4090,6426,6737,21132,26890,979,745121,2611,4296,...,4315,18511,22271,555,519448,1259,2222,6255,8133,344


In [6]:
# Relative status frequencies: same layout as the counts table, but each
# column is normalized to sum to 1 (NaN is kept as its own row).
stats = pd.DataFrame(
    {
        label: wforms[selector]["status"].value_counts(dropna=False, normalize=True)
        for label, selector in masks.items()
    }
)

# Append the per-column sums as an extra "TOT" row.
_with_total = pd.concat([stats.T, stats.sum().rename("TOT")], axis=1).T
# Swap the numeric status codes for one-letter tags, then fix the row order.
_tagged = _with_total.rename(index={-np.inf: "#", -1.0: "N", 1.0: "Y", np.inf: "A"})
_ordered = _tagged.loc[["A", "#", "N", "Y", np.nan, "TOT"]]

# Render floats as percentages and allow up to 100 columns for this display only.
with pd.option_context(
    "display.float_format", "{:.2%}".format, "display.max_columns", 100
):
    display(_ordered.fillna(0))

del stats, _with_total, _tagged, _ordered

Estimated completion of line 2:   0%           Elapsed time: 00:00, estimated remaining time: ?



Estimated completion of line 1:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 11:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 12:   0%           Elapsed time: 00:00, estimated remaining time: ?

Estimated completion of line 14:   0%           Elapsed time: 00:00, estimated remaining time: ?

Unnamed: 0,*,g,g+,1,2,1∪2,1∩2,*-A,g-A,1-A,2-A,1∪2-A,1∩2-A,*-#,g-#,1-#,2-#,1∪2-#,1∩2-#,*-A-#,g-A-#,1-A-#,2-A-#,1∪2-A-#,1∩2-A-#
A,16.99%,29.68%,29.89%,31.07%,58.00%,52.58%,21.55%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,23.25%,49.09%,48.51%,66.21%,63.48%,38.02%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
#,26.90%,39.54%,37.07%,35.95%,12.40%,17.18%,43.31%,30.29%,51.78%,48.28%,20.88%,29.43%,49.26%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
N,0.83%,28.90%,31.43%,31.44%,27.95%,28.62%,33.61%,1.03%,45.27%,49.30%,74.72%,66.78%,48.53%,1.14%,47.80%,49.08%,31.91%,34.56%,59.28%,1.48%,93.88%,95.32%,94.44%,94.63%,95.64%
Y,0.04%,1.71%,1.42%,1.37%,1.33%,1.34%,1.43%,0.05%,2.68%,2.14%,3.55%,3.12%,2.06%,0.05%,2.83%,2.13%,1.52%,1.61%,2.52%,0.07%,5.56%,4.14%,4.49%,4.41%,4.07%
,55.24%,0.17%,0.19%,0.18%,0.32%,0.29%,0.10%,68.63%,0.27%,0.28%,0.85%,0.68%,0.15%,75.56%,0.28%,0.28%,0.36%,0.35%,0.18%,98.45%,0.56%,0.54%,1.07%,0.96%,0.29%
TOT,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%


In [7]:
def _status_counts(frame, mask):
    """Return the ``status`` value counts (NaN kept) of the rows of ``frame`` selected by ``mask``."""
    return frame.loc[mask, "status"].value_counts(dropna=False)


def _count(counts, labels):
    """Sum the entries of ``counts`` for ``labels``; labels missing from the index count as 0."""
    if not labels:
        return 0
    # reindex + fill_value avoids the KeyError that ``.loc[[...]]`` raises when
    # a subset has no row for one of the requested status values.
    return counts.reindex(labels, fill_value=0).sum()


def _yield_stats(counts, extra=()):
    """Return ``(total, yield)`` for one subset.

    Parameters
    ----------
    counts : Series
        Status value counts, indexed by status value.
    extra : sequence
        Status values (besides 1 and -1) to include in the total and to
        project the yield onto.

    Returns
    -------
    tuple
        ``total`` is the count over ``[1, -1] + extra``; the yield is
        ``count(1) / total * (1 + count(extra) / total)``.
    """
    extra = list(extra)
    total = _count(counts, [1, -1] + extra)
    # NOTE(review): the factor (1 + extra/total) is only a first-order
    # approximation of 1 / (1 - extra/total), i.e. of applying the annotated
    # yield to the non-annotated rows. The two diverge when `extra` is a large
    # share of the subset (as with hashtags) -- confirm this is intended.
    return total, _count(counts, [1]) / total * (1 + _count(counts, extra) / total)


ProgressBar.disable()
try:
    # Status counts of the non-attested rows in four subsets.
    gri = _status_counts(wforms, ~att & rho)
    bfs = _status_counts(wforms, ~att & snd)
    foo = _status_counts(wforms, ~att & (fst | snd))
    bar = _status_counts(wforms, ~att & (rho & snd))

    for header, extra in (
        ("\nYield (annotated only)", []),
        ("\nYield (projected on missing)", [np.nan]),
        ("\nYield (projected on missing and hashtags)", [np.nan, -np.inf]),
    ):
        print(header)
        # Same names as before, so later cells that read them keep working.
        (gri_tot, gri_yld), (bfs_tot, bfs_yld), (foo_tot, foo_yld), (bar_tot, bar_yld) = (
            _yield_stats(counts, extra) for counts in (gri, bfs, foo, bar)
        )
        print("GRI: {:.3%}".format(gri_yld))
        print("BFS: {:.3%}".format(bfs_yld))
        # Relative advantage of BFS over GRI.
        print("ADV: {:.0%}".format(bfs_yld / gri_yld - 1))
        print("A|B: {:.3%}".format(foo_yld))
        print("G&B: {:.3%}".format(bar_yld))
finally:
    # Re-enable the progress bar even if a computation above raised.
    ProgressBar.enable()


Yield (annotated only)
GRI: 5.591%
BFS: 4.541%
ADV: -19%
A|B: 4.457%
G&B: 4.430%

Yield (projected on missing)
GRI: 5.591%
BFS: 4.541%
ADV: -19%
A|B: 4.456%
G&B: 4.430%

Yield (projected on missing and hashtags)
GRI: 4.076%
BFS: 4.327%
ADV: 6%
A|B: 4.053%
G&B: 3.365%
