In [1]:
import modin.pandas as pd
import numpy as np

import warnings; warnings.simplefilter('ignore')

In [2]:
# Start from wforms-bat.parquet and left-join the other tables onto it one at
# a time, so every row of the base table is kept.
annos = pd.read_parquet("wforms-bat.parquet")
annos = annos.join(pd.read_parquet("wforms-ann.parquet"), how="left")
# Only selected columns are read from the next two files; the suffixes keep
# their identically named "rho" columns apart after joining.
annos = annos.join(
    pd.read_parquet("wforms-occ.parquet", columns=["rho", "tot"]).add_suffix("_occ"),
    how="left",
)
annos = annos.join(
    pd.read_parquet("wforms-usr.parquet", columns=["rho"]).add_suffix("_usr"),
    how="left",
)

# Flag every form whose index label begins with "#".
annos["is_hashtag"] = annos.index.str.startswith("#")

# Hashtags and previously attested forms get sentinel status values (-inf and
# +inf). The attested assignment runs last, so it wins where both flags hold.
annos.loc[annos["is_hashtag"], "status"] = float("-inf")
annos.loc[annos["tt_att"], "status"] = float("inf")

2023-08-10 13:38:54,850	INFO worker.py:1621 -- Started a local Ray instance.


In [3]:
# Boolean row selectors shared by the cells below, so filters stay short.
fll = np.ones(len(annos.index), dtype=bool)  # selects every row
att, hsh, fst, snd = (
    annos[column] for column in ("tt_att", "is_hashtag", "fst_batch", "snd_batch")
)
rho = annos["rho_occ"] > 0.2

# Forms

In [4]:
# Check progress.
# No row may carry a status of exactly 0. `not` is used instead of `~`: on a
# plain Python bool, `~True` is -2 and `~False` is -1, both truthy, so an
# assertion written with `~` could pass even when a 0 status is present.
assert not annos["status"].eq(0).any(), "unexpected status value 0 found"
# Status distribution of the forms that belong to the first or second batch.
annos[annos["fst_batch"] | annos["snd_batch"]]["status"].value_counts(dropna=False)

status
 inf    15366
-1.0     7711
-inf     3391
 1.0      346
 NaN       76
Name: count, dtype: int64

In [5]:
# Category breakdown of the forms with status 1 (missing categories included).
annos["category"].loc[annos["status"] == 1].value_counts(dropna=False)

category
variante grafica         109
univerbazione             48
suffissazione             45
forestierismo             40
macedonia                 33
prestito adattato         24
alterazione               17
prefissazione              8
acronimo                   6
transcategorizzazione      6
composizione               3
deonimico                  3
ridefinizione              2
deacronimico               1
tmesi                      1
Name: count, dtype: int64

In [6]:
# Show every form with status 1, most frequent first (by tot_occ), with the
# row cap raised to 500 and cell text left untruncated.
with pd.option_context("display.max_rows", 500, "display.max_colwidth", None):
    display(
        annos.loc[annos["status"] == 1]
        .sort_values(by="tot_occ", ascending=False)
        .loc[:, ["tot_occ", "category", "categories", "attestation", "notes"]]
    )

Unnamed: 0_level_0,tot_occ,category,categories,attestation,notes
wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rdc,4987,acronimo,acronimo,,
csx,1064,variante grafica,accorciamento,,centro-sinistra
buonagiornata,526,univerbazione,univerbazione,,
twitteri,506,suffissazione,deonimico; suffissazione,,
piddini,496,suffissazione,suffissazione; deacronimico,,
tuitteri,393,suffissazione,deonimico; suffissazione,,
pidioti,377,macedonia,macedonia,,
reel,315,forestierismo,forestierismo,,
lho,307,univerbazione,variante grafica; accorciamento,,l'ho
xe,270,variante grafica,accorciamento,,


# Subsets

In [7]:
# Named row selections for the summary tables.
# The plain selections come first; all of them except "g+" are then repeated
# without attested forms ("-A"), without hashtags ("-#") and without both.
plain = {
    "*": fll,
    "g": rho,
    "g+": annos["rho_occ"].abs() > 0.2,
    "1": fst,
    "2": snd,
    "1∪2": fst | snd,
    "1∩2": fst & snd,
}
exclusions = {"-A": ~att, "-#": ~hsh, "-A-#": ~att & ~hsh}

masks = {
    **plain,
    **{
        # "*" selects every row, so its filtered variant is the filter itself.
        label + suffix: (keep if label == "*" else keep & selection)
        for suffix, keep in exclusions.items()
        for label, selection in plain.items()
        if label != "g+"
    },
}

# Only `masks` is meant to outlive this cell.
del plain, exclusions

In [8]:
# Infodump with counts.

# One column per mask: how many of the selected forms carry each status value
# (a missing status, NaN, is counted as a value of its own).
stats = pd.DataFrame(
    {k: annos[v]["status"].value_counts(dropna=False) for (k, v) in masks.items()}
)

display(
    # Append the per-mask column sums as a final "TOT" row.
    pd.concat([stats.T, stats.sum().rename("TOT")], axis=1)
    .T.rename(index={-np.inf: "#", -1.0: "N", 1.0: "Y", np.inf: "A"})
    # reindex instead of .loc: a status that no subset contains (for example
    # NaN once no status is missing) becomes a row of zeros after fillna(0)
    # rather than raising KeyError.
    .reindex(["A", "#", "N", "Y", np.nan, "TOT"])
    .fillna(0)
    .astype(int)
)

del stats

Unnamed: 0,*,g,g+,1,2,1∪2,1∩2,*-A,g-A,1-A,...,1-#,2-#,1∪2-#,1∩2-#,*-A-#,g-A-#,1-A-#,2-A-#,1∪2-A-#,1∩2-A-#
A,180722,1479,2263,2441,13226,15366,301,0,0,0,...,2093,12256,14138,211,0,0,0,0,0,0
#,225673,1352,2040,2074,1651,3391,334,225673,1352,2074,...,0,0,0,0,0,0,0,0,0,0
N,7711,1187,2027,2125,5916,7711,330,7711,1187,2125,...,2125,5916,7711,330,7711,1187,2125,5916,7711,330
Y,346,65,85,86,273,346,13,346,65,86,...,86,273,346,13,346,65,86,273,346,13
,511391,7,11,11,66,76,1,511391,7,11,...,11,66,76,1,511391,7,11,66,76,1
TOT,925843,4090,6426,6737,21132,26890,979,745121,2611,4296,...,4315,18511,22271,555,519448,1259,2222,6255,8133,344


In [9]:
# Infodump with percents.

# One column per mask: the share of the selected forms carrying each status
# value (a missing status, NaN, is counted as a value of its own).
stats = pd.DataFrame(
    {
        k: annos[v]["status"].value_counts(dropna=False, normalize=True)
        for (k, v) in masks.items()
    }
)


with pd.option_context("display.float_format", "{:.2%}".format, "display.max_columns", 100):
    display(
        # Append the per-mask column sums as a final "TOT" row.
        pd.concat([stats.T, stats.sum().rename("TOT")], axis=1)
        .T.rename(index={-np.inf: "#", -1.0: "N", 1.0: "Y", np.inf: "A"})
        # reindex instead of .loc: a status that no subset contains (for
        # example NaN once no status is missing) becomes a row of zeros after
        # fillna(0) rather than raising KeyError.
        .reindex(["A", "#", "N", "Y", np.nan, "TOT"])
        .fillna(0)
    )

del stats

Unnamed: 0,*,g,g+,1,2,1∪2,1∩2,*-A,g-A,1-A,2-A,1∪2-A,1∩2-A,*-#,g-#,1-#,2-#,1∪2-#,1∩2-#,*-A-#,g-A-#,1-A-#,2-A-#,1∪2-A-#,1∩2-A-#
A,19.52%,36.16%,35.22%,36.23%,62.59%,57.14%,30.75%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,23.25%,49.09%,48.51%,66.21%,63.48%,38.02%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
#,24.37%,33.06%,31.75%,30.79%,7.81%,12.61%,34.12%,30.29%,51.78%,48.28%,20.88%,29.43%,49.26%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
N,0.83%,29.02%,31.54%,31.54%,28.00%,28.68%,33.71%,1.03%,45.46%,49.46%,74.83%,66.91%,48.67%,1.14%,48.00%,49.25%,31.96%,34.62%,59.46%,1.48%,94.28%,95.63%,94.58%,94.81%,95.93%
Y,0.04%,1.59%,1.32%,1.28%,1.29%,1.29%,1.33%,0.05%,2.49%,2.00%,3.45%,3.00%,1.92%,0.05%,2.63%,1.99%,1.47%,1.55%,2.34%,0.07%,5.16%,3.87%,4.36%,4.25%,3.78%
,55.24%,0.17%,0.17%,0.16%,0.31%,0.28%,0.10%,68.63%,0.27%,0.26%,0.83%,0.66%,0.15%,75.56%,0.28%,0.25%,0.36%,0.34%,0.18%,98.45%,0.56%,0.50%,1.06%,0.93%,0.29%
TOT,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%


# Yield

In [10]:
def _projected_yield(counts, projected=()):
    """Return ``(total, yield)`` for one ``status`` value-count Series.

    ``total`` is the number of forms with status 1 or -1 plus the forms
    carrying any status listed in ``projected``. The yield is
    ``positives / total * (1 + projected_count / total)``, where ``positives``
    is the count of status 1. With nothing projected the second factor is
    exactly 1, so the result is the plain share of status 1.

    Status labels absent from ``counts`` count as 0 instead of raising
    ``KeyError``; a selection with a zero total gives a NaN yield.
    """
    # NOTE(review): the projection scales positives/total by
    # (1 + projected/total); confirm this is the intended estimate.
    # Position 0 holds status 1, position 1 status -1, the rest is projected.
    dense = counts.reindex([1, -1, *projected], fill_value=0)
    total = dense.sum()
    if total == 0:
        return total, np.nan
    positives = dense.iloc[0]
    # Whatever is not status 1 or -1 belongs to the projected labels.
    extra = total - dense.iloc[:2].sum()
    return total, positives / total * (1 + extra / total)


# Status counts for the four selections of non-attested forms being compared.
gri = annos[~att & rho]["status"].value_counts(dropna=False)
bfs = annos[~att & snd]["status"].value_counts(dropna=False)
foo = annos[~att & (fst | snd)]["status"].value_counts(dropna=False)  # printed as "A|B"
bar = annos[~att & (rho & snd)]["status"].value_counts(dropna=False)  # printed as "G&B"

# Each scenario adds status labels to project on: NaN is a missing status,
# -inf is the hashtag sentinel.
for title, projected in (
    ("annotated only", ()),
    ("projected on missing", (np.nan,)),
    ("projected on missing and hashtags", (np.nan, -np.inf)),
):
    print("\nYield ({})".format(title))
    gri_tot, gri_yld = _projected_yield(gri, projected)
    bfs_tot, bfs_yld = _projected_yield(bfs, projected)
    foo_tot, foo_yld = _projected_yield(foo, projected)
    bar_tot, bar_yld = _projected_yield(bar, projected)
    print("GRI: {:.3%}".format(gri_yld))
    print("BFS: {:.3%}".format(bfs_yld))
    # Relative difference of the BFS yield over the GRI yield; NaN when the
    # GRI yield is zero, so the cell never divides by zero.
    print("ADV: {:.0%}".format(bfs_yld / gri_yld - 1 if gri_yld else np.nan))
    print("A|B: {:.3%}".format(foo_yld))
    print("G&B: {:.3%}".format(bar_yld))


Yield (annotated only)
GRI: 5.192%
BFS: 4.411%
ADV: -15%
A|B: 4.294%
G&B: 4.114%

Yield (projected on missing)
GRI: 5.192%
BFS: 4.411%
ADV: -15%
A|B: 4.294%
G&B: 4.114%

Yield (projected on missing and hashtags)
GRI: 3.785%
BFS: 4.203%
ADV: 11%
A|B: 3.906%
G&B: 3.125%
