In [1]:
import pandas as pd
import numpy as np

# Load the batch table, keep only the rows that are in the first or the second
# batch and whose `tt_att` flag is false, and retain just the two batch columns.
bat = pd.read_parquet(
    "wforms-bat.parquet",
    columns=["tt_att", "fst_batch", "snd_batch"],
).loc[
    lambda frame: (frame["fst_batch"] | frame["snd_batch"]) & ~frame["tt_att"],
    ["fst_batch", "snd_batch"],
]

# Load the annotated sheet: only the columns named in `dtype` are read, with
# the types given there, and the frame is indexed by candidate (renamed "wf").
dtype = {
    "candidate": str,
    "status": float,
    "category": str,
    "pos": str,
    "hashtag_type": str,
    "notes_category": str,
    "notes_attestation": str,
    "notes_general": str,
}
ann = (
    pd.read_csv(
        "wforms-ann-phase-2.gsheet.csv",
        encoding="UTF-8",
        dtype=dtype,
        usecols=dtype.keys(),
    )
    .rename(columns={"candidate": "wf"})
    .set_index("wf")
)

# A handful of candidates went missing from the annotated sheet...
missing = list(set(bat.index) - set(ann.index))
assert len(missing) == 31, f"expected 31 missing candidates, found {len(missing)}"

# ... and came back botched: spreadsheets are evil.
botched = list(set(ann.index) - set(bat.index))
assert len(botched) == 27, f"expected 27 botched candidates, found {len(botched)}"

# Luckily, it's all uninteresting spurious tokens (status -1). This is now
# asserted (the bare expression was evaluated and discarded), so that dropping
# the botched rows can never silently throw away a real annotation.
assert ann.loc[ann.index.isin(botched), "status"].eq(-1).all(), \
    "some botched rows carry a status other than -1"
ann = ann.drop(index=botched)

# ... so we can simply rebuild the correct rows: status -1, everything else
# empty. The rows are built by column name, so the result does not depend on
# the column order of the CSV, and appended in one go instead of row by row.
rebuilt = pd.DataFrame(
    {column: (-1.0 if column == "status" else np.nan) for column in ann.columns},
    index=pd.Index(missing, name=ann.index.name),
).astype(ann.dtypes.to_dict())  # same dtypes as `ann`, so concat does not upcast
ann = pd.concat([ann, rebuilt])

# Quick sanity check: once both frames are sorted by index they must hold
# exactly the same labels in the same order.
bat = bat.sort_index()
ann = ann.sort_index()
# `Index.equals` returns False on a length mismatch, whereas the element-wise
# `==` would raise a ValueError there instead of failing the assertion.
assert ann.index.equals(bat.index), (
    f"annotation index ({len(ann.index)} rows) does not match "
    f"batch index ({len(bat.index)} rows)"
)

# Persist the reconciled annotations as parquet, then write a CSV that joins
# them with the batch flags (renamed to subset_a / subset_b) in a fixed
# column order.
ann.to_parquet("wforms-ann.parquet")
(
    bat.join(ann)
    .rename(columns={"fst_batch": "subset_a", "snd_batch": "subset_b"})
    .sort_index()
    .loc[
        :,
        [
            "subset_a",
            "subset_b",
            "status",
            "category",
            "pos",
            "hashtag_type",
            "notes_category",
            "notes_attestation",
            "notes_general",
        ],
    ]
    .to_csv("wforms-ann.csv", encoding="utf-8")
)


In [4]:
# Preview the batch flags next to the annotations (index-aligned left join).
bat.join(ann, how="left")

Unnamed: 0_level_0,fst_batch,snd_batch,status,category,pos,notes_category,notes_attestation,notes_general,hashtag_type
wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
#100anni,False,True,-1.0,,,,,,
#100giornidaleoni,True,False,-1.0,,,,,,
#10dicembre,True,False,-1.0,,,,,,
#10gennaio,True,False,-1.0,,,,,,
#11dicembre,True,False,-1.0,,,,,,
...,...,...,...,...,...,...,...,...,...
🅺,True,False,-1.0,,,,,,
🆁,False,True,-1.0,,,,,,
🆂,False,True,-1.0,,,,,,
🆃,False,True,-1.0,,,,,,


In [5]:
# Read the exported CSV back in to inspect what was written.
pd.read_csv("wforms-ann.csv", encoding="utf-8")

Unnamed: 0,wf,subset_a,subset_b,status,category,pos,hashtag_type,notes_category,notes_attestation,notes_general
0,#100anni,False,True,-1.0,,,,,,
1,#100giornidaleoni,True,False,-1.0,,,,,,
2,#10dicembre,True,False,-1.0,,,,,,
3,#10gennaio,True,False,-1.0,,,,,,
4,#11dicembre,True,False,-1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
11519,🅺,True,False,-1.0,,,,,,
11520,🆁,False,True,-1.0,,,,,,
11521,🆂,False,True,-1.0,,,,,,
11522,🆃,False,True,-1.0,,,,,,


In [15]:
# Category frequencies among rows whose status equals 1
# (chain .sum() onto the result to get the total instead).
ann.loc[ann["status"].eq(1), "category"].value_counts()

category
loanword                  318
orthographic variation    115
univerbation               98
suffixation                60
portmanteau                54
loanword adaptation        24
acronym                    19
compounding                 9
prefixation                 9
transcategorisation         7
deonymic derivation         3
redefinition                2
acronymic derivation        1
tmesis                      1
Name: count, dtype: int64

In [40]:
# Category frequencies restricted to rows whose index label starts with "#"
# and whose status equals 1 (chain .sum() onto the result for the total).
ann.loc[
    ann.index.str.startswith("#") & ann["status"].eq(1),
    "category",
].value_counts()

category
loanword                  279
univerbation               50
portmanteau                21
acronym                    13
compounding                 5
orthographic variation      4
prefixation                 1
Name: count, dtype: int64

In [35]:
# Part-of-speech frequencies for rows with status 1 whose index label starts
# with "#". A `pos` cell may list several tags separated by "; ", so each tag
# is counted on its own. Negate the startswith term to look at the
# non-hashtag rows instead.
(
    ann.loc[ann["status"].eq(1) & ann.index.str.startswith("#"), "pos"]
    .str.split("; ")
    .explode()
    .value_counts()
)


pos
NOM    189
ADJ     23
VER     17
INT      5
ADV      1
Name: count, dtype: int64