In [1]:
from pathlib import Path
from pprint import pprint
import pickle

from pos import evaluate
from pos import data
# Root directory (relative to the working directory) under which experiment
# outputs are looked up.
EXPERIMENT_DIR = Path(".") / "out"

## Read the experiment files
Here are some useful functions for gathering experimental data and doing averages across folds.

In [2]:
def gather_experiments(experiment_paths):
    """Build one ``evaluate.Experiment`` per path, keeping the input order.

    Args:
        experiment_paths: Iterable of experiment locations. Each item is
            passed unchanged to ``evaluate.Experiment``.

    Returns:
        A list with one constructed object per input item.
    """
    loaded = []
    for path in experiment_paths:
        loaded.append(evaluate.Experiment(path))
    return loaded

def print_errors(experiments):
    """Print the 20 most frequent errors of every experiment.

    Args:
        experiments: Mapping from experiment name to a record whose examples
            are stored under the ``"examples"`` key.

    For each name, the name is printed first, followed by the pretty-printed
    ``most_common(20)`` of ``evaluate.all_errors`` applied to its examples.
    """
    for name in experiments:
        print(name)
        examples = experiments[name]["examples"]
        error_counts = evaluate.all_errors(examples)
        top_errors = error_counts.most_common(20)
        pprint(top_errors)

def average_accuracy(experiments, filter_vocab=None):
    """Return the mean accuracy over all experiments.

    Args:
        experiments: Mapping from experiment name to an experiment record.
        filter_vocab: Optional vocabulary key forwarded to ``accuracy_filter``.
            When ``None`` (the default), accuracy is computed over each
            experiment's ``"examples"`` with ``evaluate.calculate_accuracy``.

    Returns:
        The arithmetic mean of the per-experiment accuracies.

    Raises:
        ZeroDivisionError: If ``experiments`` is empty.
    """
    if filter_vocab is not None:
        # This branch must return: without it the filtered mean is discarded
        # and the unfiltered accuracy below is returned instead.
        # NOTE(review): ``accuracy_filter`` is not defined in the visible part
        # of this notebook -- it has to be in scope before this branch runs.
        return sum(
            accuracy_filter(experiment, filter_vocab=filter_vocab)
            for experiment in experiments.values()
        ) / len(experiments)
    return sum(
        evaluate.calculate_accuracy(experiment["examples"])
        for experiment in experiments.values()
    ) / len(experiments)

### Folds

In [11]:
# Fold runs per configuration, keyed by a short label. Each configuration is
# read from EXPERIMENT_DIR/<directory>/<fold>, with folds "01" through "09".
# NOTE(review): range(1, 10) stops at fold 09 -- confirm there is no fold 10.
fold_experiments = {
    label: gather_experiments(
        EXPERIMENT_DIR / directory / f"{fold:02}" for fold in range(1, 10)
    )
    for label, directory in (
        ("baseline", "gold-baseline"),
        ("only-char", "gold-only-char"),
        ("only-morph", "gold-only-morph"),
        ("only-morph+freeze", "gold-only-morph+freeze"),
        ("only-wemb", "gold-only-wemb"),
        ("wemb+morph", "gold-wemb+morph"),
        ("wemb+morph-freeze", "gold-wemb+morph+freeze"),
        ("wemb+morph-freeze+extra", "gold-wemb+morph+freeze+extra-32"),
    )
}

In [12]:
# Print each configuration's label followed by the report that
# ``evaluate.report_experiments`` produces for its list of fold runs.
for name in fold_experiments:
    experiments = fold_experiments[name]
    print(name)
    print(evaluate.report_experiments(experiments))

baseline
total=94.543% / 5462,             unk=80.286% / 1678,             known=95.868% / 3785,             
only-char
total=91.164% / 8844,             unk=71.398% / 2434,             known=93.001% / 6410,             
only-morph
total=85.419% / 14595,             unk=58.647% / 3519,             known=87.907% / 11075,             
only-morph+freeze
total=82.000% / 18017,             unk=57.323% / 3632,             known=84.293% / 14385,             
only-wemb
total=88.509% / 11502,             unk=20.267% / 6786,             known=94.851% / 4716,             
wemb+morph
total=92.819% / 7188,             unk=62.381% / 3202,             known=95.647% / 3986,             
wemb+morph-freeze
total=92.760% / 7247,             unk=61.517% / 3275,             known=95.663% / 3972,             
wemb+morph-freeze+extra
total=92.641% / 7366,             unk=61.356% / 3289,             known=95.549% / 4077,             


## Individual experiments
Define individual experiments and do analysis.


In [60]:
baseline = "abl-tagger-baseline"
experiment_names = [
    baseline,
    "baseline",
]
# Key the loaded experiments by name. The cells that follow call
# ``experiments.values()``, iterate over experiment names and index
# ``experiments[<name>]``; none of that works on the plain list that
# ``gather_experiments`` returns.
# NOTE(review): the names are handed to ``gather_experiments`` as-is (they are
# not joined with EXPERIMENT_DIR), and later cells subscript each experiment
# (e.g. ``experiment['examples']``) -- confirm both against ``evaluate.Experiment``.
experiments = dict(zip(experiment_names, gather_experiments(experiment_names)))

In [62]:
# For every experiment, print its vocabulary sizes and then its accuracies.
# NOTE(review): ``.values()`` requires ``experiments`` to be a mapping, and
# ``accuracy_filter`` is not defined in the visible part of this notebook --
# confirm both before running this cell.
for experiment in experiments.values():
    # "total" is the size of 'test_vocab'. The backslash continuations sit
    # inside the f-string, so the leading spaces of each continuation line are
    # part of the printed text.
    print(f"total={len(experiment['test_vocab'])}, \
                train={len(experiment['train_vocab'])}, \
                morphlex={len(experiment['morphlex_vocab'])}, \
                both={len(experiment['both_vocab'])}, \
                neither={len(experiment['neither_vocab'])}")
    # "known" is filtered with filter_vocab='both_vocab', "unk" with
    # filter_vocab='neither_vocab'; "Total" applies no vocabulary filter.
    print(f"Total acc={accuracy_filter(experiment):.4f}, known acc={accuracy_filter(experiment, filter_vocab='both_vocab'):.4f}, \
        unk acc={accuracy_filter(experiment, filter_vocab='neither_vocab'):.4f}")



total=12324, train=55737, morphlex=54068,         both=58988, neither=385
Total acc=0.9224, known acc=0.9273,         unk acc=0.3075
total=12324, train=51824, morphlex=54068,         both=56562, neither=412
Total acc=0.9584, known acc=0.9603,         unk acc=0.7832


### Errors
Each error is listed as `proposed tag -> gold tag`.

In [None]:
# Print each experiment's name followed by its 20 most common errors.
print_errors(experiments)


In [None]:
def get_error_diff(baseline, compare_to):
    """Return how the counts in ``baseline`` differ from those in ``compare_to``.

    Only keys present in ``baseline`` are considered; keys that occur solely
    in ``compare_to`` are ignored.

    Args:
        baseline: Mapping from key to a numeric count.
        compare_to: Mapping from key to a numeric count.

    Returns:
        A dict with, for every key of ``baseline``:
        * the baseline count itself when the key is missing from ``compare_to``;
        * ``baseline[key] - compare_to[key]`` when that difference is non-zero
          (it may be negative).
        Keys whose counts are equal in both mappings are left out.
    """
    result = {}
    for key, value in baseline.items():
        if key not in compare_to:
            # Nothing to subtract: the whole baseline count is the difference.
            result[key] = value
            continue
        diff = value - compare_to[key]
        if diff != 0:
            result[key] = diff
    return result

            

In [None]:
from collections import Counter

# Quick check of Counter subtraction: only keys whose difference is positive
# survive, so the result below contains just the key 2.
c_1 = Counter({1: 2, 2: 1})
c_2 = Counter({1: 2, 3: 1})
print(c_1 - c_2)

In [None]:
baseline = "sgd+morph_lex-freeze+wemb-pretrained-300-reduced-lr"
# NOTE(review): this name is not among the experiment names set up in the
# visible cells above; ``experiments[baseline]`` fails unless that experiment
# has been loaded under this key.
# The baseline error counts are computed once and reused for every comparison.
# NOTE(review): assumes ``evaluate.all_errors`` is deterministic and returns a
# Counter-like object whose ``-`` does not modify its operands -- confirm.
baseline_errors = evaluate.all_errors(experiments[baseline]["examples"])
for experiment_name in experiments:
    if experiment_name == baseline:
        continue
    print(f"{experiment_name}: First, in basline not in {experiment_name}, then in {experiment_name} not in basline.")
    experiment_errors = evaluate.all_errors(experiments[experiment_name]["examples"])
    # First the errors the baseline makes more often, then the errors this
    # experiment makes more often (top 20 of each).
    pprint((baseline_errors - experiment_errors).most_common(20))
    pprint((experiment_errors - baseline_errors).most_common(20))