In [2]:
from pathlib import Path
from pprint import pprint
import pickle

from pos import evaluate
from pos import data
# Root directory holding one sub-directory per experiment (relative to the notebook's working directory).
EXPERIMENT_DIR = Path("out")

## Read the experiment files
The helper functions below gather experimental data and compute averages across folds.

In [3]:
def get_examples(experiment_name):
    """Load and analyse the predictions of a single experiment.

    Reads ``predictions.tsv`` from ``EXPERIMENT_DIR / experiment_name`` with
    ``data.read_tsv``, flattens it with ``evaluate.flatten_data`` and returns
    whatever ``evaluate.analyse_examples`` produces for the flattened data.
    """
    predictions_path = EXPERIMENT_DIR / experiment_name / "predictions.tsv"
    # read_tsv is handed a plain string path, not a Path object.
    raw_predictions = data.read_tsv(str(predictions_path))
    flattened = evaluate.flatten_data(raw_predictions)
    return evaluate.analyse_examples(flattened)

def get_dicts(experiment_name):
    """Return the unpickled contents of an experiment's ``dictionaries.pickle``.

    The file is looked up under ``EXPERIMENT_DIR / experiment_name``.
    """
    dictionaries_path = EXPERIMENT_DIR / experiment_name / "dictionaries.pickle"
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # experiment output you produced yourself.
    with open(dictionaries_path, 'rb') as handle:
        dictionaries = pickle.load(handle)
    return dictionaries

def gather_experiments(experiments):
    """Collect examples, dictionaries and vocabularies for each experiment.

    Args:
        experiments: iterable of experiment names (sub-directories of
            ``EXPERIMENT_DIR``); it is iterated exactly once.

    Returns:
        A dict mapping each experiment name to a dict with the keys
        ``examples``, ``dicts``, ``test_vocab``, ``train_vocab``,
        ``morphlex_vocab``, ``both_vocab`` and ``neither_vocab``.
    """
    gathered = {}
    for name in experiments:
        examples = get_examples(name)
        dicts = get_dicts(name)
        test_vocab = evaluate.get_vocab(examples)
        # Vocabularies are the keys of the w2i mappings stored in the pickled dictionaries.
        train_vocab = set(dicts["w_map"].w2i.keys())
        morphlex_vocab = set(dicts["m_map"].w2i.keys())
        both_vocab = train_vocab.union(morphlex_vocab)
        gathered[name] = {
            "examples": examples,
            "dicts": dicts,
            "test_vocab": test_vocab,
            "train_vocab": train_vocab,
            "morphlex_vocab": morphlex_vocab,
            "both_vocab": both_vocab,
            # Test items found in neither the w_map nor the m_map vocabulary.
            "neither_vocab": test_vocab.difference(both_vocab),
        }
    return gathered

def accuracy_filter(experiment, filter_vocab=None):
    """Compute the accuracy of one experiment, optionally on a filtered subset.

    Args:
        experiment: a single experiment dict as built by ``gather_experiments``.
        filter_vocab: name of a key in ``experiment`` (e.g. ``'both_vocab'``);
            when given, the examples are first passed through
            ``evaluate.filter_examples`` together with ``experiment[filter_vocab]``.

    Returns:
        The result of ``evaluate.calculate_accuracy`` on the selected examples.
    """
    examples = experiment['examples']
    if filter_vocab is not None:
        examples = evaluate.filter_examples(examples, experiment[filter_vocab])
    return evaluate.calculate_accuracy(examples)

def print_errors(experiments):
    """Print each experiment's name followed by its 20 most common errors.

    ``experiments`` maps experiment names to experiment dicts; the errors come
    from ``evaluate.all_errors`` applied to the experiment's ``examples``.
    """
    for name, experiment in experiments.items():
        print(name)
        error_counts = evaluate.all_errors(experiment["examples"])
        most_frequent = error_counts.most_common(20)
        pprint(most_frequent)

def average_accuracy(experiments, filter_vocab=None):
    """Average the accuracy over a collection of experiments (e.g. folds).

    Args:
        experiments: dict mapping experiment names to experiment dicts.
        filter_vocab: optional name of a vocabulary key (e.g. ``'both_vocab'``
            or ``'neither_vocab'``); when given, each experiment's accuracy is
            computed only on the examples selected by that vocabulary.

    Returns:
        The arithmetic mean of the per-experiment accuracies.

    Raises:
        ZeroDivisionError: if ``experiments`` is empty.
    """
    # accuracy_filter already falls back to the unfiltered accuracy when
    # filter_vocab is None, so a single code path serves both cases. The
    # previous version computed the filtered average but never returned it.
    accuracies = [
        accuracy_filter(experiment, filter_vocab=filter_vocab)
        for experiment in experiments.values()
    ]
    return sum(accuracies) / len(accuracies)

### Folds

In [5]:
# Names of the per-fold runs of the gold baseline: gold-baseline/01 ... gold-baseline/09.
fold_names = [f"gold-baseline/{i:02}" for i in range(1, 10)]
# Maps a setup name to its gathered per-fold experiments.
fold_experiments = {"baseline": gather_experiments(fold_names)}

FileNotFoundError: [Errno 2] No such file or directory: 'out/gold-baseline/01/predictions.tsv'

In [66]:
# fold_experiments maps a setup name to its per-fold experiments, so iterate
# over the values; iterating the dict itself only yields the string keys, which
# have no .values().
for fold_experiment in fold_experiments.values():
    # One line of vocabulary sizes per fold.
    for experiment in fold_experiment.values():
        # Adjacent f-strings are concatenated; a backslash continuation inside
        # a single literal would leak the source indentation into the output.
        print(
            f"total={len(experiment['test_vocab'])}, "
            f"train={len(experiment['train_vocab'])}, "
            f"morphlex={len(experiment['morphlex_vocab'])}, "
            f"both={len(experiment['both_vocab'])}, "
            f"neither={len(experiment['neither_vocab'])}"
        )
for fold_experiment in fold_experiments.values():
    # Accuracies averaged across the folds: overall, on known and on unknown words.
    print(
        f"Total acc={average_accuracy(fold_experiment):.4f}, "
        f"known acc={average_accuracy(fold_experiment, filter_vocab='both_vocab'):.4f}, "
        f"unk acc={average_accuracy(fold_experiment, filter_vocab='neither_vocab'):.4f}"
    )

total=13277,                 train=116,                 morphlex=54068,                 both=54178,                 neither=864
total=12615,                 train=116,                 morphlex=54068,                 both=54178,                 neither=750
total=12551,                 train=114,                 morphlex=54068,                 both=54176,                 neither=811
total=12431,                 train=116,                 morphlex=54068,                 both=54178,                 neither=872
total=12379,                 train=116,                 morphlex=54068,                 both=54178,                 neither=807
total=12626,                 train=114,                 morphlex=54068,                 both=54176,                 neither=830
total=12567,                 train=115,                 morphlex=54068,                 both=54177,                 neither=846
total=12806,                 train=114,                 morphlex=54068,                 both=54176,     

## Individual experiments
Define individual experiments and do analysis.


In [60]:
# Experiment used as the reference point, followed by the ones compared against it.
baseline = "abl-tagger-baseline"
experiment_names = [baseline, "baseline"]
experiments = gather_experiments(experiment_names)

In [62]:
# Vocabulary sizes and accuracies (overall / known words / unknown words) per experiment.
for experiment in experiments.values():
    # Adjacent f-strings are concatenated; a backslash continuation inside a
    # single literal would leak the source indentation into the output.
    print(
        f"total={len(experiment['test_vocab'])}, "
        f"train={len(experiment['train_vocab'])}, "
        f"morphlex={len(experiment['morphlex_vocab'])}, "
        f"both={len(experiment['both_vocab'])}, "
        f"neither={len(experiment['neither_vocab'])}"
    )
    print(
        f"Total acc={accuracy_filter(experiment):.4f}, "
        f"known acc={accuracy_filter(experiment, filter_vocab='both_vocab'):.4f}, "
        f"unk acc={accuracy_filter(experiment, filter_vocab='neither_vocab'):.4f}"
    )



total=12324, train=55737, morphlex=54068,         both=58988, neither=385
Total acc=0.9224, known acc=0.9273,         unk acc=0.3075
total=12324, train=51824, morphlex=54068,         both=56562, neither=412
Total acc=0.9584, known acc=0.9603,         unk acc=0.7832


### Errors
Each error is shown as: proposed tag -> gold tag.

In [None]:
# Show the 20 most common errors of every gathered experiment.
print_errors(experiments)


In [None]:
def get_error_diff(baseline, compare_to):
    """Return the per-key count difference ``baseline - compare_to``.

    Args:
        baseline: mapping from error key to count.
        compare_to: mapping from error key to count.

    Returns:
        A dict with one entry per key of ``baseline``: the baseline count when
        the key is missing from ``compare_to``, otherwise the (possibly
        negative) difference. Keys whose difference is zero are omitted, and
        keys that occur only in ``compare_to`` are not reported.
    """
    result = {}
    for key, value in baseline.items():
        if key not in compare_to:
            # Fixed: this used to reference the misspelled name `basline`,
            # raising NameError for every key missing from compare_to.
            result[key] = value
            continue
        diff = value - compare_to[key]
        if diff != 0:
            result[key] = diff
    return result

            

In [None]:
from collections import Counter

# Sanity check of Counter subtraction, which the error comparison below relies
# on: counts are subtracted per key and only positive results are kept.
c_1 = Counter([1, 1, 2])
c_2 = Counter([1, 1, 3])
difference = c_1 - c_2
print(difference)

In [None]:
# Compare the error counts of every experiment against a reference experiment.
# NOTE(review): this rebinds `baseline`, and the name must be one of the
# experiments passed to gather_experiments - confirm it is listed in
# experiment_names before running this cell.
baseline = "sgd+morph_lex-freeze+wemb-pretrained-300-reduced-lr"
if baseline not in experiments:
    # Same exception type as the bare lookup would raise, with an actionable message.
    raise KeyError(f"baseline experiment {baseline!r} has not been gathered into `experiments`")
# Computed once and reused for every comparison (assumes evaluate.all_errors
# returns the same Counter for the same examples on every call).
baseline_errors = evaluate.all_errors(experiments[baseline]["examples"])
for experiment_name in experiments:
    if experiment_name == baseline:
        continue
    experiment_errors = evaluate.all_errors(experiments[experiment_name]["examples"])
    print(f"{experiment_name}: First, in basline not in {experiment_name}, then in {experiment_name} not in basline.")
    # Counter subtraction keeps only the positive differences, so each
    # direction has to be computed separately.
    pprint((baseline_errors - experiment_errors).most_common(20))
    pprint((experiment_errors - baseline_errors).most_common(20))