# 🧪 Experiment: Gazetteer Attributes
This notebook evaluates the test set for the task `Gazetteer Attributes`.

**Note**: Before conducting experiments, you need to install the `kaner` package first. Otherwise, this notebook will raise an *import error*.

```bash
cd ../
python setup.py install
```

In [26]:
import gc
import json
import os
import pprint
from copy import deepcopy
from datetime import datetime
from typing import Any, Dict, List, Tuple

import torch.nn as nn
import tqdm

from kaner.context import GlobalContext as gctx
from kaner.adapter.tokenizer import CharTokenizer
from kaner.adapter.knowledge import Gazetteer
from kaner.adapter.in_adapter import split_dataset
from kaner.adapter.out_adapter import BaseOutAdapter
from kaner.trainer import NERTrainer, TrainerConfig
from kaner.tracker import NERTracker, NERTrackerRow
from kaner.common import load_json, load_jsonl, save_json
from kaner.common.func import query_time


# Initialise kaner's global context once, before any of the gctx.create_* / gctx.get_* calls below.
# NOTE(review): presumably this registers the available models, datasets and adapters — confirm in kaner.context.
gctx.init()

## 1. Gazetteer Size

In [27]:
def get_masked_gazetteer_by_pp(lexicon_pp: float, gazetteer_folder: str) -> Gazetteer:
    """
    Load a gazetteer and mask every lexicon outside the leading `lexicon_pp` share.

    Lexicons are read from `<gazetteer_folder>/lexicons.txt` (first tab-separated
    field of each line, the "[PAD]" entry is skipped). The first
    `int(len(lexicons) * lexicon_pp)` lexicons, in file order, are kept; all
    remaining ones are passed to `Gazetteer.mask`.

    Args:
        lexicon_pp (float): Proportion of lexicons to keep, in the range (0, 1].
        gazetteer_folder (str): Folder holding the gazetteer and its `lexicons.txt`.

    Returns:
        Gazetteer: The gazetteer loaded from `gazetteer_folder` after masking.

    Raises:
        AssertionError: If `lexicon_pp` is not a float in (0, 1].
    """
    assert isinstance(lexicon_pp, float) and 0. < lexicon_pp <= 1.0
    gazetteer = Gazetteer(gazetteer_folder)
    lexicons = []
    with open(os.path.join(gazetteer_folder, "lexicons.txt"), "r", encoding="utf-8") as fin:
        # Iterate the file lazily instead of materialising every line with readlines().
        for line in fin:
            lexicon = line.replace("\n", "").split("\t")[0]
            if lexicon == "[PAD]":
                continue
            lexicons.append(lexicon)
    base_length = int(len(lexicons) * lexicon_pp)
    kept_lexicons = set(lexicons[:base_length])
    # Same membership as set(lexicons) - set(lexicons[:base_length]), but in file order:
    # converting a set of strings to a list yields a different order on every run
    # (hash randomisation), which makes the experiment harder to reproduce.
    masked_lexicons = list(
        dict.fromkeys(lexicon for lexicon in lexicons[base_length:] if lexicon not in kept_lexicons)
    )
    print("Total: {0}, Remaining: {1}, Masked: {2}".format(len(lexicons), base_length, len(masked_lexicons)))
    # NOTE(review): the meaning of the second positional argument (True) is defined by Gazetteer.mask — confirm there.
    gazetteer.mask(masked_lexicons, True)

    return gazetteer


def train(config: TrainerConfig, lexicon_pp: float) -> Tuple[Dict[str, Any], NERTrainer]:
    """
    Given a configuration, train a model on a dataset with gazetteer modification.

    The tokenizer, the masked gazetteer and the label adapter are saved into
    `config.output_folder` before training starts.

    Args:
        config (TrainerConfig): Trainer Configuration.
        lexicon_pp (float): Proportion of gazetteer lexicons to keep, forwarded to
            `get_masked_gazetteer_by_pp`.

    Returns:
        Tuple[Dict[str, Any], NERTrainer]: The value returned by `trainer.train()`
            and the trainer that produced it.
    """

    def update_hyperparameters(tokenizer: CharTokenizer, out_adapter: BaseOutAdapter, gazetteer: Gazetteer):
        """
        Update hyper parameters.

        Builds a dictionary holding the number of tags plus whatever the
        tokenizer and the gazetteer report through their `configs()` methods.

        Args:
            tokenizer (CharTokenizer): Tokenizer.
            out_adapter (BaseOutAdapter): Output adapter.
            gazetteer (Gazetteer): Gazetteer.
        """
        partial_configs = {"n_tags": len(out_adapter)}
        partial_configs.update(tokenizer.configs())
        partial_configs.update(gazetteer.configs())

        return partial_configs

    raw_datasets = split_dataset(config.dataset_folder, dataset_pp=config.dataset_pp)
    tokenizer = CharTokenizer(config.tokenizer_model_folder)
    tokenizer.save(config.output_folder)
    gazetteer = get_masked_gazetteer_by_pp(lexicon_pp, config.gazetteer_model_folder)
    gazetteer.save(config.output_folder)
    out_adapter = gctx.create_outadapter(config.out_adapter, dataset_folder=config.dataset_folder, file_name="labels")
    out_adapter.save(config.output_folder, "labels")
    # Generator expression: each input adapter is only built when the generator is
    # consumed, so `**config.hyperparameters` is read at that point, i.e. after the
    # reassignment a few lines below. Do not reorder or materialise this without
    # checking that dependency.
    in_adapters = (
        gctx.create_inadapter(
            config.in_adapter, dataset=dataset, tokenizer=tokenizer, out_adapter=out_adapter, gazetteer=gazetteer,
            **config.hyperparameters
        )
        for dataset in raw_datasets
    )
    token_embeddings = tokenizer.embeddings()
    lexicon_embeddings = gazetteer.embeddings()
    config.hyperparameters = update_hyperparameters(tokenizer, out_adapter, gazetteer)
    collate_fn = gctx.create_batcher(
        config.model, input_pad=tokenizer.pad_id, output_pad=out_adapter.unk_id, lexicon_pad=gazetteer.pad_id, device=config.device
    )
    model = gctx.create_model(config.model, **config.hyperparameters, token_embeddings=token_embeddings, lexicon_embeddings=lexicon_embeddings)
    trainer = NERTrainer(
        config, tokenizer, in_adapters, out_adapter, collate_fn, model, nn.CrossEntropyLoss(),
        gctx.create_traincallback(config.model), gctx.create_testcallback(config.model)
    )
    results = trainer.train()

    # The trainer is returned alongside the results so the caller can query it afterwards.
    return results, trainer

In [28]:
def trainall(labpath: str, cfgdir: str, m: List[str], d: List[str], n: int, **kwargs) -> None:
    """
    Experiments for all model's training.

    For every dataset and model, a training run is started for each gazetteer
    proportion in [0.2, 0.4, 0.6, 0.8, 1.0]. A combination is skipped once the
    tracker already holds at least `n` rows for its (dataset, model, tag), so an
    interrupted experiment can be resumed. The tracker is saved to `labpath`
    after every finished run.

    Args:
        labpath (str): The file path of recording experimental results.
        cfgdir (str): Configuration folder.
        m (List[str]): All specific models to be trained. Empty means all models.
        d (List[str]): All specific datasets to be tested. Empty means all datasets.
        n (int): The number of training repeating times.
        **kwargs: Extra keyword arguments forwarded to `TrainerConfig`.

    Raises:
        SystemExit: If a requested model or dataset name is unknown.
    """

    def update_names(names: List[str], all_names: List[str], name_type: str) -> List[str]:
        """
        Check whether the name that user inputs is correct.

        Args:
            names (List[str]): The names (dataset, model, gazetteer) that user inputs.
            all_names (List[str]): All names (dataset, model, gazetteer) that this library provides.
            name_type (str): The type of the name (Dataset, Model, Gazetteer).

        Returns:
            List[str]: `all_names` when `names` is empty, otherwise `names` unchanged.
        """
        if not names:
            return all_names
        for name in names:
            if name not in all_names:
                print("[{0}] {1} is not in {2}".format(name_type, name, all_names))
                # Raise explicitly with a non-zero status: the interactive `exit`
                # helper reported success (0) and is not guaranteed to stop
                # execution immediately in every front end.
                raise SystemExit(1)
        return names

    tracker = NERTracker.load(labpath)
    models = update_names(m, gctx.get_model_names(), "Model")
    datasets = update_names(d, gctx.get_dataset_names(), "Dataset")

    print("--------------------- Laboratory Configuration ---------------------")
    print("Models: {0}".format(models))
    print("Datasets: {0}".format(datasets))
    print("--------------------------------------------------------------------")

    for dataset in datasets:
        for model in models:
            for _ in range(n):
                for pp in [0.2, 0.4, 0.6, 0.8, 1.0]:
                    tag = "lexicon-pp:{0}".format(pp)
                    # Skip combinations that already have enough recorded runs.
                    if len(tracker.query(dataset=dataset, model=model, tag=tag)) >= n:
                        continue
                    config = TrainerConfig(os.path.join(cfgdir, model + ".yml"), dataset=dataset, **kwargs)
                    start = str(datetime.now())
                    try:
                        results, trainer = train(config, pp)
                    except RuntimeError as error:
                        # Best effort: report the failure and move on to the next proportion.
                        print(error)
                        continue
                    tracker.insert(
                        NERTrackerRow(
                            start, model, dataset, config.tokenizer_model, config.gazetteer_model, config.output_folder, query_time(trainer.train),
                            results["f1-score"], results["precision-score"], results["recall-score"], results["epoch_count"], results["test-loss"], tag
                        )
                    )
                    # Persist after every run so finished work survives an interruption.
                    tracker.save(labpath)
                    del trainer

In [None]:
# Experiment-tracker file: trainall() loads it with NERTracker.load and saves it back after every finished run.
labpath = "../data/logs/experiments_gazetteer_size.csv"
# Folder of per-model configuration files; trainall() reads "<cfgdir>/<model>.yml".
cfgdir = "../configs"
# Models and datasets to run; trainall() checks each name against
# gctx.get_model_names() / gctx.get_dataset_names().
models = ["ses", "cgn", "mdgg"]
datasets = ["weiboner"]
# Number of repetitions per (dataset, model, lexicon proportion) combination.
n = 1
# Extra keyword arguments forwarded unchanged to TrainerConfig.
kwargs = {"data_folder": "../data"}

trainall(labpath, cfgdir, models, datasets, n, **kwargs)

--------------------- Laboratory Configuration ---------------------
Models: ['ses', 'cgn', 'mdgg']
Datasets: ['weiboner']
--------------------------------------------------------------------
[Dataset: ../data/datahub/weiboner] 1350 train, 270 dev, 270 test. (resplit: False)
Total: 704368, Remaining: 140873, Masked: 563495


Text2Tensor: 100%|██████████| 1350/1350 [00:00<00:00, 1834.29it/s]
Text2Tensor: 100%|██████████| 270/270 [00:00<00:00, 1828.44it/s]
Text2Tensor: 100%|██████████| 270/270 [00:03<00:00, 83.16it/s]
Epoch 0: 100%|██████████| 22/22 [00:05<00:00,  3.91it/s]
2020-11-19 06:27:44 [ses, weiboner] epoch: 0, no_improvement: 0, dev-f1: 0.16335, dev-precision: 0.36283, dev-recall: 0.1054, dev-loss: 11.10511, train-loss: 19.79544
Epoch 1: 100%|██████████| 22/22 [00:05<00:00,  3.83it/s]
2020-11-19 06:27:51 [ses, weiboner] epoch: 1, no_improvement: 0, dev-f1: 0.42493, dev-precision: 0.47319, dev-recall: 0.3856, dev-loss: 7.98017, train-loss: 9.97929
Epoch 2: 100%|██████████| 22/22 [00:05<00:00,  3.94it/s]
2020-11-19 06:27:59 [ses, weiboner] epoch: 2, no_improvement: 0, dev-f1: 0.51223, dev-precision: 0.51289, dev-recall: 0.51157, dev-loss: 7.14867, train-loss: 6.49991
Epoch 3: 100%|██████████| 22/22 [00:05<00:00,  3.95it/s]
2020-11-19 06:28:06 [ses, weiboner] epoch: 3, no_improvement: 0, dev-f1: 0.5245

[Timing] kaner.trainer.base.train function took 208.714 sec
# Save experimental data into ../data/logs/experiments_gazetteer_size.csv
[Dataset: ../data/datahub/weiboner] 1350 train, 270 dev, 270 test. (resplit: False)
Total: 704368, Remaining: 281747, Masked: 422621


Text2Tensor: 100%|██████████| 1350/1350 [00:03<00:00, 357.78it/s]
Text2Tensor: 100%|██████████| 270/270 [00:00<00:00, 1730.86it/s]
Text2Tensor: 100%|██████████| 270/270 [00:00<00:00, 1767.22it/s]
Epoch 0: 100%|██████████| 22/22 [00:09<00:00,  2.41it/s]
2020-11-19 06:31:28 [ses, weiboner] epoch: 0, no_improvement: 0, dev-f1: 0.18785, dev-precision: 0.33117, dev-recall: 0.13111, dev-loss: 11.84804, train-loss: 20.37469
Epoch 1: 100%|██████████| 22/22 [00:05<00:00,  3.96it/s]
2020-11-19 06:31:35 [ses, weiboner] epoch: 1, no_improvement: 0, dev-f1: 0.41252, dev-precision: 0.46178, dev-recall: 0.37275, dev-loss: 8.19686, train-loss: 10.16248
Epoch 2: 100%|██████████| 22/22 [00:05<00:00,  3.90it/s]
2020-11-19 06:31:42 [ses, weiboner] epoch: 2, no_improvement: 0, dev-f1: 0.48969, dev-precision: 0.49096, dev-recall: 0.48843, dev-loss: 7.26791, train-loss: 6.87067
Epoch 3: 100%|██████████| 22/22 [00:05<00:00,  3.95it/s]
2020-11-19 06:31:49 [ses, weiboner] epoch: 3, no_improvement: 0, dev-f1: 0.