# Install Necessary Updates & Tools

In [None]:
!pip install allennlp-light
!pip install -U datasets
!pip install -U torch torchvision torchaudio
!pip install transformers -U

In [None]:
!pip install tner
# or
# !git clone https://github.com/asahi417/tner
# !cd tner

In [None]:
from tner import GridSearcher
from tner import TransformersNER
import os

In [3]:
def gridsearcher_init(CHECKPOINT_DIR, MODEL_PATH, dataset='tner/fin'):
    """Build a tner ``GridSearcher`` with this notebook's fixed fine-tuning settings.

    Args:
        CHECKPOINT_DIR: Passed to ``GridSearcher`` as ``checkpoint_dir``.
        MODEL_PATH: Language model to fine-tune; passed as ``model``.
        dataset: Hugging Face dataset name; passed as ``dataset``. Defaults to
            ``'tner/fin'``, the value that was previously hard-coded here, so
            existing two-argument calls behave exactly as before.

    Returns:
        The configured ``GridSearcher`` instance. Training is not started
        here; the caller invokes ``.train()`` on the returned object.
    """
    return GridSearcher(
        checkpoint_dir=CHECKPOINT_DIR,
        dataset=dataset,  # either of `dataset` (huggingface dataset) or `local_dataset` (custom dataset) should be given
        model=MODEL_PATH,  # language model to fine-tune
        epoch=10,  # the total epoch (`L` in the figure)
        epoch_partial=5,  # the number of epoch at 1st stage (`M` in the figure)
        n_max_config=3,  # the number of models to pass to 2nd stage (`K` in the figure)
        batch_size=16,
        # The list-valued settings below are passed through unchanged.
        # NOTE(review): presumably each list is one axis of the search grid — confirm against the tner docs.
        gradient_accumulation_steps=[4, 8],
        crf=[True, False],
        lr=[1e-4, 1e-5],
        weight_decay=[1e-7],
        random_seed=[42],
        lr_warmup_step_ratio=[0.1],
        max_grad_norm=[10],
    )

# Run finetuning & evaluation via TNER

In [None]:
# Hugging Face dataset identifier; details on the dataset are at
# https://huggingface.co/datasets/tner/fin
DATASET_PATH = "tner/fin"

## phi base model

In [4]:
# Directory handed to the grid search as its checkpoint directory.
CHECKPOINT_PATH = "./resulting_models/phi-1_5/"
# Language model to fine-tune.
MODEL_PATH = "microsoft/phi-1_5"

In [7]:
# Build the grid search for the current checkpoint directory / model pair,
# then start training.
searcher = gridsearcher_init(CHECKPOINT_DIR=CHECKPOINT_PATH, MODEL_PATH=MODEL_PATH)
searcher.train()

In [None]:
# Placeholder: specify the best checkpoint here after the searcher results are in.
best_checkpoint = ""

In [None]:
# Fail fast while the placeholder is still empty: os.path.join with an empty
# name just returns CHECKPOINT_PATH itself, not a specific checkpoint.
if not best_checkpoint:
    raise ValueError("Set `best_checkpoint` to the best checkpoint from the grid search before evaluating.")

# Provide a model alias on huggingface or the path to the best epoch model.
model = TransformersNER(os.path.join(CHECKPOINT_PATH, best_checkpoint))
metric = model.evaluate(DATASET_PATH, dataset_split='test')  # huggingface dataset evaluation
# metric = model.evaluate(local_dataset={"test": "examples/local_dataset_sample/test.txt"}, dataset_split='test') # local dataset evaluation
metric  # last expression, so the evaluation result is shown as the cell output

## pythia base model

In [None]:
# Directory handed to the grid search as its checkpoint directory.
CHECKPOINT_PATH = "./resulting_models/pythia/"
# Language model to fine-tune.
MODEL_PATH = "EleutherAI/pythia-1.4b"

In [None]:
# Build the grid search for the current checkpoint directory / model pair,
# then start training.
searcher = gridsearcher_init(CHECKPOINT_DIR=CHECKPOINT_PATH, MODEL_PATH=MODEL_PATH)
searcher.train()

In [None]:
# Placeholder: specify the best checkpoint here after the searcher results are in.
best_checkpoint = ""

In [None]:
# Fail fast while the placeholder is still empty: os.path.join with an empty
# name just returns CHECKPOINT_PATH itself, not a specific checkpoint.
if not best_checkpoint:
    raise ValueError("Set `best_checkpoint` to the best checkpoint from the grid search before evaluating.")

# Provide a model alias on huggingface or the path to the best epoch model.
model = TransformersNER(os.path.join(CHECKPOINT_PATH, best_checkpoint))
metric = model.evaluate(DATASET_PATH, dataset_split='test')  # huggingface dataset evaluation
# metric = model.evaluate(local_dataset={"test": "examples/local_dataset_sample/test.txt"}, dataset_split='test') # local dataset evaluation
metric  # last expression, so the evaluation result is shown as the cell output

## phi-beancounter model

In [None]:
# Directory handed to the grid search as its checkpoint directory.
CHECKPOINT_PATH = "./resulting_models/phi_bc/"
# Language model to fine-tune.
MODEL_PATH = "bradfordlevy/phi-1_5-bc-cp"

In [None]:
# Build the grid search for the current checkpoint directory / model pair,
# then start training.
searcher = gridsearcher_init(CHECKPOINT_DIR=CHECKPOINT_PATH, MODEL_PATH=MODEL_PATH)
searcher.train()

In [None]:
# Placeholder: specify the best checkpoint here after the searcher results are in.
best_checkpoint = ""

In [None]:
# Fail fast while the placeholder is still empty: os.path.join with an empty
# name just returns CHECKPOINT_PATH itself, not a specific checkpoint.
if not best_checkpoint:
    raise ValueError("Set `best_checkpoint` to the best checkpoint from the grid search before evaluating.")

# Provide a model alias on huggingface or the path to the best epoch model.
model = TransformersNER(os.path.join(CHECKPOINT_PATH, best_checkpoint))
metric = model.evaluate(DATASET_PATH, dataset_split='test')  # huggingface dataset evaluation
# metric = model.evaluate(local_dataset={"test": "examples/local_dataset_sample/test.txt"}, dataset_split='test') # local dataset evaluation
metric  # last expression, so the evaluation result is shown as the cell output

## pythia-beancounter model

In [None]:
# Language model to fine-tune, local or via huggingface.
MODEL_PATH = "bradfordlevy/pythia-1.4b-bc-cp"
# Directory where checkpoints will be saved during finetuning.
CHECKPOINT_PATH = "./resulting_models/pythia_bc/"

In [None]:
# Build the grid search for the current checkpoint directory / model pair,
# then start training.
searcher = gridsearcher_init(CHECKPOINT_DIR=CHECKPOINT_PATH, MODEL_PATH=MODEL_PATH)
searcher.train()

In [None]:
# Placeholder: specify the best checkpoint here after the searcher results are in.
best_checkpoint = ""

In [None]:
# Fail fast while the placeholder is still empty: os.path.join with an empty
# name just returns CHECKPOINT_PATH itself, not a specific checkpoint.
if not best_checkpoint:
    raise ValueError("Set `best_checkpoint` to the best checkpoint from the grid search before evaluating.")

# Provide a model alias on huggingface or the path to the best epoch model.
model = TransformersNER(os.path.join(CHECKPOINT_PATH, best_checkpoint))
metric = model.evaluate(DATASET_PATH, dataset_split='test')  # huggingface dataset evaluation
# metric = model.evaluate(local_dataset={"test": "examples/local_dataset_sample/test.txt"}, dataset_split='test') # local dataset evaluation
metric  # last expression, so the evaluation result is shown as the cell output

# Troubleshooting

**Pad-token-related error:** open `tner/tner/ner_tokenizer.py` and change line 43 to: \
    `self.tokenizer.pad_token = self.tokenizer.unk_token` \
    `self.tokenizer.pad_token_id = self.tokenizer.unk_token_id`

OR

Save the tokenizer, with `pad_token` and `pad_token_id` configured, to the location given by `MODEL_PATH`.