# Annotation by Model

This notebook takes a pre-trained local model and uses it for NER tasks.
The model's goal is to identify and label text that references sections within the act.

Prior to this notebook, you must have run `Training a Model` to generate the trained model.

Make sure the labels you want the model to use are defined in the `label_list` variable.

The input file should be a JSONL file whose objects match this format:
```json
{"meta": {"identity": 73955, "sectionId": "12", "sectionName": "Repealed", "ActId": "Civil Forfeiture Act"}, "text": "repealed 12 [ repealed 2023 - 13 - 11. ]", "label": []}
```

In [None]:
%pip install transformers

In [None]:
# Build the tokenizer from the pretrained BERT checkpoint named below.
# The "fast" tokenizer class must be used so that the start and end indices
# of each token are available.
from transformers import BertTokenizerFast

checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

In [None]:
import json

# Adjust the trained model's config so its label maps use our label names.
# The file is opened with context managers so that the rewritten config is
# flushed and closed before the model is loaded from the same directory.
config_path = "exported_models/fine_tuned_ner_model/config.json"
with open(config_path, "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

label_list = ["O", "B_ACT", "I_ACT", "B_REF_IN", "I_REF_IN", "B_REF_EX", "I_REF_EX"]
# JSON object keys are always strings, so id2label is keyed by str(i).
id2label = {str(i): label for i, label in enumerate(label_list)}
# label2id maps each label name to its integer index; the values are written
# as JSON numbers rather than strings.
label2id = {label: i for i, label in enumerate(label_list)}
config["id2label"] = id2label
config["label2id"] = label2id

with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, indent=2)

# Load our trained model
from transformers import AutoModelForTokenClassification     # This class is responsible for loading the model into memory
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("exported_models/fine_tuned_ner_model")

# Pipeline handles the NER process for a single chunk of text
from transformers import pipeline
nlp = pipeline(
    "ner",
    model=model_fine_tuned,
    tokenizer=tokenizer,
    ignore_labels=["O"],  # ignore_labels is ["O"] by default
)

# Enable offsets so we have a start and end for labels
# NOTE(review): this sets an attribute on the pipeline object after creation;
# confirm the installed transformers version actually reads it.
tokenizer_kwargs = {"return_offsets_mapping": True}
nlp.tokenizer_kwargs = tokenizer_kwargs

In [None]:
# How to use the pipeline:
# call it with a single string of text and it returns the NER results for
# that text. Later cells read the "start", "end" and "entity" keys of each
# returned item.
example = "This can be found in section 14.07 in this act."
ner_results = nlp(example)
# Print the raw results so the output structure can be inspected.
print(ner_results)

In [None]:
def convert_ner_result(ner_result):
    """Convert one label produced by the model into doccano's label format.

    Args:
        ner_result: Mapping with "start", "end" and "entity" keys.

    Returns:
        A three-item list ``[start, end, tag]`` where ``tag`` is the
        "entity" value with its first two characters removed (for example
        the "B_" or "I_" prefix of "B_ACT" / "I_ACT").
    """
    return [
        ner_result["start"],
        ner_result["end"],
        ner_result["entity"][2:],  # drop the two-character prefix
    ]

In [None]:
def merge_overlapping_tags(tags):
    """Merge overlapping or touching tags that carry the same label.

    A tag is merged into the previously kept tag when both have the same
    label and the tag starts no later than one position after the previous
    tag's end (``start <= previous_end + 1``). When merging, the previous
    tag's end is extended to the larger of the two ends.

    Args:
        tags: Iterable of ``[start, end, label]`` sequences. They do not
            need to be ordered; they are sorted by start offset here.

    Returns:
        A new list of ``[start, end, label]`` lists. Neither ``tags`` nor
        the sequences inside it are modified.
    """
    merged_tags = []
    # sorted() is stable, so input that is already ordered by start offset
    # is processed in exactly its original order.
    for tag in sorted(tags, key=lambda item: item[0]):
        previous = merged_tags[-1] if merged_tags else None
        starts_new_span = (
            previous is None
            or tag[0] > previous[1] + 1
            or tag[2] != previous[2]
        )
        if starts_new_span:
            # Store a copy so that extending a span later never mutates
            # the caller's data (and so tuples are accepted as input).
            merged_tags.append(list(tag))
        else:
            # Overlap and labels are the same: extend the end.
            previous[1] = max(previous[1], tag[1])

    return merged_tags

In [None]:
# From the input we would have imported into doccano,
# replace the empty labels with model-generated labels and write to file.
# Both files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so relying on the platform default encoding could
# fail on non-ASCII text.
with open("./doccano_import.jsonl", "r", encoding="utf-8") as source_file:
    with open("./model_annotated_output.jsonl", "w", encoding="utf-8") as output_file:
        for line in source_file:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            record = json.loads(line)
            # One [start, end, tag] entry per label returned by the pipeline
            predicted_spans = [
                convert_ner_result(result) for result in nlp(record["text"])
            ]
            record["label"] = merge_overlapping_tags(predicted_spans)
            json.dump(record, output_file, ensure_ascii=False)
            output_file.write("\n")
    