<a href="https://colab.research.google.com/github/cdrc1103/NER/blob/main/Bert_for_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers

In [2]:
%%capture
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Single checkpoint id shared by tokenizer and model, so the two can never
# be loaded from different checkpoints by accident.
MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [20]:
# Build a token-classification ("ner") pipeline around the model and
# tokenizer loaded in the previous cell.
ner = pipeline("ner", model=model, tokenizer=tokenizer)
# NOTE: the literal's leading newline and its line break are part of the text,
# so they count toward the 'start'/'end' character offsets in the results.
example = """
My name is Wolfgang and I live in Berlin. Recently, I started working 
at Capgemini were I work as a data scientist.
"""

# One dict per tagged token (entity tag, score, token index, word piece,
# start/end character offsets) - see the printed output below.
ner_results = ner(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.99913836, 'index': 4, 'word': 'Wolfgang', 'start': 12, 'end': 20}, {'entity': 'B-LOC', 'score': 0.9996517, 'index': 9, 'word': 'Berlin', 'start': 35, 'end': 41}, {'entity': 'B-ORG', 'score': 0.99671835, 'index': 17, 'word': 'Cap', 'start': 75, 'end': 78}, {'entity': 'I-ORG', 'score': 0.9911644, 'index': 18, 'word': '##ge', 'start': 78, 'end': 80}, {'entity': 'I-ORG', 'score': 0.9921948, 'index': 19, 'word': '##mini', 'start': 80, 'end': 84}]


In [55]:
import copy

def join_results(results):
    """Merge WordPiece continuation tokens into the entity they extend.

    The NER pipeline reports one dict per tagged token, so a word that the
    tokenizer split into pieces (``"Cap"``, ``"##ge"``, ``"##mini"``) arrives
    as several entries. This folds each continuation piece into the entry
    before it.

    Args:
        results: Iterable of token dicts with at least the keys ``"word"``,
            ``"score"`` and ``"end"``; ``"start"`` is used when present.

    Returns:
        A new list of dicts. A merged entry keeps the first piece's fields,
        with ``"word"`` concatenated, ``"end"`` taken from the last piece and
        ``"score"`` set to the lowest score of its pieces. The input dicts
        are not modified.
    """
    subword_marker = "##"
    joined_results = []
    for result in results:
        word = result["word"]
        previous = joined_results[-1] if joined_results else None
        start = result.get("start")
        is_continuation = (
            previous is not None
            # Only a leading marker denotes a continuation piece; a "##"
            # elsewhere in the token is ordinary text.
            and word.startswith(subword_marker)
            # The piece must begin exactly where the previous entry ends.
            # Otherwise its first piece was not tagged and it would be glued
            # onto an unrelated entity. Without offsets, merge as before.
            and (start is None or start == previous.get("end"))
        )
        if is_continuation:
            previous["end"] = result["end"]
            previous["word"] += word[len(subword_marker):]
            # The merged entity is only as trustworthy as its weakest piece.
            previous["score"] = min(previous["score"], result["score"])
        else:
            # Shallow copy so later merges never write into the caller's dicts.
            joined_results.append(dict(result))
    return joined_results


def clean_result(result):
    """Add a ``"label"`` key holding the entity type without its tag prefix.

    Everything up to and including the first ``"-"`` of ``result["entity"]``
    is dropped, so ``"B-PER"`` and ``"I-PER"`` both become ``"PER"``. A tag
    that contains no ``"-"`` (for example ``"O"``) is used unchanged.

    Args:
        result: Entity dict with an ``"entity"`` key. It is modified in place.

    Returns:
        The same dict object, now carrying the additional ``"label"`` key.
    """
    entity = result["entity"]
    _, separator, entity_type = entity.partition("-")
    # No separator means there is no prefix to strip; keep the tag as it is
    # rather than producing an empty or padded label.
    result["label"] = entity_type if separator else entity
    return result


def remove_prefix(word, prefix):
    """Return the part of ``word`` after the first occurrence of ``prefix``.

    Despite the name, ``prefix`` may occur anywhere in ``word``; everything up
    to and including its first occurrence is discarded. When ``prefix`` does
    not occur at all, ``word`` is returned with a single leading space.
    """
    _, found, remainder = word.partition(prefix)
    if not found:
        return " " + word
    return remainder


def convert_to_displacy_format(example, ner_results, threshold=0.9):
    """Turn raw NER pipeline output into input for displaCy's manual mode.

    Works on a deep copy of ``ner_results``, so the caller's list and dicts
    stay untouched.

    Args:
        example: The text the entities were extracted from.
        ner_results: Token-level entity dicts as returned by the pipeline.
        threshold: Entities whose (merged) score is not strictly greater than
            this value are dropped.

    Returns:
        A one-element list containing a dict with the keys ``"text"``,
        ``"ents"`` and ``"title"``.
    """
    merged = join_results(copy.deepcopy(ner_results))

    entities = []
    for entity in merged:
        if entity["score"] > threshold:
            entities.append(clean_result(entity))

    document = {
        "text": example,
        "ents": entities,
        "title": None
    }
    return [document]

In [16]:
from spacy import displacy
# Convert the pipeline output for `example` into displaCy's manual-entity
# format and render the highlighted text inline in the notebook.
displacy_results = convert_to_displacy_format(example, ner_results)
displacy.render(displacy_results, style="ent", jupyter=True, manual=True)

In [18]:
%%capture
!pip install gradio
import gradio as gr

In [34]:
def inference_pipeline(input_text: str) -> str:
    """Run the NER model on ``input_text`` and return the annotated text.

    Args:
        input_text: Free text to analyse.

    Returns:
        HTML markup produced by displaCy's entity visualizer, with the
        recognised entities highlighted.
    """
    ner_results = ner(input_text)
    displacy_results = convert_to_displacy_format(input_text, ner_results)
    # jupyter=False is required: when displaCy auto-detects a notebook it
    # displays the markup itself and returns None, leaving the caller with
    # nothing to show.
    return displacy.render(
        displacy_results, style="ent", jupyter=False, manual=True
    )

In [None]:
# Sample text offered as a clickable example below the input box.
EXAMPLE_INPUT = (
    "My name is Wolfgang and I live in Berlin. Recently, I started working "
    "at Brainlab where I work as a data scientist."
)

TITLE = "Named Entity Recognition"
DESCRIPTION = (
    "Insert a text of your choice and let it be processed by the "
    "NER Model for Organizations, Persons, and Locations."
)

# Plain text in, rendered HTML out (inference_pipeline returns markup).
inputs = gr.Textbox(label="Input Text")
outputs = gr.HTML()

demo = gr.Interface(
    fn=inference_pipeline,
    inputs=inputs,
    outputs=outputs,
    title=TITLE,
    # One inner list per example, one element per input component.
    examples=[[EXAMPLE_INPUT]],
    description=DESCRIPTION,
    allow_flagging="never",
)
# debug=True keeps the cell running so errors raised by the app show up in
# the cell output.
demo.launch(debug=True)