#### Assistive Auto Labelling
Before we import our sentences into doccano for manual labelling, we'll use the auto_tagger to automatically label them. This will make it easier to do the manual labelling as a few entities will already be correctly or almost correctly identified.

In [None]:
# Load the auto_tagger model as a Hugging Face token-classification (NER) pipeline.
from transformers import pipeline
assistive_auto_tagger = pipeline(
    "token-classification",
    model='auto_tagger',  # model name/path handed to the pipeline — presumably a locally fine-tuned checkpoint; verify
    aggregation_strategy="average",  # group sub-word tokens into whole entities (exact scoring rule: see the transformers docs)
    ignore_labels=[""],  # replaces the library default; presumably so 'O' spans stay in the output — the Doccano converter in this notebook filters on 'O' itself
    device=0  # NOTE(review): device index 0 — per the transformers docs this targets the first GPU; confirm one is available
)

In [None]:
# Tag every sentence with the NER pipeline and store the raw output per row.
# Only the first 512 characters of each sentence are passed to the model.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(
    lambda sentence: assistive_auto_tagger(sentence[:512])
)

In [None]:
# Preview the raw pipeline output stored for the first 10 rows.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Re-align each row's tagged entities with its original sentence text.
# data_processing.reformat_sentence is a project helper defined outside this
# section; its result overwrites the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences.apply(
    lambda row: data_processing.reformat_sentence(row['sentence'], row['tagged_entities']),
    axis=1,
)

In [None]:
# Preview the 'tagged_entities' column again (first 10 rows) after reformatting.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
def ner_entity_dict_to_doccano_jsonl(tagged_entities):
    """Convert Hugging Face NER pipeline output into Doccano-ready parts.

    Useful when the NER pipeline is used for assistive auto labelling before
    the manual labelling work is done in Doccano.

    Args:
        tagged_entities: Iterable of entity dicts, each providing the keys
            'word', 'entity_group', 'start' and 'end'.

    Returns:
        A tuple ``(full_sentence_text, entities_metadata)``:

        * ``full_sentence_text`` -- every entity's 'word' followed by a single
          space, concatenated in order, with each ' ,' collapsed to ','.
          A trailing space is therefore present for non-empty input.
        * ``entities_metadata`` -- a list of ``[start, end, entity_group]``
          for every entity whose group is not 'O'.

    Raises:
        KeyError: If an entity dict lacks one of the required keys.
    """
    words = []
    entities_metadata = []
    # Single pass so that one-shot iterables (e.g. generators) are supported.
    for entity in tagged_entities:
        words.append(entity['word'])
        # Get a list of the entities in the format [START, END, TYPE] for each
        # of them; 'O' (outside) spans carry no label and are skipped.
        # NOTE(review): offsets are copied as-is; they index the text that
        # produced them, not necessarily the rebuilt sentence -- confirm.
        if entity['entity_group'] != 'O':
            entities_metadata.append([entity['start'], entity['end'], entity['entity_group']])

    # Join the entities texts to get the full sentence. Built with join rather
    # than repeated '+=' to avoid quadratic string copying.
    full_sentence_text = "".join(word + ' ' for word in words)
    # Literal replacement: re-attach commas to the preceding word.
    full_sentence_text = full_sentence_text.replace(' ,', ',')
    return full_sentence_text, entities_metadata

In [None]:
# Preview the first rows of the DataFrame.
df_unique_sentences.head()

In [None]:
# Sanity-check the converter on the row with index label 2 and print the
# resulting (text, spans) tuple.
print(ner_entity_dict_to_doccano_jsonl(df_unique_sentences['tagged_entities'].loc[2]))

# Keep only the [start, end, type] spans (element 1 of the returned tuple) in
# a new 'label' column. Only 'tagged_entities' is read, so apply over that
# column directly instead of row-wise over the whole frame.
df_unique_sentences['label'] = df_unique_sentences['tagged_entities'].apply(
    lambda entities: ner_entity_dict_to_doccano_jsonl(entities)[1]
)

In [None]:
# Write the DataFrame to 'unique_sentences.jsonl' under path_to_train as
# JSON Lines: one JSON record per row, one row per line.
df_unique_sentences.to_json(
    os.path.join(path_to_train, 'unique_sentences.jsonl'),
    orient='records',
    lines=True,
)

In [None]:
# Inspect the row(s) whose sentence_id equals 2507.
df_unique_sentences[df_unique_sentences['sentence_id'] == 2507]