# Inference

This notebook produces the output required for a submission to the competition.

In [1]:
from biome.text import Pipeline, Dataset
from typing import List, Dict
from helper import get_custom_tokenizer_v1
from spacy.gold import offsets_from_biluo_tags
from allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul
import warnings
from pathlib import Path
import glob
import spacy
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [10]:
# Archive of the trained pipeline to load. Keep exactly one `model_path`
# assignment active.
# NOTE(review): the "RNN model" figures further down were presumably produced by
# re-running this cell with the RNN path enabled — confirm before a fresh run.
model_path = "../experiments/final_transformer_model/model.tar.gz"
#model_path = "../experiments/final_rnn_model/model.tar.gz"

# `pipeline` is the global model object used by the timing and prediction cells below.
pipeline = Pipeline.from_pretrained(model_path)

**Transformer model**

In [4]:
# Trainable-parameter count of the currently loaded pipeline.
pipeline.num_trainable_parameters

110446890

In [11]:
%%timeit
# Time a single predict call on a tiny input for the currently loaded pipeline.
pipeline.predict(["test", "this"])

24.5 ms ± 854 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


**RNN model**

In [7]:
# Same expression as in the transformer section; it only reports the RNN figure
# if `pipeline` was loaded from the RNN archive beforehand.
# NOTE(review): depends on which `model_path` was active — confirm.
pipeline.num_trainable_parameters

15187102

In [9]:
%%timeit
# Same timing call as in the transformer section, for the currently loaded pipeline.
pipeline.predict(["test", "this"])

3.7 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Transformer / RNN ratios: (trainable parameters, mean %%timeit latency in ms).
# The four numbers are copied by hand from the cell outputs above.
110446890 / 15187102, 24.5 / 3.7

(7.272413789016496, 6.621621621621621)

# Preprocess input data

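Apply the same preprocessing that was used for the training data: tokenize each text with the custom tokenizer and, optionally, replace tokens that the BERT tokenizer maps to no input ids of their own.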

In [57]:
# Tokenizer from the project's `helper` module; passed as `nlp` to get_inference_dataset below.
nlp = get_custom_tokenizer_v1()

In [58]:
# Hugging Face tokenizer used by get_inference_dataset to detect tokens that
# yield at most two input ids (see the replacement logic there).
bert_tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased') 

In [59]:
# Paths of all test/background tweet files, in sorted order of their path strings.
txt_files = [
    Path(file_path)
    for file_path in sorted(
        glob.glob("../raw_data/profner_test+background/subtask-1/test-background-txt-files/*.txt")
    )
]

In [60]:
def get_inference_dataset(
    txt_files: List[Path],
    nlp: "spacy.language.Language",
    replace_antibert_token_with: str = None,
    bert_tokenizer: "transformers.AutoTokenizer" = None
):
    """Tokenize raw text files and collect them in a ``Dataset`` for inference.

    Parameters
    ----------
    txt_files
        Paths of the text files to process; each file is read as one document.
    nlp
        Callable that turns a text into a document whose items are the tokens
        and which exposes the original text as ``.text``.
    replace_antibert_token_with
        If given, every token for which ``bert_tokenizer`` returns at most two
        input ids is replaced by this string (presumably those two ids are just
        the special start/end tokens, i.e. the token has no word piece — confirm).
        If ``None`` (default), tokens are left untouched.
    bert_tokenizer
        Tokenizer used for the check above. Required when
        ``replace_antibert_token_with`` is set, ignored otherwise.

    Returns
    -------
    Dataset
        One row per file with the columns ``raw_text``, ``tokens`` and ``file_name``.

    Raises
    ------
    ValueError
        If a replacement token is requested but no ``bert_tokenizer`` is passed.
    """
    # Fail fast with a clear message instead of a "'NoneType' is not callable"
    # error in the middle of the (long) loop over all files.
    if replace_antibert_token_with is not None and bert_tokenizer is None:
        raise ValueError(
            "A `bert_tokenizer` is required when `replace_antibert_token_with` is set."
        )

    data = {
        "raw_text": [],
        "tokens": [],
        "file_name": [],
    }

    for txt in tqdm(txt_files, total=len(txt_files)):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding (the texts contain non-ASCII characters).
        doc = nlp(txt.read_text(encoding="utf-8"))

        tokens_str = [str(token) for token in doc]
        if replace_antibert_token_with is not None:
            for i, token in enumerate(tokens_str):
                input_ids = bert_tokenizer([token], is_split_into_words=True)["input_ids"]
                # At most two ids: nothing but the surrounding ids is left for this token.
                if len(input_ids) <= 2:
                    tokens_str[i] = replace_antibert_token_with

        # Keep the untouched text next to the (possibly modified) tokens.
        data["raw_text"].append(doc.text)
        data["tokens"].append(tokens_str)
        data["file_name"].append(txt.name)

    return Dataset.from_dict(data)

In [61]:
# Build the inference dataset from all text files. Tokens for which the BERT
# tokenizer returns at most two input ids are replaced with "æ".
dataset = get_inference_dataset(
    txt_files,
    nlp,
    replace_antibert_token_with="æ",
    bert_tokenizer=bert_tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=27000.0), HTML(value='')))




In [62]:
def batch_prediction(tokens_list):
    """Run the globally loaded ``pipeline`` on one batch of token lists.

    ``tokens_list`` holds one list of tokens per example. Each is wrapped in a
    ``{"tokens": ...}`` input dict, and whatever ``pipeline.predict`` returns
    for the batch is handed back under the ``"predictions"`` key.
    """
    model_inputs = []
    for example_tokens in tokens_list:
        model_inputs.append({"tokens": example_tokens})
    model_outputs = pipeline.predict(batch=model_inputs)
    return {"predictions": model_outputs}
    
# Feed the "tokens" column through batch_prediction in batches of 64; the
# returned "predictions" values are consumed as a column by the cells below.
dataset = dataset.map(batch_prediction, input_columns="tokens", batched=True, batch_size=64)

HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))




# Classification output

In [64]:
def batch_classification_output(file_names: List[str], predictions: List[Dict]):
    """Build the classification submission columns for one batch.

    The tweet id is the part of each file name before the first ``'.'``; the
    label is the first entry of each prediction's ``"classification_labels"``.
    """
    tweet_ids = []
    for name in file_names:
        # Same result as name.split('.')[0]: everything before the first dot.
        tweet_ids.append(name.partition('.')[0])

    labels = []
    for model_output in predictions:
        labels.append(model_output["classification_labels"][0])

    return {"tweet_id": tweet_ids, "label": labels}
    
# Derive the classification columns ("tweet_id", "label") from the predictions.
# All columns of `dataset` are dropped, so `ds` only holds the function's output.
ds = dataset.map(
    batch_classification_output,
    input_columns=["file_name", "predictions"],
    batched=True,
    batch_size=2,
    remove_columns=dataset.column_names
)

HBox(children=(FloatProgress(value=0.0, max=13500.0), HTML(value='')))




In [65]:
# `head(n=None)` is used to get the whole dataset as a table (the output below
# lists the full index range) — presumably a pandas DataFrame; confirm.
df = ds.head(n=None)[["tweet_id", "label"]]
df

Unnamed: 0,tweet_id,label
0,1242399368143147012,0
1,1242402119623286791,0
2,1242402388574601216,1
3,1242402392475340803,0
4,1242402598642167808,0
...,...,...
26995,1293657766507274245,0
26996,1293659187654909958,0
26997,1293661906151051264,0
26998,1293662952831188992,0


In [67]:
# Write the classification submission (task 7a) as a tab-separated file.
# The target folder is named after the directory of the loaded model archive,
# so switching `model_path` cannot silently overwrite another model's
# submission. It is created first because `to_csv` does not create folders.
# For the transformer archive this resolves to the previously hard-coded
# "submissions/final_transformer_model/task7a_test.tsv".
submission_dir = Path("submissions") / Path(model_path).parent.name
submission_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(submission_dir / "task7a_test.tsv", sep="\t", index=False)

# NER output

In [68]:
# Re-creates the tokenizer already built in the preprocessing section; the
# global `nlp` is read by batch_ner_output below to re-tokenize the raw text.
nlp = get_custom_tokenizer_v1()

In [77]:
def batch_ner_output(file_names: List[str], raw_text: List[str], predictions: List[Dict]):
    """Flatten one batch of NER predictions into per-entity output columns.

    Every raw text is tokenized again with the global ``nlp``, the predicted
    ``"ner_tags"`` are converted with ``to_bioul`` and mapped to character
    offsets with ``offsets_from_biluo_tags``. The result has one entry per
    found entity (not per tweet): tweet id, begin and end offset, entity type
    and the text slice ``raw_text[begin:end]``.

    A warning is printed once per batch if any extracted span contains a newline.
    """
    parsed_docs = list(map(nlp, raw_text))
    tag_sequences = [to_bioul(model_output["ner_tags"]) for model_output in predictions]
    spans_per_tweet = []
    for parsed, tags in zip(parsed_docs, tag_sequences):
        spans_per_tweet.append(offsets_from_biluo_tags(parsed, tags))
    # The tweet id is the part of the file name before the first dot.
    ids = [name.split('.')[0] for name in file_names]

    tweet_id, begin, end, ent_type, extraction = [], [], [], [], []
    for current_id, spans, text in zip(ids, spans_per_tweet, raw_text):
        for span in spans:
            # Each span is indexed as (start, stop, label).
            start, stop = span[0], span[1]
            tweet_id.append(current_id)
            begin.append(start)
            end.append(stop)
            ent_type.append(span[2])
            extraction.append(text[start:stop])

    has_newline = any('\n' in snippet for snippet in extraction)
    if has_newline:
        print("Found 'newline' in extraction!!")

    return {
        "tweet_id": tweet_id,
        "begin": begin,
        "end": end,
        "type": ent_type,
        "extraction": extraction,
    }


In [78]:
# Derive the NER submission columns from the predictions. batch_ner_output
# returns one entry per entity rather than per tweet, so all columns of
# `dataset` are dropped and `ds` only holds the function's output.
ds = dataset.map(
    batch_ner_output,
    input_columns=["file_name", "raw_text", "predictions"],
    batched=True,
    batch_size=8,
    remove_columns=dataset.column_names
)

HBox(children=(FloatProgress(value=0.0, max=3375.0), HTML(value='')))

Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!
Found 'newline' in extraction!!



In [75]:
# `head(n=None)` is used to get the whole dataset as a table — presumably a
# pandas DataFrame; confirm. The column order here is the order written to the TSV.
df = ds.head(n=None)[["tweet_id", "begin", "end", "type", "extraction"]]
df

Unnamed: 0,tweet_id,begin,end,type,extraction
0,1242402388574601216,63,71,SITUACION_LABORAL,personal
1,1242409027801419777,4,18,SITUACION_LABORAL,universitarios
2,1242409027801419777,108,119,PROFESION,#sanitarios
3,1242420268397662208,22,32,PROFESION,sanitarios
4,1242437532123611136,24,60,PROFESION,personal contratado de enfermería de
...,...,...,...,...,...
7256,1293537468730552322,139,151,SITUACION_LABORAL,trabajadoras
7257,1293537468730552322,154,166,SITUACION_LABORAL,trabajadores
7258,1293575377323995138,4,30,PROFESION,funcionarios de la prisión
7259,1293599239046651911,98,108,PROFESION,presidente


In [76]:
# Write the NER submission (task 7b) as a tab-separated file.
# The target folder is named after the directory of the loaded model archive,
# so switching `model_path` cannot silently overwrite another model's
# submission. It is created first because `to_csv` does not create folders.
# For the transformer archive this resolves to the previously hard-coded
# "submissions/final_transformer_model/task7b_test.tsv".
submission_dir = Path("submissions") / Path(model_path).parent.name
submission_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(submission_dir / "task7b_test.tsv", sep="\t", index=False)