# Inference

This notebook produces the output required for a submission to the competition.

In [1]:
from biome.text import Pipeline, Dataset
from typing import List, Dict
from helper import get_custom_tokenizer_v1
from spacy.gold import offsets_from_biluo_tags
from allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul
import warnings
from pathlib import Path
import glob
import spacy
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [2]:
# Load the trained pipeline from its packaged model archive (path is relative to this notebook).
pipeline = Pipeline.from_pretrained("../experiments/still_plasma_32_model.tar.gz")

In [3]:
# Smoke test: predict on a tiny two-token input to check that the loaded pipeline runs.
pipeline.predict(["test", "this"])

{'classification_labels': ['0', '1'],
 'classification_probabilities': [0.9994658827781677, 0.0005341034848242998],
 'ner_tags': ['O', 'O']}

# Preprocess input data

Apply the same preprocessing to the input data that was applied to the training data.

In [4]:
# Tokenizer built by the project's `helper` module.
# NOTE(review): presumably the same tokenizer used to build the training data — confirm.
nlp = get_custom_tokenizer_v1()

In [5]:
# Tokenizer of the Spanish BERT checkpoint; `get_inference_dataset` uses it to flag
# tokens whose encoding has at most two input ids.
bert_tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased') 

In [6]:
# Collect the .txt files of the subtask-2 `brat/valid` directory as Path objects, in sorted order.
txt_files = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/valid/*.txt"))))

In [7]:
def get_inference_dataset(
    txt_files: List[Path],
    nlp: spacy.language.Language,
    replace_antibert_token_with: str = None,
    bert_tokenizer: "transformers.AutoTokenizer" = None
):
    """Build the inference dataset from raw text files.

    Each file is read as UTF-8, tokenized with ``nlp`` and stored as one row.
    Optionally, every token for which ``bert_tokenizer`` returns at most two
    input ids is replaced by a placeholder string.

    Parameters
    ----------
    txt_files
        Paths of the text files to process, one document per file.
    nlp
        spaCy pipeline used to tokenize the raw text.
    replace_antibert_token_with
        If not None, the replacement string for tokens whose BERT encoding
        has at most two input ids. Requires ``bert_tokenizer``.
    bert_tokenizer
        Tokenizer used for the check above. Ignored when
        ``replace_antibert_token_with`` is None.

    Returns
    -------
    Dataset
        A dataset with the columns ``raw_text``, ``tokens`` and ``file_name``.

    Raises
    ------
    ValueError
        If ``replace_antibert_token_with`` is set but ``bert_tokenizer`` is None.
    """
    if replace_antibert_token_with is not None and bert_tokenizer is None:
        # Fail early with a clear message instead of an opaque
        # "'NoneType' object is not callable" in the middle of the loop.
        raise ValueError(
            "`bert_tokenizer` must be provided when `replace_antibert_token_with` is set"
        )

    data = {
        "raw_text": [],
        "tokens": [],
        "file_name": [],
    }

    # token string -> whether it must be replaced. The same tokens recur across
    # documents, so each distinct token is sent to the BERT tokenizer only once.
    needs_replacement = {}

    for txt in tqdm(txt_files, total=len(txt_files)):
        # Explicit encoding: the platform default may not be UTF-8, which would
        # corrupt accented characters and shift character offsets.
        doc = nlp(txt.read_text(encoding="utf-8"))

        tokens_str = [str(token) for token in doc]
        if replace_antibert_token_with is not None:
            for i, token in enumerate(tokens_str):
                if token not in needs_replacement:
                    input_ids = bert_tokenizer([token], is_split_into_words=True)["input_ids"]
                    # NOTE(review): presumably two ids means only the special
                    # start/end tokens remain, i.e. no word piece was produced.
                    needs_replacement[token] = len(input_ids) <= 2
                if needs_replacement[token]:
                    tokens_str[i] = replace_antibert_token_with

        data["raw_text"].append(doc.text)
        data["tokens"].append(tokens_str)
        data["file_name"].append(txt.name)

    return Dataset.from_dict(data)

In [8]:
# Build the inference dataset; tokens flagged by the BERT tokenizer check are replaced with "æ".
dataset = get_inference_dataset(
    txt_files,
    nlp,
    replace_antibert_token_with="æ",
    bert_tokenizer=bert_tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [9]:
def batch_prediction(tokens_list):
    """Run the pipeline on a batch of token lists.

    Each entry of ``tokens_list`` is wrapped as ``{"tokens": ...}`` and the
    whole batch is passed to ``pipeline.predict``. The result is returned
    under the ``"predictions"`` key so it can be used with ``Dataset.map``.
    """
    inputs = []
    for tokens in tokens_list:
        inputs.append({"tokens": tokens})
    predictions = pipeline.predict(batch=inputs)
    return {"predictions": predictions}
    
# Add a "predictions" column by running the pipeline over the "tokens" column in batches of 32.
dataset = dataset.map(batch_prediction, input_columns="tokens", batched=True, batch_size=32)

HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




In [10]:
# Inspect the prediction of the first example.
dataset["predictions"][0]

{'classification_labels': ['1', '0'],
 'classification_probabilities': [0.9983344674110413, 0.0016655727522447705],
 'ner_tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-PROFESION',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

# Classification output

In [11]:
def batch_classification_output(file_names: List[str], predictions: List[Dict]):
    """Convert a batch of predictions into classification output columns.

    Args:
        file_names: File names; the part before the first ``.`` is used as the tweet id.
        predictions: Prediction dicts; the first entry of each
            ``"classification_labels"`` list is used as the label
            (presumably the most probable one — confirm against the pipeline output).

    Returns:
        A dict with the columns ``"tweet_id"`` and ``"label"``.
    """
    tweet_ids = []
    for name in file_names:
        # Everything before the first dot, or the whole name if there is no dot.
        stem, _, _ = name.partition('.')
        tweet_ids.append(stem)

    labels = []
    for prediction in predictions:
        labels.append(prediction["classification_labels"][0])

    return {"tweet_id": tweet_ids, "label": labels}
    
# Derive the classification columns from "file_name" and "predictions";
# all original columns are removed, so only the function's output columns remain.
ds = dataset.map(
    batch_classification_output,
    input_columns=["file_name", "predictions"],
    batched=True,
    batch_size=2,
    remove_columns=dataset.column_names
)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [12]:
# NOTE(review): `n=None` presumably returns every row as a pandas DataFrame — confirm
# against the biome.text `Dataset.head` documentation.
df = ds.head(n=None)[["tweet_id", "label"]]
df

Unnamed: 0,tweet_id,label
0,1242407018465579008,1
1,1242486580222103554,0
2,1242506188555718656,0
3,1242686975943094273,1
4,1242726918132301825,0
...,...,...
1995,1293545412066975744,0
1996,1293561267601510402,0
1997,1293579520000368640,0
1998,1293598083545214980,0


In [13]:
# Write the classification results as a tab-separated file without the index column.
df.to_csv("classification.tsv", sep="\t", index=False)

In [90]:
#!cat test.tsv

# NER output

In [14]:
# Tokenizer read as a global by `batch_ner_output` below to re-tokenize the raw text.
nlp = get_custom_tokenizer_v1()

In [16]:
def batch_ner_output(file_names: List[str], raw_text: List[str], predictions: List[Dict]):
    """Turn a batch of NER predictions into one output row per predicted entity.

    Each raw text is re-tokenized with the notebook-level ``nlp``. The predicted
    ``"ner_tags"`` are passed through ``to_bioul`` and mapped to character
    offsets with ``offsets_from_biluo_tags``.

    Args:
        file_names: File names; the part before the first ``.`` is used as the tweet id.
        raw_text: The raw text of each document.
        predictions: Prediction dicts containing a ``"ner_tags"`` list.

    Returns:
        A dict with the columns ``"tweet_id"``, ``"begin"``, ``"end"``, ``"type"``
        and ``"extraction"``; all columns have one entry per entity found.
    """
    parsed_docs = list(map(nlp, raw_text))
    tag_sequences = [to_bioul(pred["ner_tags"]) for pred in predictions]
    spans_per_doc = [
        offsets_from_biluo_tags(parsed, tags)
        for parsed, tags in zip(parsed_docs, tag_sequences)
    ]
    ids = [name.split('.')[0] for name in file_names]

    columns = {"tweet_id": [], "begin": [], "end": [], "type": [], "extraction": []}
    for tid, spans, text in zip(ids, spans_per_doc, raw_text):
        for span in spans:
            start, stop = span[0], span[1]
            columns["tweet_id"].append(tid)
            columns["begin"].append(start)
            columns["end"].append(stop)
            columns["type"].append(span[2])
            # The extraction is the slice of the raw text covered by the span.
            columns["extraction"].append(text[start:stop])

    # Warn once per batch if any extracted span contains a line break.
    for snippet in columns["extraction"]:
        if '\n' in snippet:
            print("Found 'newline' in extraction!!")
            break

    return columns
    
# Build the NER table: `batch_ner_output` returns one row per entity rather than one per
# input row, and the original columns are removed from the result.
ds = dataset.map(
    batch_ner_output,
    input_columns=["file_name", "raw_text", "predictions"],
    batched=True,
    batch_size=2,
    remove_columns=dataset.column_names
)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [17]:
# NOTE(review): `n=None` presumably returns every row as a pandas DataFrame — confirm
# against the biome.text `Dataset.head` documentation.
df = ds.head(n=None)[["tweet_id", "begin", "end", "type", "extraction"]]
df

Unnamed: 0,tweet_id,begin,end,type,extraction
0,1242407018465579008,31,41,PROFESION,Presidente
1,1242686975943094273,192,208,PROFESION,guardias civiles
2,1242742067501174785,0,7,PROFESION,Médicos
3,1242797453423841281,3,20,PROFESION,ex vicepresidente
4,1242813220416565250,34,38,PROFESION,juez
...,...,...,...,...,...
684,1292023343105937408,62,72,SITUACION_LABORAL,trabajador
685,1292023343105937408,102,114,SITUACION_LABORAL,#teletrabajo
686,1292408137010548737,32,42,PROFESION,reporteros
687,1292978140579532801,91,99,PROFESION,expertos


In [18]:
# Write the NER results as a tab-separated file without the index column.
df.to_csv("ner.tsv", sep="\t", index=False)