# Playing with the transformers tokenizer

In [39]:
from transformers import AutoTokenizer
import numpy as np

In [2]:
bert_tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased') 

In [16]:
tokens = "check out this long sentence E.U. @testname".split()
tokens[2] = '\n'
tokens

['check', 'out', '\n', 'long', 'sentence', 'E.U.', '@testname']

In [27]:
tokens = ["COMUNICADO","POR","CORONAVIRUS", "\nEl","Presidente"]  #,"Ruso","Vladimir","Putin","ha","dicho",":","\"","los","ciudadanos","rusos","tienen","dos","opciones",",","se","quedan","en","su","casa","por","15","d\u00edas","o","van","a","prisi\u00f3n","por","5","a\u00f1os","\"","FIN","DEL","COMUNICADO","."," ","#","nomequedoencasa","https:\/\/t.co\/zmLwunVs80"]

In [29]:
indices = bert_tokenizer(tokens, return_offsets_mapping=True, is_split_into_words=True, return_special_tokens_mask=True)

In [30]:
indices

{'input_ids': [4, 11498, 8664, 8375, 10772, 18100, 18169, 8895, 30980, 7078, 1162, 2503, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'offset_mapping': [(0, 0), (0, 5), (5, 8), (8, 10), (0, 3), (0, 3), (3, 6), (6, 8), (8, 9), (9, 11), (1, 3), (0, 10), (0, 0)]}

In [78]:
type(indices["offset_mapping"][0][0])

int

In [31]:
bert_tokenizer.decode(indices["input_ids"])

'[CLS] COMUNICADO POR CORONAVIRUS El Presidente [SEP]'

In [71]:
indices

{'input_ids': [4, 4935, 2416, 12244, 14560, 12179, 2983, 8538, 1006, 1008, 1482, 1008, 3, 13444, 21613, 30955, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (3, 5), (0, 3), (0, 4), (0, 4), (0, 4), (4, 8), (0, 1), (1, 2), (2, 3), (3, 4), (0, 1), (1, 5), (5, 8), (8, 9), (0, 0)]}

In [32]:
bert_tokenizer.convert_ids_to_tokens(indices["input_ids"])

['[CLS]',
 'COMUN',
 '##ICA',
 '##DO',
 'POR',
 'COR',
 '##ONA',
 '##VI',
 '##R',
 '##US',
 'El',
 'Presidente',
 '[SEP]']

In [None]:
!python -m spacy download es_core_news_sm

In [1]:
import spacy

In [2]:
nlp = spacy.load("es_core_news_sm")

In [29]:
string = "test \n\n this"
doc = nlp(string)
list(doc), len(doc), string[5:7]

([test,
  
  
   ,
  this],
 3,
 '\n\n')

In [None]:
spacy.gold.biluo_tags_from_offsets

# Preprocess data

## Imports

In [44]:
#!pip install spacymoji
#!pip install git+https://github.com/supadupa/spacymoji@fix-merging

In [3]:
import helper
import spacy
import glob
from pathlib import Path
from typing import List, Tuple, Optional, Dict
from tqdm.auto import tqdm
import pandas as pd
from spacy.tokenizer import Tokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer, PretrainedTransformerMismatchedIndexer, PretrainedTransformerIndexer
from allennlp.data import Token, Vocabulary
from biome.text.helpers import bioul_tags_to_bio_tags
from transformers import AutoTokenizer
from tqdm.auto import tqdm

## NER part

In [4]:
train_txt = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/train/*.txt"))))
train_ann = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/train/*.ann"))))

valid_txt = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/valid//*.txt"))))
valid_ann = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/valid/*.ann"))))

In [5]:
nlp = helper.get_custom_tokenizer_v1()

## Classification part

In [6]:
def get_classification_dict(file_path: Path) -> Dict[str, str]:
    """A dict with {tweet_id: label}"""
    classification_dict = {}
    for i, line in enumerate(file_path.read_text().split('\n')[1:]):
        try:
            tweet_id, label = line.split('\t')
            classification_dict[tweet_id] = label
        except (ValueError, TypeError):
            print(i, line, line.split('\t'))
    
    return classification_dict

train_classification = get_classification_dict(Path("../raw_data/subtask-1/train.tsv"))
valid_classification = get_classification_dict(Path("../raw_data/subtask-1/valid.tsv"))

6000  ['']
2000  ['']


## Putting everything in a DataFrame

In [7]:
def create_df(
    txt_files: List[Path], 
    ann_files: List[Path], 
    nlp: "spacy.nlp", 
    classification: Dict[str, str],
    replace_antibert_token_with: str = None,
    bert_tokenizer: "transformers.AutoTokenizer" = None,
):
    
    data = {
        "raw_text": [],
        "tokens": [],
        "tags_bioul": [],
        "tags_bio": [],
        "entity_text": [],
        "classification_label": [],
        "file_name": [],
    }
    
    for txt, ann in tqdm(zip(txt_files, ann_files), total=len(txt_files)):
        try:
            doc: spacy.docs.Doc = helper.brat2doc(
                txt, 
                ann, 
                nlp, 
                # The competition will only evaluate PROFESION and SITUACION_LABORAL
                ignore_labels=["ACTIVIDAD", "FIGURATIVA"],
                remove_children=True,
                remove_parents=False, 
                remove_siblings=False,
                verbose=True,
            )
        except ValueError as error:
            print(txt, ann)
            raise error
            
        tokens_str = list(map(str, doc))
        if replace_antibert_token_with is not None:
            for i, token in enumerate(tokens_str):
                input_ids = bert_tokenizer([token], is_split_into_words=True)["input_ids"]
                if len(input_ids) <= 2:
                    tokens_str[i] = replace_antibert_token_with
            
        data["raw_text"].append(doc.text)
        data["tokens"].append(tokens_str)
        data["tags_bioul"].append([token._.ctag for token in doc])
        data["tags_bio"].append(bioul_tags_to_bio_tags(data["tags_bioul"][-1]))
        data["entity_text"].append(doc._.entity_text)
        data["classification_label"].append(classification[txt.name.split('.')[0]])
        data["file_name"].append(txt.name)
    
    return pd.DataFrame(data)

In [14]:
df_train = create_df(
    train_txt, 
    train_ann, 
    nlp, 
    train_classification,
    replace_antibert_token_with='æ',
    bert_tokenizer=bert_tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

Removed Span(text='Protección Civil', label='ACTIVIDAD', file='1244001571257581568.ann')
Removed Span(text='JUGADORES', label='ACTIVIDAD', file='1244576816133791745.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1244576816133791745.ann')
Removed Span(text='actor', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='cantante', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='futbolista', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='reina', label='FIGURATIVA', file='1246395424220549120.ann')
Removed Span(text='maestro', label='FIGURATIVA', file='1250067856190066691.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1250067856190066691.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1250393594046885888.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1251075915943219200.ann')
Removed Span(text='ViceKomisario', label='FIGURATIVA', file='1252685852796432384.ann')
Removed Sp

In [13]:
df_valid = create_df(
    valid_txt, 
    valid_ann, 
    nlp, 
    valid_classification,
    replace_antibert_token_with='æ',
    bert_tokenizer=bert_tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

Removed Span(text='ama de casa', label='ACTIVIDAD', file='1247422026375200769.ann')
Removed Span(text='atleta', label='FIGURATIVA', file='1252603036524052481.ann')
Removed Span(text='autor', label='ACTIVIDAD', file='1253703655359987712.ann')
Removed Span(text='necropolíticos', label='FIGURATIVA', file='1255419044657758211.ann')
Removed Span(text='necrófagos políticos', label='FIGURATIVA', file='1255419044657758211.ann')
Removed Span(text='presidente', label='FIGURATIVA', file='1257030225277464584.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1257601158065131522.ann')
Removed Span(text='Voluntariado de Refuerzo educativo', label='ACTIVIDAD', file='1257783753277091842.ann')
Removed Span(text='agentes', label='FIGURATIVA', file='1258327578475102209.ann')
Removed Span(text='artistas de espectáculos públicos', label='ACTIVIDAD', file='1261015959206232072.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1263780155673149442.ann')
Removed Span(text='Autor', lab

In [11]:
#!cat data/profner/subtask-2/brat/train/1269241724645318659.ann

### Checking for misalignment 

In [15]:
set(df_train.tags_bioul.sum()), set(df_train.tags_bio.sum()), set(df_valid.tags_bioul.sum()), set(df_valid.tags_bio.sum())

({'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'L-PROFESION',
  'L-SITUACION_LABORAL',
  'O',
  'U-PROFESION',
  'U-SITUACION_LABORAL'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'O'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'L-PROFESION',
  'L-SITUACION_LABORAL',
  'O',
  'U-PROFESION',
  'U-SITUACION_LABORAL'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'O'})

In [24]:
for row in df_valid.itertuples():
    if '-' in row.tags_bioul:
        print(row.file_name)
        print(list(zip(row.tokens, row.tags_bioul)), row.raw_text)

In [15]:
df_train

Unnamed: 0,raw_text,tokens,tags_bioul,tags_bio,entity_text,classification_label,file_name
0,Cerramos nuestra querida Radio 😢 Nuestros cola...,"[Cerramos, nuestra, querida, Radio, 😢, Nuestro...","[O, O, O, O, O, O, U-PROFESION, O, U-PROFESION...","[O, O, O, O, O, O, B-PROFESION, O, B-PROFESION...","[colaboradores, conductores]",1,1242399976644325376.txt
1,#OtroEscandalo #HastaCuando \n#DenunciaCCOO #C...,"[#, OtroEscandalo, #, HastaCuando, \n, #, Denu...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242406334802395137.txt
2,¿Es necesario entregar nuestra privacidad a un...,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242407077278093313.txt
3,Así que estás chimbeando mucho con esos Decret...,"[Así, que, estás, chimbeando, mucho, con, esos...","[O, O, O, O, O, O, O, O, O, O, O, U-PROFESION,...","[O, O, O, O, O, O, O, O, O, O, O, B-PROFESION,...",[Presidente],1,1242407274771030016.txt
4,@FeGarPe79 @escipion_r @LuciaMendezEM Estás MU...,"[@FeGarPe79, @escipion_r, @LuciaMendezEM, Está...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242409866515435520.txt
...,...,...,...,...,...,...,...
5995,Se avecina un nuevo confinamiento q es una man...,"[Se, avecina, un, nuevo, confinamiento, q, es,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293639784397766656.txt
5996,"Así funciona Radar COVID, la app de rastreo de...","[Así, funciona, Radar, COVID, ,, la, app, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293642161867632641.txt
5997,Se duplican los contagios por Coronavirus en M...,"[Se, duplican, los, contagios, por, Coronaviru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293651264140726272.txt
5998,"Corríjanme si me equivoco, pero somos el único...","[Corríjanme, si, me, equivoco, ,, pero, somos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293654036722442247.txt


In [16]:
df_valid

Unnamed: 0,raw_text,tokens,tags_bioul,tags_bio,entity_text,classification_label,file_name
0,COMUNICADO POR CORONAVIRUS \nEl Presidente Rus...,"[COMUNICADO, POR, CORONAVIRUS, æ, El, Presiden...","[O, O, O, O, O, U-PROFESION, O, O, O, O, O, O,...","[O, O, O, O, O, B-PROFESION, O, O, O, O, O, O,...",[Presidente],1,1242407018465579008.txt
1,“La falta de transparencia en asuntos de salud...,"[“, La, falta, de, transparencia, en, asuntos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242486580222103554.txt
2,Las enseñanzas del coronavirus |\nPero @jcoscu...,"[Las, enseñanzas, del, coronavirus, |, æ, Pero...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242506188555718656.txt
3,"No me alegro de la muerte de nadie, ¿pero es m...","[No, me, alegro, de, la, muerte, de, nadie, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[guardias civiles],1,1242686975943094273.txt
4,UNIDOS venceremos al Coronavirus #COVID19 http...,"[UNIDOS, venceremos, al, Coronavirus, #, COVID...","[O, O, O, O, O, O, O]","[O, O, O, O, O, O, O]",[],0,1242726918132301825.txt
...,...,...,...,...,...,...,...
1995,—Me he bajado la app esa para detectar enfermo...,"[—, Me, he, bajado, la, app, esa, para, detect...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293545412066975744.txt
1996,"Desde el 7 de marzo, y mascarilla en ristre, a...","[Desde, el, 7, de, marzo, ,, y, mascarilla, en...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293561267601510402.txt
1997,Sigan para bingo en este 2020,"[Sigan, para, bingo, en, este, 2020]","[O, O, O, O, O, O]","[O, O, O, O, O, O]",[],0,1293579520000368640.txt
1998,🇪🇸 | URGENTE - CORONAVIRUS: España reporta 169...,"[🇪, 🇸, |, URGENTE, -, CORONAVIRUS, :, España, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293598083545214980.txt


In [17]:
df_train.to_json("train_v2.json", lines=True, orient="records")
df_valid.to_json("valid_v2.json", lines=True, orient="records")

## Playing with the mask concept
Basically only tag the first transformer token of a spacy token

In [33]:
from allennlp.data import Instance
from allennlp.data.fields import TextField
from allennlp.data import Token
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data import Batch
from allennlp.data import Vocabulary

In [3]:
df_valid = pd.read_json("valid_v1.json", orient="records", lines=True)

In [16]:
model_name="dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
input_ids = tokenizer(df_valid.tokens[0], is_split_into_words=True, return_offsets_mapping=True)
allennlp_tokens = list(map(Token, tokenizer.convert_ids_to_tokens(input_ids["input_ids"])))

In [28]:
indexer = PretrainedTransformerIndexer(model_name=model_name)
text_field = TextField(allennlp_tokens, token_indexers={"transformers": indexer})
instance = Instance({"tokens": text_field})

In [48]:
vocab = Vocabulary.from_instances([instance])
batch = Batch([instance])
batch.index_instances(vocab)
token_ids = batch.as_tensor_dict()["tokens"]["transformers"]
indices = indexer.tokens_to_indices(allennlp_tokens, vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=1.0, style=ProgressStyle(description…




In [49]:
def check_offsets(df, model_name="dccuchile/bert-base-spanish-wwm-uncased") -> Tuple[List, List]:
    indexer = PretrainedTransformerIndexer(model_name=model_name)
    vocab = Vocabulary()
    tokenizer = AutoTokenizer.from_pretrained(model_name) 
    
    for row in tqdm(df.itertuples(), total=len(df)):
        indices = tokenizer(row.tokens, return_offsets_mapping=True, is_split_into_words=True, return_special_tokens_mask=True)
        print
        token_groups = []
        for input_id, offset in zip(indices["input_ids"][1:-1], indices["offset_mapping"][1:-1]):
            if offset[0] == 0:
                token_groups.append([tokenizer.convert_ids_to_tokens(input_id)])
            else:
                token_groups[-1].append(tokenizer.convert_ids_to_tokens(input_id))
        print(list(zip(row.tokens, token_groups)))
        tokens_str = tokenizer.convert_ids_to_tokens(indices["input_ids"])
        tokens = [Token(tok) for tok in tokens_str]
        token_indexes = indexer.tokens_to_indices(tokens, vocabulary=vocab)
        token_ids = token_indexes["token_ids"]
                    
    return destroyed_tokens, destroyed_tags, offs

In [50]:
check_offsets(df_valid[:2])

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

[('COMUNICADO', ['comunicado']), ('POR', ['por']), ('CORONAVIRUS', ['corona', '##vir', '##us']), ('\n', ['el']), ('El', ['presidente']), ('Presidente', ['ruso']), ('Ruso', ['vladimir']), ('Vladimir', ['pu', '##tin']), ('Putin', ['ha']), ('ha', ['dicho']), ('dicho', [':']), (':', ['"']), ('"', ['los']), ('los', ['ciudadanos']), ('ciudadanos', ['rusos']), ('rusos', ['tienen']), ('tienen', ['dos']), ('dos', ['opciones']), ('opciones', [',']), (',', ['se']), ('se', ['quedan']), ('quedan', ['en']), ('en', ['su']), ('su', ['casa']), ('casa', ['por']), ('por', ['15']), ('15', ['dias']), ('días', ['o']), ('o', ['van']), ('van', ['a']), ('a', ['prision']), ('prisión', ['por']), ('por', ['5']), ('5', ['anos']), ('años', ['"']), ('"', ['fin']), ('FIN', ['del']), ('DEL', ['comunicado']), ('COMUNICADO', ['.']), ('.', ['[UNK]']), (' ', ['nom', '##e', '##que', '##do', '##en', '##cas', '##a']), ('#', ['h', '##tt', '##ps', ':', '[UNK]', '[UNK]', 't', '.', 'co', '[UNK]', 'z', '##ml', '##w', '##un', '##v

NameError: name 'destroyed_tokens' is not defined

In [30]:
from biome.text import Dataset

In [32]:
ds = Dataset.from_json("valid_v1.json")

Using custom data configuration default
Reusing dataset json (/home/david/.cache/huggingface/datasets/json/default-b2dd006bd782a885/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)


In [39]:
indices = tokenizer(ds[0]["tokens"], return_offsets_mapping=True, is_split_into_words=True, return_special_tokens_mask=True)

In [42]:
i = 0
for off in indices["offset_mapping"]:
    if off[0] == 0:
        i += 1
i

44

In [43]:
len(ds[0]["tags_bio"])

44

In [23]:
from allennlp.training.metrics import SpanBasedF1Measure
from allennlp.data.vocabulary import Vocabulary
import torch

In [24]:
f1 = SpanBasedF1Measure(Vocabulary())

In [None]:
pred = [["B-TEST", "I-TEST", "O"], ...]
gold = ["B-TEST", "O", "O"]
vocab = {"B-TEST": 0, "I-TEST": 1, "O": 2}

vocab = {"B-8474693", "I-...", ...........}

In [22]:
predictions = torch.tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]], [...], [...]]) 
gold = torch.tensor([[[1, 0, 0], [0, 0, 1], [0, 0, 1]]])

len(predictions.size()) == 3
# batchsize, nr of words, nr of labels

[0;31mSignature:[0m     
[0mf1[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpredictions[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgold_labels[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mBoolTensor[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprediction_map[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           SpanBasedF1Measure
[0;31mString form:[0m    <allennlp.training.metrics.span_based_f1_measure.SpanBasedF1Measure object at 0x7f712113e6d0>
[0;31mFile:[0m           ~/miniconda3/envs/biome/lib/python3.7/site-packages/allennlp/trai

In [25]:
f1(predictions, gold, mask)

[0;31mSignature:[0m     
[0mf1[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpredictions[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgold_labels[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmask[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mBoolTensor[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprediction_map[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           SpanBasedF1Measure
[0;31mString form:[0m    <allennlp.training.metrics.span_based_f1_measure.SpanBasedF1Measure object at 0x7f7121209e90>
[0;31mFile:[0m           ~/miniconda3/envs/biome/lib/python3.7/site-packages/allennlp/trai