# Playing with the transformers tokenizer

In [22]:
from transformers import AutoTokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased') 

In [69]:
indices = tokenizer("check out this long sentence E.U. @testname".split(), return_offsets_mapping=True, is_split_into_words=True)

In [70]:
tokenizer.decode(indices["input_ids"])

'[CLS] check out this long sentence e. u. [UNK] testname [SEP]'

In [71]:
indices

{'input_ids': [4, 4935, 2416, 12244, 14560, 12179, 2983, 8538, 1006, 1008, 1482, 1008, 3, 13444, 21613, 30955, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (3, 5), (0, 3), (0, 4), (0, 4), (0, 4), (4, 8), (0, 1), (1, 2), (2, 3), (3, 4), (0, 1), (1, 5), (5, 8), (8, 9), (0, 0)]}

In [72]:
tokenizer.convert_ids_to_tokens(indices["input_ids"])

['[CLS]',
 'che',
 '##ck',
 'out',
 'this',
 'long',
 'sent',
 '##ence',
 'e',
 '.',
 'u',
 '.',
 '[UNK]',
 'test',
 '##nam',
 '##e',
 '[SEP]']

In [None]:
!python -m spacy download es_core_news_sm

In [None]:
import spacy

In [None]:
nlp = spacy.load("es_core_news_sm")

In [None]:
spacy.gold.biluo_tags_from_offsets

# Preprocess data

## Imports

In [44]:
#!pip install spacymoji
#!pip install git+https://github.com/supadupa/spacymoji@fix-merging

In [14]:
import helper
import spacy
import glob
from pathlib import Path
from typing import List, Tuple, Optional, Dict
from tqdm.auto import tqdm
import pandas as pd
from spacy.tokenizer import Tokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer, PretrainedTransformerMismatchedIndexer, PretrainedTransformerIndexer
from allennlp.data import Token, Vocabulary
from biome.text.helpers import bioul_tags_to_bio_tags
from spacymoji import Emoji

## NER part

In [15]:
train_txt = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/train/*.txt"))))
train_ann = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/train/*.ann"))))

valid_txt = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/valid//*.txt"))))
valid_ann = list(map(Path, sorted(glob.glob("../raw_data/subtask-2/brat/valid/*.ann"))))

In [16]:
nlp = spacy.load("es_core_news_sm")
# emoji = Emoji(nlp, merge_spans=False)
# nlp.add_pipe(emoji, first=True)

In [17]:
def create_custom_tokenizer(nlp):
    # TODO: Not if this can be improved/generalized better
    # yes, this is a skin tone ...
    prefix_re = spacy.util.compile_prefix_regex(tuple(list(nlp.Defaults.prefixes) + ['[\.\🏽]']))
    infix_re = spacy.util.compile_infix_regex(tuple(list(nlp.Defaults.infixes) + ['[\-\!\.\?\"\(\)\:]']))
    suffix_re = suffix_re = spacy.util.compile_suffix_regex(tuple(list(nlp.Defaults.suffixes) + []))   
            
    return Tokenizer(
        nlp.vocab, 
        rules=nlp.tokenizer.rules,
        prefix_search=prefix_re.search, 
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=nlp.tokenizer.token_match,
        url_match=nlp.tokenizer.url_match,
    )


In [18]:
nlp.tokenizer = create_custom_tokenizer(nlp)

## Classification part

In [19]:
def get_classification_dict(file_path: Path) -> Dict[str, str]:
    """A dict with {tweet_id: label}"""
    classification_dict = {}
    for i, line in enumerate(file_path.read_text().split('\n')[1:]):
        try:
            tweet_id, label = line.split('\t')
            classification_dict[tweet_id] = label
        except (ValueError, TypeError):
            print(i, line, line.split('\t'))
    
    return classification_dict

train_classification = get_classification_dict(Path("../raw_data/subtask-1/train.tsv"))
valid_classification = get_classification_dict(Path("../raw_data/subtask-1/valid.tsv"))

6000  ['']
2000  ['']


## Putting everything in a DataFrame

In [20]:
def create_df(txt_files: List[Path], ann_files: List[Path], nlp: "spacy.nlp", classification: Dict[str, str]):
    
    data = {
        "raw_text": [],
        "tokens": [],
        "tags_bioul": [],
        "tags_bio": [],
        "entity_text": [],
        "classification_label": [],
        "file_name": [],
    }
    
    for txt, ann in tqdm(zip(txt_files, ann_files), total=len(txt_files)):
        try:
            doc: spacy.docs.Doc = helper.brat2doc(
                txt, 
                ann, 
                nlp, 
                # The competition will only evaluate PROFESION and SITUACION_LABORAL
                ignore_labels=["ACTIVIDAD", "FIGURATIVA"],
                remove_children=True,
                remove_parents=False, 
                remove_siblings=False,
                verbose=True,
            )
        except ValueError as error:
            print(txt, ann)
            raise error
            
        data["raw_text"].append(doc.text)
        data["tokens"].append(list(map(str, doc)))
        data["tags_bioul"].append([token._.ctag for token in doc])
        data["tags_bio"].append(bioul_tags_to_bio_tags(data["tags_bioul"][-1]))
        data["entity_text"].append(doc._.entity_text)
        data["classification_label"].append(classification[txt.name.split('.')[0]])
        data["file_name"].append(txt.name)
    
    return pd.DataFrame(data)

In [21]:
df_train = create_df(train_txt, train_ann, nlp, train_classification)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))

Removed Span(text='Protección Civil', label='ACTIVIDAD', file='1244001571257581568.ann')
Removed Span(text='JUGADORES', label='ACTIVIDAD', file='1244576816133791745.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1244576816133791745.ann')
Removed Span(text='actor', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='cantante', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='futbolista', label='ACTIVIDAD', file='1245657701423525888.ann')
Removed Span(text='reina', label='FIGURATIVA', file='1246395424220549120.ann')
Removed Span(text='maestro', label='FIGURATIVA', file='1250067856190066691.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1250067856190066691.ann')
Removed Span(text='jugadores', label='ACTIVIDAD', file='1250393594046885888.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1251075915943219200.ann')
Removed Span(text='ViceKomisario', label='FIGURATIVA', file='1252685852796432384.ann')
Removed Sp

In [22]:
df_valid = create_df(valid_txt, valid_ann, nlp, valid_classification)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

Removed Span(text='ama de casa', label='ACTIVIDAD', file='1247422026375200769.ann')
Removed Span(text='atleta', label='FIGURATIVA', file='1252603036524052481.ann')
Removed Span(text='autor', label='ACTIVIDAD', file='1253703655359987712.ann')
Removed Span(text='necropolíticos', label='FIGURATIVA', file='1255419044657758211.ann')
Removed Span(text='necrófagos políticos', label='FIGURATIVA', file='1255419044657758211.ann')
Removed Span(text='presidente', label='FIGURATIVA', file='1257030225277464584.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1257601158065131522.ann')
Removed Span(text='Voluntariado de Refuerzo educativo', label='ACTIVIDAD', file='1257783753277091842.ann')
Removed Span(text='agentes', label='FIGURATIVA', file='1258327578475102209.ann')
Removed Span(text='artistas de espectáculos públicos', label='ACTIVIDAD', file='1261015959206232072.ann')
Removed Span(text='deportistas', label='ACTIVIDAD', file='1263780155673149442.ann')
Removed Span(text='Autor', lab

In [10]:
#!cat data/profner/subtask-2/brat/train/1269241724645318659.ann

### Checking for misalignment 

In [23]:
set(df_train.tags_bioul.sum()), set(df_train.tags_bio.sum()), set(df_valid.tags_bioul.sum()), set(df_valid.tags_bio.sum())

({'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'L-PROFESION',
  'L-SITUACION_LABORAL',
  'O',
  'U-PROFESION',
  'U-SITUACION_LABORAL'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'O'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'L-PROFESION',
  'L-SITUACION_LABORAL',
  'O',
  'U-PROFESION',
  'U-SITUACION_LABORAL'},
 {'B-PROFESION',
  'B-SITUACION_LABORAL',
  'I-PROFESION',
  'I-SITUACION_LABORAL',
  'O'})

In [24]:
for row in df_valid.itertuples():
    if '-' in row.tags_bioul:
        print(row.file_name)
        print(list(zip(row.tokens, row.tags_bioul)), row.raw_text)

In [15]:
df_train

Unnamed: 0,raw_text,tokens,tags_bioul,tags_bio,entity_text,classification_label,file_name
0,Cerramos nuestra querida Radio 😢 Nuestros cola...,"[Cerramos, nuestra, querida, Radio, 😢, Nuestro...","[O, O, O, O, O, O, U-PROFESION, O, U-PROFESION...","[O, O, O, O, O, O, B-PROFESION, O, B-PROFESION...","[colaboradores, conductores]",1,1242399976644325376.txt
1,#OtroEscandalo #HastaCuando \n#DenunciaCCOO #C...,"[#, OtroEscandalo, #, HastaCuando, \n, #, Denu...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242406334802395137.txt
2,¿Es necesario entregar nuestra privacidad a un...,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242407077278093313.txt
3,Así que estás chimbeando mucho con esos Decret...,"[Así, que, estás, chimbeando, mucho, con, esos...","[O, O, O, O, O, O, O, O, O, O, O, U-PROFESION,...","[O, O, O, O, O, O, O, O, O, O, O, B-PROFESION,...",[Presidente],1,1242407274771030016.txt
4,@FeGarPe79 @escipion_r @LuciaMendezEM Estás MU...,"[@FeGarPe79, @escipion_r, @LuciaMendezEM, Está...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242409866515435520.txt
...,...,...,...,...,...,...,...
5995,Se avecina un nuevo confinamiento q es una man...,"[Se, avecina, un, nuevo, confinamiento, q, es,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293639784397766656.txt
5996,"Así funciona Radar COVID, la app de rastreo de...","[Así, funciona, Radar, COVID, ,, la, app, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293642161867632641.txt
5997,Se duplican los contagios por Coronavirus en M...,"[Se, duplican, los, contagios, por, Coronaviru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293651264140726272.txt
5998,"Corríjanme si me equivoco, pero somos el único...","[Corríjanme, si, me, equivoco, ,, pero, somos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293654036722442247.txt


In [16]:
df_valid

Unnamed: 0,raw_text,tokens,tags_bioul,tags_bio,entity_text,classification_label,file_name
0,COMUNICADO POR CORONAVIRUS \nEl Presidente Rus...,"[COMUNICADO, POR, CORONAVIRUS, \n, El, Preside...","[O, O, O, O, O, U-PROFESION, O, O, O, O, O, O,...","[O, O, O, O, O, B-PROFESION, O, O, O, O, O, O,...",[Presidente],1,1242407018465579008.txt
1,“La falta de transparencia en asuntos de salud...,"[“, La, falta, de, transparencia, en, asuntos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242486580222103554.txt
2,Las enseñanzas del coronavirus |\nPero @jcoscu...,"[Las, enseñanzas, del, coronavirus, |, \n, Per...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1242506188555718656.txt
3,"No me alegro de la muerte de nadie, ¿pero es m...","[No, me, alegro, de, la, muerte, de, nadie, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[guardias civiles],1,1242686975943094273.txt
4,UNIDOS venceremos al Coronavirus #COVID19 http...,"[UNIDOS, venceremos, al, Coronavirus, #, COVID...","[O, O, O, O, O, O, O]","[O, O, O, O, O, O, O]",[],0,1242726918132301825.txt
...,...,...,...,...,...,...,...
1995,—Me he bajado la app esa para detectar enfermo...,"[—, Me, he, bajado, la, app, esa, para, detect...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293545412066975744.txt
1996,"Desde el 7 de marzo, y mascarilla en ristre, a...","[Desde, el, 7, de, marzo, ,, y, mascarilla, en...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293561267601510402.txt
1997,Sigan para bingo en este 2020,"[Sigan, para, bingo, en, este, 2020]","[O, O, O, O, O, O]","[O, O, O, O, O, O]",[],0,1293579520000368640.txt
1998,🇪🇸 | URGENTE - CORONAVIRUS: España reporta 169...,"[🇪, 🇸, |, URGENTE, -, CORONAVIRUS, :, España, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[],0,1293598083545214980.txt


In [26]:
df_train.to_json("train_v1.json", lines=True, orient="records")
df_valid.to_json("valid_v1.json", lines=True, orient="records")

## Playing with the mask concept
Basically only tag the first transformer token of a spacy token

In [113]:
def check_offsets(df, model_name="dccuchile/bert-base-spanish-wwm-uncased") -> Tuple[List, List]:
    destroyed_tokens = []
    destroyed_tags = []
    offs = []
    indexer = PretrainedTransformerIndexer(model_name=model_name)
    vocab = Vocabulary()
    tokenizer = AutoTokenizer.from_pretrained(model_name) 
    
    for row in tqdm(df.itertuples(), total=len(df)):
        indices = tokenizer(row.tokens, return_offsets_mapping=True, is_split_into_words=True)
        token_groups = []
        for input_id, offset in zip(indices["input_ids"][1:-1], indices["offset_mapping"][1:-1]):
            if offset[0] == 0:
                token_groups.append([tokenizer.convert_ids_to_tokens(input_id)])
            else:
                token_groups[-1].append(tokenizer.convert_ids_to_tokens(input_id))
        print(list(zip(row.tokens, token_groups)))
        tokens_str = tokenizer.convert_ids_to_tokens(indices["input_ids"])
        tokens = [Token(tok) for tok in tokens_str]
        token_indexes = indexer.tokens_to_indices(tokens, vocabulary=vocab)
        token_ids = token_indexes["token_ids"]
                    
    return destroyed_tokens, destroyed_tags, offs

In [25]:
check_offsets(df[:2])

NameError: name 'check_offsets' is not defined

In [None]:
df_train.to_json("train_for_bert.json", lines=True, orient="records")