In [1]:
# List the model directories available under ../../../models/ (shell escape).
!ls ../../../models/

robertuito-lince-ner-uncased  robertuito-lince-sentiment
robertuito-lince-pos	      robertuito-lince-sentiment-2


In [2]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForTokenClassification, AutoTokenizer
from pysentimiento.lince.ner import load_datasets

# Local directory holding the fine-tuned checkpoint that is evaluated in this notebook.
model_name = "../../../models/robertuito-lince-ner-uncased"

# Token-classification model restored from that directory.
model = AutoModelForTokenClassification.from_pretrained(model_name)


# Tokenizer loaded from the same directory; its maximum sequence length is set to 128.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

# Only the third value returned by load_datasets is kept, as test_dataset
# (presumably the train/dev/test splits — TODO confirm against load_datasets).
# preprocess=False is passed here; preprocess_token is applied to the words in a later cell.
_, _, test_dataset = load_datasets(lang="es", preprocess=False)

Reusing dataset lince (/root/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-98e277271cd16b75.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-8ff715c63222e434.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-8b685d7fa06b5f0b.arrow


In [3]:
from datasets import Dataset
from pysentimiento.lince.ner import preprocess_token, tokenize_and_align_labels

def _preprocess_words(example):
    """Return the example's "words" with ``preprocess_token(word, "es")`` applied to each."""
    return {"words": [preprocess_token(token, "es") for token in example["words"]]}


# Keep a copy of the words as loaded; later cells compare against them
# and print them next to the predicted labels.
original_words = test_dataset["words"]

test_dataset = test_dataset.map(_preprocess_words)


  0%|          | 0/23527 [00:00<?, ?ex/s]

In [4]:

def tokenize_fun(x):
    """Run ``tokenize_and_align_labels`` on a batch using the notebook's tokenizer."""
    return tokenize_and_align_labels(x, tokenizer)


# Tokenize in batches of 32, then drop the "labels" column from the result.
test_dataset = test_dataset.map(
    tokenize_fun,
    batch_size=32,
    batched=True,
).remove_columns(["labels"])

  0%|          | 0/736 [00:00<?, ?ba/s]

In [5]:
from tqdm.auto import tqdm

# Indices of sentences whose tokenization does not account for every original word.
problematic_instances = []

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    # Second-to-last entry of word_ids: the check assumes the last entry belongs to a
    # trailing special token, so this is the word index of the last real token
    # (TODO confirm against tokenize_and_align_labels).
    last_word_id = row["word_ids"][-2]
    # word_ids may contain None; treat that as "no word covered" instead of
    # crashing on `None + 1`, so the instance is reported like any other mismatch.
    covered_words = 0 if last_word_id is None else last_word_id + 1
    if covered_words != len(original_words[idx]):
        problematic_instances.append(idx)

print(f"{len(problematic_instances)} problematic instances")
# Show the first offending indices so a failure is actionable.
assert len(problematic_instances) == 0, f"misaligned instances: {problematic_instances[:20]}"

  0%|          | 0/23527 [00:00<?, ?it/s]

0 problematic instances


In [6]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
# Collator built from the notebook's tokenizer; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Prediction-only configuration: training is disabled, evaluation batches hold 32 examples.
train_args = TrainingArguments(
    output_dir="./test/", per_device_eval_batch_size=32, do_train=False
)

# Keyword arguments for the Trainer, kept in a dict under the same name as before.
trainer_args = dict(
    model=model,
    args=train_args,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

eval_trainer = Trainer(**trainer_args)

# `ret.predictions` is indexed per example by the label-decoding cell further down.
ret = eval_trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: word_ids, lid, idx, words, ner.
***** Running Prediction *****
  Num examples = 23527
  Batch size = 32


In [7]:
from tqdm.auto import tqdm
from pysentimiento.lince.ner import id2label, label2id

# One list of labels per test sentence, in dataset order.
outputs = []

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    word_ids = row["word_ids"]
    words = original_words[idx]
    # Same alignment invariant as the earlier sanity-check cell.
    assert (word_ids[-2] + 1) == len(words), f"misaligned instance {idx}"
    preds = ret.predictions[idx]

    # One slot per original word; a slot stays None when no token maps to that word.
    sentence_output = [None] * len(words)
    current_word_id = None

    for word_id, pred in zip(word_ids, preds):
        # Only the first token of each word decides the word's label; entries whose
        # word id is None are skipped. (The per-token `tokenizer.decode` call was
        # dropped: its result was never used.)
        if word_id is not None and word_id != current_word_id:
            current_word_id = word_id
            sentence_output[word_id] = id2label[pred.argmax()]

    if any(label is None for label in sentence_output):
        # Report sentences that contain words without a prediction.
        print("="*80)
        print(idx)
        print(list(zip(words, sentence_output)))

        # Filling gaps
        sentence_output = ["O" if label is None else label for label in sentence_output]

    outputs.append(sentence_output)


  0%|          | 0/23527 [00:00<?, ?it/s]

8722
[('"', 'O'), ('Lmao', 'O'), ('!', 'O'), ('\ue411\ue412', None), ('ima', 'O'), ('turn', 'O'), ('into', 'O'), ('one', 'O'), ('foreal', 'O'), ('lol', 'O'), ('I', 'O'), ('told', 'O'), ('you', 'O'), ('!', 'O'), ('I', 'O'), ('like', 'O'), ('Mexican', 'O'), ('chicks', 'O'), (',', 'O'), ('Mexican', 'O'), ('food', 'O'), (',', 'O'), ('Mexican', 'O'), ('jobs', 'O'), ('lol', 'O'), ('"', 'O'), ('Hes', 'O'), ('something', 'O'), ('else', 'O'), ('!!!', 'O'), (':D', 'O')]
9637
[('“', 'O'), ('@_laguera01', 'O'), (':', 'O'), ('Los', 'O'), ('bailes', 'O'), ('de', 'O'), ('Mexico', 'B-LOC'), ('no', 'O'), ('son', 'O'), ('chingaderas', 'O'), ('como', 'O'), ('las', 'O'), ('de', 'O'), ('aqui', 'O'), ('.', 'O'), ('\ue50f\ue01a\ue312', None), ('🎊', 'O'), ('\ue047\ue30c', None), ('”', 'O')]
16313
[('RT', 'O'), ('@adamaris_cruzz', 'O'), (':', 'O'), ('Mis', 'O'), ('nenaaaas', 'O'), ('\ue022\ue022\ue022\ue022', None), ('@andreanoguera1d', 'O'), ('http://t.co/JVuhkno7oo', 'O')]


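All of the words left without a prediction are junk: private-use Unicode characters (e.g. '\ue411\ue412') to which no token was mapped. Filling them with 'O' is therefore harmless.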

In [10]:
def write_list(path, data):
    """Write sentences to ``path``, one item per line.

    Every item of a sentence is written on its own line (formatted with
    ``str``), and each sentence is followed by one blank line.

    Args:
        path: Destination file; overwritten if it already exists. When it is
            a filesystem path, missing parent directories are created.
        data: Iterable of sentences, each an iterable of per-word items.
    """
    import os

    # `open` also accepts an integer file descriptor; only real paths have a parent.
    if isinstance(path, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        for sentence in data:
            f.writelines(f"{row}\n" for row in sentence)
            f.write("\n")

# Dump the per-sentence labels to the submission file.
write_list(
    path="../submissions/02_uncased/ner_spa_eng.txt",
    data=outputs,
)

In [62]:
import torch
# Total number of elements across all parameter tensors of the model.
sum(param.numel() for param in model.parameters())

108212755