# Rubrix and Veganuary

In [None]:
#%pip install rubrix pandas datasets transformers[torch] gradio

## Uploading the tweets for annotation

In [None]:
# Read the collected tweets from a local JSON file into a DataFrame.
# The "text" column is what the later cells read.
import pandas as pd

tweets = pd.read_json('tweets.json')
tweets  # show the DataFrame as the cell output

In [None]:
# Load spaCy's small English pipeline; `nlp` is used below to tokenize the tweets.
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
# we preprocess the tweets with the pysentimiento library
from pysentimiento.preprocessing import preprocess_tweet

In [None]:
# create Rubrix records for annotations
import rubrix as rb
from tqdm.auto import tqdm

records = []
# Only the "text" column is needed, so iterate over it directly instead of
# building a full row Series per tweet with iterrows().
for raw_text in tqdm(tweets["text"], total=len(tweets)):
    # preprocess the tweet with pysentimiento before tokenizing it
    text = preprocess_tweet(raw_text, lang="en")

    # tokenize with spaCy; the record stores the plain token strings
    tokens = [token.text for token in nlp(text)]

    # one TokenClassificationRecord per tweet, without any annotation yet
    records.append(
        rb.TokenClassificationRecord(
            text=text,
            tokens=tokens,
        )
    )

In [None]:
# Upload the records to Rubrix so they can be annotated there;
# they are logged to the dataset named "veganuary".
rb.log(records=records, name="veganuary")

## Load annotated tweets

In [None]:
# Load only the annotated records from Rubrix: the query keeps the records of the
# "veganuary" dataset (your own NER / token classification task) whose status is "Validated".
tweets_df = rb.load('veganuary', query="status:Validated")

In [None]:
tweets_df  # inspect the validated records that were just loaded

### Transform entities to BIO tags

In [None]:
# transform entity spans to bio tags
from spacy.training import offsets_to_biluo_tags, biluo_to_iob

# enable the progress_* variants of the pandas apply/map methods (progress_apply is used below)
tqdm.pandas()


def entities_to_tags(row):
    """Convert the annotated entity spans of one record into IOB tags.

    The record text is re-tokenized with the module-level ``nlp`` pipeline.
    Each annotation entry is read by position as label, start, end
    (TODO confirm this matches the Rubrix annotation layout) and reordered
    to (start, end, label) before it is handed to spaCy.

    Returns the list of IOB tags, or ``None`` when the BILUO conversion
    produced a "-" tag, i.e. the spans could not be transformed cleanly.
    """
    parsed = nlp(row["text"])

    spans = []
    for annotated in row['annotation']:
        spans.append((annotated[1], annotated[2], annotated[0]))

    tags = offsets_to_biluo_tags(parsed, spans)

    # "-" presumably marks tokens that do not line up with a span boundary — confirm in the spaCy docs
    return None if "-" in tags else biluo_to_iob(tags)


tweets_df["ner_tags"] = tweets_df.progress_apply(entities_to_tags, axis=1)

In [None]:
# Remove the records whose annotations could not be transformed
# (entities_to_tags returned None for them).
tweets_df = tweets_df.dropna(subset=["ner_tags"])

In [None]:
len(tweets_df)  # number of records left after dropping the untransformable ones

In [None]:
# Distinct tags that occur in the data: summing the column concatenates the
# per-record tag lists, and set() keeps the unique values.
set(tweets_df.ner_tags.sum())

## Train a transformer

Most of the code in this section is copied and pasted from the transformers docs and examples.

In [None]:
from datasets import Dataset, Features, ClassLabel, Value

# Tag names for the ClassLabel feature; compute_metrics later indexes this list with class ids.
label_list = ["O", "B-food", "I-food"]
features = Features(
    {
        "tokens": [Value("string")],
        "ner_tags": [ClassLabel(names=label_list)],
    }
)

# The dataset is built from plain Python lists because
# [ClassLabel] does not work with Dataset.from_pandas ...
tweets_dict = {
    "tokens": list(tweets_df.tokens),
    "ner_tags": list(tweets_df.ner_tags),
}
tweets = Dataset.from_dict(tweets_dict, features=features)

# hold out 20% as test split; the fixed seed keeps the split reproducible
tweets = tweets.train_test_split(test_size=0.2, seed=43)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer of the Twitter RoBERTa checkpoint that is fine-tuned below.
# NOTE(review): add_prefix_space=True is presumably needed because the inputs
# are passed pre-split into words — confirm against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base",
    add_prefix_space=True,
    model_max_length=512,
)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split tweets and align the word tags to sub-tokens.

    ``examples`` is a batch with a "tokens" and a "ner_tags" column. Every
    sub-token gets the label id -100, except the first sub-token of each word,
    which receives that word's tag. The aligned ids are stored under "labels"
    in the tokenizer output, which is returned.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_pos, word_tags in enumerate(examples["ner_tags"]):
        # word_ids maps every sub-token to the index of its word (None for special tokens)
        word_ids = encoded.word_ids(batch_index=batch_pos)

        row = []
        last_seen = None
        for current in word_ids:
            # only the first sub-token of a word carries the word's tag
            starts_new_word = current is not None and current != last_seen
            row.append(word_tags[current] if starts_new_word else -100)
            last_seen = current
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Tokenize the splits in batches; tokenize_and_align_labels adds the aligned "labels" column.
tokenized_tweets = tweets.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below.
# NOTE(review): presumably pads inputs and labels per batch — confirm in the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Size the classification head from label_list instead of repeating the count,
# so the model and the dataset's ClassLabel feature cannot drift apart.
model = AutoModelForTokenClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base",
    num_labels=len(label_list),
)

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run; passed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to=["wandb"],  # NOTE(review): presumably needs the wandb package and a login — confirm
    no_cuda=True,  # NOTE(review): presumably keeps training on the CPU even if a GPU exists — confirm
)

In [None]:
from datasets import load_metric
import numpy as np

# Metrics
metric = load_metric("seqeval")
# False: report only the overall scores; True: also report the nested per-entity scores
per_entity = False


def compute_metrics(p):
    """Score a batch of predictions with the module-level ``metric``.

    ``p`` unpacks into the raw prediction scores and the label ids. Positions
    whose label id is -100 are dropped and the remaining ids are mapped to tag
    names via ``label_list`` before scoring.

    Returns the overall precision, recall, f1 and accuracy, or, when
    ``per_entity`` is set, every result with nested dictionaries flattened
    into "<key>_<name>" entries.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not per_entity:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries
    flattened = {}
    for key, value in results.items():
        if isinstance(value, dict):
            flattened.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            flattened[key] = value
    return flattened

In [None]:
from transformers import Trainer

# Wire the model, the training arguments, both tokenized splits, the collator and
# the metric function together; the "test" split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweets["train"],
    eval_dataset=tokenized_tweets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning configured above.
trainer.train()

In [None]:
# Save the fine-tuned model under the local path "model".
model.save_pretrained("model")

## Make predictions

In [None]:
from transformers import pipeline

In [None]:
# NER pipeline built from the fine-tuned model and its tokenizer.
# NOTE(review): aggregation_strategy="first" presumably labels each word by its first sub-token — confirm.
pl = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

In [None]:
# Load the complete "veganuary" dataset from Rubrix (your own NER / token classification task);
# no status filter is applied this time.
tweets_df = rb.load('veganuary')

In [None]:
tweets_df  # inspect the full dataset that was just loaded

In [None]:
def extract_mentions(text):
    """Return the food mentions the NER pipeline finds in ``text``.

    The model is created from the ids of ["O", "B-food", "I-food"] without
    readable label names, so the pipeline reports "LABEL_1" for B-food and
    "LABEL_2" for I-food as two different entity groups. Keeping only the
    "LABEL_1" groups would cut every multi-word mention down to its first
    word, so a mention is rebuilt from a "LABEL_1" group plus the "LABEL_2"
    groups that directly follow it. A "LABEL_2" group without a preceding
    "LABEL_1" group is ignored.
    """
    # here we emulate the 'is_split_into_words', since the pipeline does not allow for tokenized input
    doc = nlp(text)
    text = " ".join([token.text for token in doc])

    # NOTE(review): merging relies on the pipeline also returning the
    # non-entity ("LABEL_0") groups, so neighbours in the list are neighbours
    # in the text — confirm against the pipeline's ignore_labels default.
    mentions = []
    inside_mention = False
    for pred in pl(text):
        group = pred["entity_group"]
        word = pred["word"].strip()
        if group == "LABEL_1":
            # beginning of a new mention
            mentions.append(word)
            inside_mention = True
        elif group == "LABEL_2" and inside_mention:
            # continuation of the mention that was just opened
            mentions[-1] = f"{mentions[-1]} {word}"
        else:
            # any other group closes the current mention
            inside_mention = False
    return mentions


tweets_df["mentions"] = tweets_df.text.progress_map(extract_mentions)

In [None]:
# Count how often each mention occurs: summing the column concatenates the
# per-tweet mention lists into one list before counting.
pd.Series(tweets_df.mentions.sum()).value_counts()

## Push the dataset and model to the HF Hub

In [None]:
# add 'unsupervised' split: the tokens of every record whose status is still "Default"
idx = tweets_df.status == "Default"

tweets["unsupervised"] = Dataset.from_dict(
    {"tokens": tweets_df.loc[idx, "tokens"]},
    features=Features({"tokens": [Value("string")]}),
)

In [None]:
# push the dataset (all splits) to the HF Hub
# NOTE(review): "your-token" is a placeholder — supply a real access token at run time and never commit it.
tweets.push_to_hub("Recognai/veganuary", token="your-token")

In [None]:
# push model to the HF Hub
# NOTE(review): "your-token" is a placeholder — supply a real access token at run time and never commit it.
model.push_to_hub("veganuary_ner", organization="Recognai", use_auth_token="your-token")