In [1]:
# Import our main packages 
import rubrix as rb
from transformers import (pipeline, AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          AutoModelForSequenceClassification,
                          TrainingArguments, Trainer,
                         AutoModelForTokenClassification, TrainingArguments, Trainer)
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

In [2]:
# Get the data
data_files='one-year-of-r-india-comments.csv'
data = load_dataset('csv', data_files=data_files,
                   split='train', streaming=True)




In [3]:
# Set the Classifier
classifier = pipeline(
    'ner',
    model="elastic/distilbert-base-cased-finetuned-conll03-english",
)

In [4]:
# Set the number of samples
n = 100
# For annotation purposes we want word tokens, NOT subword tokens
def make_tokens(examples):
    batch_encoding = classifier.tokenizer(examples["body"])
    examples["tokens"] = []
    for text, encoding in zip(examples["body"], batch_encoding.encodings):
        word_ids = sorted(set(encoding.word_ids) - {None})
        words = []
        for word_id in word_ids:
            start, end = encoding.word_to_chars(word_id)
            words.append(text[start:end])
        examples["tokens"].append(words)
    return examples

# Get the prediction from our fine-tuned distilbert
def make_predictions(examples):
    examples["prediction"] = classifier(examples["body"], aggregation_strategy="first")
    return examples

# Add tokens and predictions
data_prepared =data.take(n)\
    .map(make_tokens, batched=True)\
    .map(make_predictions, batched=True, batch_size=32)

# Create Rubrix records
records = []
for idx, example in tqdm(enumerate(data_prepared), total=n, desc="Create records"):
    record = rb.TokenClassificationRecord(
        text=example["body"],
        tokens=example["tokens"],
        prediction=[(p["entity_group"], p["start"], p["end"]) for p in example["prediction"]],
        prediction_agent="elastic/distilbert-base-cased-finetuned-conll03-english",
        id=idx,
    )
    records.append(record)

# Upload records to Rubrix
rb.log(records, "subreddit_ner")

Create records:   0%|          | 0/100 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (736 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/100 [00:00<?, ?it/s]

100 records logged to http://localhost:6900/datasets/rubrix/subreddit_ner


BulkResponse(dataset='subreddit_ner', processed=100, failed=0)

In [10]:
# Load the dataset from the web app and prepare it for training a Hugging Face transformer
data_ds = rb.load("subreddit_ner").prepare_for_training()

# Split it into a train and test set
data = data_ds.train_test_split()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# Tokenize our data and align our labels
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_data = data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Get the labels
data_ds.features["ner_tags"][0].names

['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [16]:
# Load the pre-trained transformer and provide the dimensions of your token classification head
model = AutoModelForTokenClassification.from_pretrained(
    "google/electra-small-discriminator",
    num_labels=len(data_ds.features["ner_tags"][0].names)
)


# Define your training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Instantiate the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

loading configuration file https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json from cache at /home/ahmed/.cache/huggingface/transformers/ca13c16218c6780ec76753d3afa19fcb7cc759e3f63ee87e441562d374762b3d.3dd1921e571dfa18c0bdaa17b9b38f111097812281989b1cb22263738e66ef73
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL

Epoch,Training Loss,Validation Loss
1,No log,1.897692
2,No log,1.689202
3,No log,1.607506


The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: event_timestamp, status, metadata, annotation, text, annotation_agent, prediction, prediction_agent, id, ner_tags, metrics, tokens. If event_timestamp, status, metadata, annotation, text, annotation_agent, prediction, prediction_agent, id, ner_tags, metrics, tokens are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: event_timestamp, status, metadata, annotation, text, annotation_agent, prediction, prediction_agent, id, ner_tags, metrics, tokens. If event_timestamp, status, metadata, annotation, text, annotation_agent, prediction, prediction_agent, id, ner_tags, metrics, tokens are not ex

TrainOutput(global_step=30, training_loss=1.8247698465983073, metrics={'train_runtime': 50.655, 'train_samples_per_second': 4.442, 'train_steps_per_second': 0.592, 'total_flos': 2880354369270.0, 'train_loss': 1.8247698465983073, 'epoch': 3.0})