# < Title > 

< Opening >

- Bullet points
- Bullet points 
- Bullet points


# Introduction

< Introduction >

# Setup 

Rubrix is a free and open-source tool to explore, annotate, and monitor data for NLP projects.

If you are new to Rubrix, check out the ⭐ [GitHub repository](https://github.com/recognai/rubrix).

If you have not installed and launched Rubrix yet, check the Setup and Installation guide.

For this tutorial we also need some third-party libraries that can be installed via pip:

In [1]:
%pip install sentence_transformers transformers datasets

# 1. Log the dataset into Rubrix

Rubrix allows you to log and track data for different NLP tasks (such as `Token Classification` or `Text Classification`).

< Dataset description >


## The dataset

< Dataset description >

In [2]:
import pandas as pd

# Read the train and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../tutorials/data/yt_comments_train.csv',
        '../tutorials/data/yt_comments_test.csv',
    )
)

In [3]:
import rubrix as rb

# The integer `label` column of the test data indexes into this list.
labels = ["HAM", "SPAM"]

# Records from the train data: text and metadata only, no annotation attached.
records = [
    rb.TextClassificationRecord(
        inputs=comment.text,
        metadata={"video": comment.video, "author": comment.author},
    )
    for _idx, comment in train_df.iterrows()
]

# Records from the test data: same fields plus the annotation.
records.extend(
    rb.TextClassificationRecord(
        inputs=comment.text,
        annotation=labels[comment.label],
        metadata={"video": comment.video, "author": comment.author},
    )
    for _idx, comment in test_df.iterrows()
)

# Log all records to the "weak_supervision_yt" dataset in Rubrix.
rb.log(records, name="weak_supervision_yt")



  0%|          | 0/1836 [00:00<?, ?it/s]

1836 records logged to http://localhost:6900/ws/rubrix/weak_supervision_yt


BulkResponse(dataset='weak_supervision_yt', processed=1836, failed=0)

# 2. Create a weak labels matrix

In [None]:
from rubrix.labeling.text_classification import Rule, WeakLabels

# Rules defined as Elasticsearch queries: each rule pairs a `query` with the
# `label` it votes for. Labels must come from the dataset's label set
# ("HAM" / "SPAM"); the previous "HAM_key" value was not one of them.
check_out = Rule(query="check out", label="SPAM")  # same vote as regex_check_out
plz = Rule(query="plz OR please", label="SPAM")
subscribe = Rule(query="subscribe", label="SPAM")
my = Rule(query="my", label="SPAM")
song = Rule(query="song", label="HAM")
# NOTE(review): the recorded summary shows a precision of 0.2 for this rule;
# confirm that "SPAM" is the intended label.
love = Rule(query="love", label="SPAM")

import re

# rules defined as Python labeling functions
def contains_http(record: rb.TextClassificationRecord):
    """Return "SPAM" when the record's text contains "http", otherwise None."""
    return "SPAM" if "http" in record.inputs["text"] else None

def short_comment(record: rb.TextClassificationRecord):
    """Return "HAM" for texts with fewer than five whitespace-separated tokens, else None."""
    n_tokens = len(record.inputs["text"].split())
    if n_tokens < 5:
        return "HAM"
    return None

def regex_check_out(record: rb.TextClassificationRecord):
    """Return "SPAM" if the text matches "check.*out" (case-insensitive), else None."""
    found = re.search(r"check.*out", record.inputs["text"], flags=re.I)
    if found is None:
        return None
    return "SPAM"

from rubrix.labeling.text_classification import load_rules

# Collect the rules to apply: query-based rules and Python labeling functions.
rules = [my, song, love, contains_http, short_comment, regex_check_out, plz]

# Optionally append the rules that were defined in the web app UI for this dataset.
rules.extend(load_rules(dataset="weak_supervision_yt"))

In [None]:

# Apply every rule to the "weak_supervision_yt" dataset to obtain the weak labels.
weak_labels = WeakLabels(rules=rules, dataset="weak_supervision_yt")

In [6]:
# Display per-rule statistics of the weak labels (coverage, overlaps,
# conflicts, correct/incorrect counts and precision), plus a "total" row.
weak_labels.summary()

Unnamed: 0,label,coverage,annotated_coverage,overlaps,conflicts,correct,incorrect,precision
my,{SPAM},0.190632,0.188,0.142702,0.033224,41,6,0.87234
song,{HAM},0.132898,0.192,0.078976,0.055556,39,9,0.8125
love,{SPAM},0.092048,0.14,0.068083,0.046296,7,28,0.2
contains_http,{SPAM},0.106209,0.024,0.071351,0.046296,6,0,1.0
short_comment,{HAM},0.245098,0.368,0.100763,0.077342,84,8,0.913043
regex_check_out,{SPAM},0.22658,0.18,0.098039,0.01634,45,0,1.0
plz OR please,{SPAM},0.090414,0.08,0.074619,0.011438,20,0,1.0
total,"{SPAM, HAM}",0.724946,0.804,0.275599,0.125272,242,51,0.825939


# 3. Example: Expand matrix with all thresholds set to 50%

### Generate embeddings with a sentence embeddings model

In [7]:
from sentence_transformers import SentenceTransformer
# Encode the text of every record in `weak_labels` with a pretrained
# sentence-embedding model.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)
# NOTE(review): other cells read the text via `record.inputs["text"]`;
# confirm that the `.text` attribute exists in the installed Rubrix version.
embeddings = embedding_model.encode([x.text for x in weak_labels.records()])

### Extend the weak labels matrix

In [8]:
# One threshold of 0.5 per column (rule) of the weak label matrix.
thresholds = [0.5 for _ in range(weak_labels.matrix().shape[1])]

# Extend the weak label matrix using the embeddings and these thresholds.
extended_wl_matrix, queries = weak_labels.extend_matrix(
    embeddings,
    thresholds,
)

## Visualize how many records from each label were expanded

In [9]:
from collections import Counter
from functools import reduce

# Count, for every cell of the weak label matrix, how its label changed between
# the original matrix and the extended one ("<before> -> <after>").
# A single pass with a Counter replaces the repeated list concatenation, and
# the f-string no longer carries a no-op `.format(...)` call.
transitions = Counter(
    f"{weak_labels.int2label[before]} -> {weak_labels.int2label[after]}"
    for original_row, extended_row in zip(weak_labels._matrix, extended_wl_matrix)
    for before, after in zip(original_row, extended_row)
)
pd.DataFrame.from_dict(transitions, orient='index').reset_index().rename(columns={"index": "expansion", 0: "count"})

Unnamed: 0,expansion,count
0,None -> None,7090
1,None -> HAM,1274
2,HAM -> HAM,694
3,None -> SPAM,2498
4,SPAM -> SPAM,1296


# 4. Perform gridsearch for the best thresholds

## Build the training data

In [11]:
from rubrix.labeling.text_classification import FlyingSquid
from tqdm import tqdm
import copy

def generate_flyingsquid_training_data(weak_labels):
    """Fit a FlyingSquid label model on `weak_labels` and return its predictions.

    The result is a DataFrame with a "text" column (the record's text) and a
    "label" column (the integer id, via `label2int`, of the first entry in
    each record's `prediction`).
    """
    label_model = FlyingSquid(weak_labels)
    label_model.fit()
    predicted_records = label_model.predict()
    label2int = label_model.weak_labels.label2int

    rows = []
    for rec in predicted_records:
        first_label = rec.prediction[0][0]
        rows.append({"text": rec.inputs["text"], "label": label2int[first_label]})
    return pd.DataFrame(rows)

def gridsearch(weak_labels, embeddings, num=20):
    """Yield copies of `weak_labels` whose matrix was extended with different thresholds.

    `num` evenly spaced values between 0 and 1 (inclusive) are tried. For each
    value, the same threshold is used for every column of the weak label
    matrix.

    Args:
        weak_labels: object exposing `matrix()` and
            `extend_matrix(embeddings, thresholds, cache=...)`.
        embeddings: embeddings forwarded unchanged to `extend_matrix`.
        num: number of threshold values to try.

    Yields:
        One deep copy of `weak_labels` per threshold value, in increasing
        threshold order, with `_matrix` replaced by the extended matrix.
    """
    # Imported locally so the function does not depend on numpy having been
    # imported by another cell first.
    import numpy as np

    n_columns = weak_labels.matrix().shape[1]
    thresholds_set = [[x] * n_columns for x in np.linspace(0, 1, num=num)]

    # Compute every extended matrix before yielding anything. The `queries`
    # returned by one call are passed to the next call as `cache`; the first
    # call gets `cache=None`.
    extended_wl_matrix_set = []
    queries = None
    for thresholds in thresholds_set:
        extended_wl_matrix, queries = weak_labels.extend_matrix(embeddings, thresholds, cache=queries)
        extended_wl_matrix_set.append(extended_wl_matrix)

    for wl_matrix in extended_wl_matrix_set:
        weak_labels_copy = copy.deepcopy(weak_labels)
        weak_labels_copy._matrix = wl_matrix
        yield weak_labels_copy


# Baseline training data, generated from the unextended weak labels.
original_data = generate_flyingsquid_training_data(weak_labels)

# One training set per threshold value tried by `gridsearch`, in grid order.
grid_search_training_data = []
num = 20
for extended_weak_labels in tqdm(gridsearch(weak_labels, embeddings, num=num), total=num):
    data = generate_flyingsquid_training_data(extended_weak_labels)
    grid_search_training_data.append(data)

100%|██████████| 20/20 [00:14<00:00,  1.41it/s]


## Define a transformer model

In [None]:
import numpy as np
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

def tokenize_dataset(dataset):
    """Tokenize the "text" column of `dataset` in batches, with truncation enabled.

    Uses the module-level `tokenizer` and returns the result of `dataset.map`.
    """
    def _tokenize_batch(batch):
        return tokenizer(batch["text"], truncation=True)

    return dataset.map(_tokenize_batch, batched=True)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a `(logits, labels)` pair.

    The predicted class of each example is the argmax over the last axis of
    the logits. Returns a dict with the keys "accuracy" and "f1".
    """
    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predictions, references=labels)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

def get_trainer(train_dataset, dev_dataset, tokenizer, data_collator, model):
    """Build a `Trainer` that trains `model` on `train_dataset` and evaluates on `dev_dataset`.

    Both datasets are tokenized here (their "text" column, with truncation)
    using the `tokenizer` argument, so the tokenizer used for preprocessing is
    always the one handed to the `Trainer`, independent of any global variable.

    Args:
        train_dataset: dataset with a "text" column, used for training.
        dev_dataset: dataset with a "text" column, used for evaluation.
        tokenizer: tokenizer used to preprocess both datasets.
        data_collator: collator passed through to the `Trainer`.
        model: model to train.

    Returns:
        The configured `Trainer` (training is not started here).
    """
    training_args = TrainingArguments(
        output_dir="/tmp",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=False,
    )

    def _tokenize(dataset):
        # Tokenize with the tokenizer passed to get_trainer, not a global one.
        return dataset.map(
            lambda examples: tokenizer(examples["text"], truncation=True),
            batched=True,
        )

    tokenized_train = _tokenize(train_dataset)
    tokenized_dev = _tokenize(dev_dataset)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    return trainer

In [None]:
# Tokenizer and sequence-classification model (two labels) from the same
# "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Collator built around the tokenizer; it pads the batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Keep only the two bottom layers

In [None]:
import copy
from torch import nn

def deleteEncodingLayers(model, num_layers):  # must pass in the full bert model
    """Return a copy of `model` that keeps only its first `num_layers` encoder layers.

    `model` must expose its encoder layers as `model.bert.encoder.layer`.
    The input model is left untouched, and the returned model holds its own
    copies of the kept layers, so the two models share no layer modules.

    Raises:
        IndexError: if `num_layers` exceeds the number of available layers.
    """
    # Copy first and truncate the copy. Building the new layer list from the
    # ORIGINAL layers would make both models share the same modules.
    copyOfModel = copy.deepcopy(model)
    copiedLayers = copyOfModel.bert.encoder.layer

    # Keep only the relevant (first `num_layers`) layers of the copy.
    copyOfModel.bert.encoder.layer = nn.ModuleList(
        [copiedLayers[i] for i in range(num_layers)]
    )

    return copyOfModel

In [None]:
# Replace the model with a version that keeps only its first 2 encoder layers.
model = deleteEncodingLayers(model, 2)

## Baseline: train and evaluate the model with FlyingSquid labels from the unexpanded weak labels matrix

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd

# Training set: the data generated by FlyingSquid from the unexpanded weak labels.
train_dataset = Dataset.from_pandas(original_data)

# Development set: the "text" and "label" columns of the test CSV.
df = pd.read_csv('../tutorials/data/yt_comments_test.csv').loc[:, ["text", "label"]]
dev_dataset = Dataset.from_pandas(df)

trainer = get_trainer(
    train_dataset=train_dataset,
    dev_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    model=model,
)

In [None]:
# Train the baseline model (one epoch, as configured in `get_trainer`).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1130
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 71


Step,Training Loss


Saving model checkpoint to /tmp/checkpoint-71
Configuration saved in /tmp/checkpoint-71/config.json
Model weights saved in /tmp/checkpoint-71/pytorch_model.bin
tokenizer config file saved in /tmp/checkpoint-71/tokenizer_config.json
Special tokens file saved in /tmp/checkpoint-71/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=71, training_loss=0.5412735200264085, metrics={'train_runtime': 4.9274, 'train_samples_per_second': 229.33, 'train_steps_per_second': 14.409, 'total_flos': 14079383164800.0, 'train_loss': 0.5412735200264085, 'epoch': 1.0})

In [None]:
# Evaluate the baseline model on the dev dataset; the result includes the
# accuracy and F1 computed by `compute_metrics`.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

{'epoch': 1.0,
 'eval_accuracy': 0.472,
 'eval_f1': 0.32065217391304346,
 'eval_loss': 0.6701595783233643,
 'eval_runtime': 1.3442,
 'eval_samples_per_second': 185.986,
 'eval_steps_per_second': 11.903}

## Perform gridsearch with BERT

In [None]:
import transformers

from tqdm import tqdm
# Train and evaluate one model per grid-search training set and remember the
# best one. `final_thresholds` is the INDEX of the best threshold setting in
# the grid, not the threshold values themselves.
max_acc = 0
final_thresholds = None

# The tokenizer and collator do not change between runs, so load them once.
# They use the same checkpoint as the model ("bert-base-uncased").
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Iterate over the per-threshold training sets (a list of DataFrames).
for idx, training_data in tqdm(
    enumerate(grid_search_training_data),
    total=len(grid_search_training_data),
):
    train_dataset = Dataset.from_pandas(training_data[["text", "label"]])

    # Start every run from a freshly loaded, truncated model.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model = deleteEncodingLayers(model, 2)

    trainer = get_trainer(train_dataset, dev_dataset, tokenizer, data_collator, model)
    trainer.train()
    model_acc = trainer.evaluate()["eval_accuracy"]
    print(f"model_acc: {model_acc}, max_acc: {max_acc}")
    if model_acc > max_acc:
        final_thresholds = idx
        max_acc = model_acc

After performing gridsearch and expanding the weak labels matrix with the best thresholds, the accuracy of our model went from **47.2%** to **68.4%**.

In [None]:
# Best evaluation accuracy found during the grid search.
max_acc

0.684

In [None]:
# Index (position in the grid of threshold settings) of the best run.
final_thresholds

14

In [16]:
# `thresholds_set` only exists inside `gridsearch`, so rebuild the same grid
# here (`num` evenly spaced values between 0 and 1, repeated once per column
# of the weak label matrix) and show the best-performing setting.
thresholds_set = [[x] * weak_labels.matrix().shape[1] for x in np.linspace(0, 1, num=num)]
thresholds_set[final_thresholds]

[0.7368421052631579,
 0.7368421052631579,
 0.7368421052631579,
 0.7368421052631579,
 0.7368421052631579,
 0.7368421052631579,
 0.7368421052631579]

# Summary

< Summary >

## Next steps

### ⭐ Star the Rubrix [GitHub repo](https://github.com/recognai/rubrix) to stay updated.

### 📚 [Rubrix documentation](https://docs.rubrix.ml) for more guides and tutorials.

### 🙋‍♀️ Join the Rubrix community! A good place to start is the [discussion forum](https://github.com/recognai/rubrix/discussions).