# Weak supervision in multi-label text classification tasks

WORK IN PROGRESS: This tutorial is a work in progress and you can expect some changes within the next few releases.
We will showcase new features here as soon as they are available.

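In this tutorial we will tackle two multi-label text classification tasks with weak supervision: emotion detection on the GoEmotions dataset and topic classification of research paper titles.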

## GoEmotions

See Appendix A for the data preprocessing.

In [None]:
import rubrix as rb
from datasets import load_dataset

# Download the preprocessed dataset from the Hugging Face hub ...
hf_train_split = load_dataset(
    "rubrix/go_emotions_multi-label",
    split="train",
    use_auth_token=True,
)
# ... and read it into a Rubrix text classification dataset
ds_rb = rb.read_datasets(hf_train_split, task="TextClassification")

In [None]:
# Log the records to the Rubrix dataset "go_emotions" so they can be explored
# there to find good heuristics
rb.log(ds_rb, name="go_emotions")

In [13]:
from rubrix.labeling.text_classification import Rule

# Heuristic rules as (query, label) pairs (can probably be improved).
# A label may be a single string or a list of strings.
rule_specs = [
    ("thank*", "gratitude"),
    ("appreciate", "gratitude"),
    ("text:(thanks AND good)", ["admiration", "gratitude"]),
    ("advice", "admiration"),
    ("amazing", "admiration"),
    ("awesome", "admiration"),
    ("impressed", "admiration"),
    ("text:(good AND (point OR call OR idea OR job))", "admiration"),
    ("legend", "admiration"),
    ("exactly", "approval"),
    ("agree", "approval"),
    ("yeah", "approval"),
    ("suck", "annoyance"),
    ("pissed", "annoyance"),
    ("annoying", "annoyance"),
    ("ruined", "annoyance"),
    ("hoping", "optimism"),
    ("text:(\"good luck\")", "optimism"),
    ("\"nice day\"", "optimism"),
    ("\"what is\"", "curiosity"),
    ("\"can you\"", "curiosity"),
    ("\"would you\"", "curiosity"),
]

# One Rule per pair, in the order listed above
rules = [Rule(query, label) for query, label in rule_specs]

In [14]:
from rubrix.labeling.text_classification import WeakMultiLabels

# Compute the weak labels for our dataset given the rules:
# the rules are applied to the records of the "go_emotions" dataset logged above
weak_labels = WeakMultiLabels("go_emotions", rules=rules)

Preparing rules:   0%|          | 0/22 [00:00<?, ?it/s]

Applying rules:   0%|          | 0/4208 [00:00<?, ?it/s]

Filling weak label matrix:   0%|          | 0/4208 [00:00<?, ?it/s]

In [15]:
# Check coverage/precision of our rules (one row per rule; precision is based on
# the correct/incorrect counts shown in the table)
weak_labels.summary()

Unnamed: 0,label,coverage,annotated_coverage,overlaps,correct,incorrect,precision
thank*,{gratitude},0.196768,0.196237,0.037785,73,0,1.0
appreciate,{gratitude},0.01616,0.021505,0.009506,7,1,0.875
text:(thanks AND good),"{admiration, gratitude}",0.007842,0.010753,0.007605,8,0,1.0
advice,{admiration},0.008317,0.008065,0.006654,3,0,1.0
amazing,{admiration},0.025428,0.021505,0.003565,8,0,1.0
awesome,{admiration},0.02519,0.034946,0.006179,12,1,0.923077
impressed,{admiration},0.002139,0.005376,0.0,2,0,1.0
text:(good AND (point OR call OR idea OR job)),{admiration},0.008555,0.018817,0.002376,7,0,1.0
legend,{admiration},0.001901,0.002688,0.000475,1,0,1.0
exactly,{approval},0.004278,0.002688,0.001188,1,0,1.0


In [16]:
from rubrix.labeling.text_classification import MajorityVoter

# Use the majority voter as the label model that turns the weak labels into predictions
label_model = MajorityVoter(weak_labels)

In [17]:
# Records with the label model's predictions form the training set of the down-stream model
train_rb = rb.DatasetForTextClassification(label_model.predict())

# Promote every predicted label whose score exceeds 0.5 to the record's annotation
for record in train_rb:
    record.annotation = [label for label, score in record.prediction if score > 0.5]

In [18]:
# Get records with manual annotations (has_annotation=True) to use as test set
# for the down-stream model
test_rb = rb.DatasetForTextClassification(weak_labels.records(has_annotation=True))

In [19]:
from datasets import DatasetDict

# Prepare both splits for training; only the training split is shuffled (fixed seed)
train_split = train_rb.prepare_for_training().shuffle(seed=42)
test_split = test_rb.prepare_for_training()

# Bundle the splits into a dataset dictionary
ds = DatasetDict(train=train_split, test=test_split)

In [None]:
# Push dataset for training our down-stream model to the HF hub (as a private dataset)
ds.push_to_hub("rubrix/go_emotions_training", private=True)

### Train transformers down-stream model

In [20]:
from transformers import AutoTokenizer

# Initialize the tokenizer of the same checkpoint that is used for the down-stream model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [21]:
def tokenize_func(examples):
    """Tokenize the "text" field of a batch, padding to max length and truncating."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split of the dataset dictionary in batches
tokenized_ds = ds.map(tokenize_func, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [39]:
def binarize_labels(examples):
    """Turn each example's collection of label ids into a multi-hot vector.

    Args:
        examples: A batch with a "label" field holding, per example, the ids
            of the labels assigned to it.

    Returns:
        A dict with a "label" field holding, per example, a list with one
        0/1 entry per class of the test split's label feature.
    """
    # The number of classes is identical for all examples: look it up once per batch
    num_labels = len(ds["test"].features["label"][0].names)
    return {
        "label": [
            [int(label_id in labels) for label_id in range(num_labels)]
            for labels in examples["label"]
        ]
    }

# Turn labels into multi-label format
binarized_tokenized_ds = tokenized_ds.map(binarize_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Init our down-stream model: a multi-label sequence classifier with 6 output labels
# (NOTE(review): 6 matches the six emotions used in the rules — confirm it equals
# the number of classes in the training data)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6,
    problem_type="multi_label_classification",
)

In [283]:
from transformers import TrainingArguments

# Set our training arguments: two epochs, batches of 16, evaluation after every epoch
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

In [285]:
from datasets import load_metric
import numpy as np

# Define our metrics
metric = load_metric("f1", config_name="multilabel")


def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 plus one F1 value per label.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        The micro-average metric dict, extended by an "f1_<label>" entry for
        each label name of the training split.
    """
    logits, labels = eval_pred
    # Element-wise sigmoid of the logits, thresholded at 0.5
    probabilities = 1. / (1 + np.exp(-logits))
    predictions = probabilities > 0.5

    metrics = metric.compute(predictions=predictions, references=labels, average="micro")
    per_label_metric = metric.compute(predictions=predictions, references=labels, average=None)

    label_names = ds["train"].features["label"][0].names
    metrics.update(
        {f"f1_{label}": f1 for label, f1 in zip(label_names, per_label_metric["f1"])}
    )
    return metrics

In [286]:
from transformers import Trainer

# Init the trainer: train on the weakly labelled split, evaluate on the manually
# annotated split
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=binarized_tokenized_ds["train"],
    eval_dataset=binarized_tokenized_ds["test"],
)

In [None]:
# Train the down-stream model (evaluation runs after each epoch, see training_args)
trainer.train()

## Research topic dataset

See Appendix B for the data preprocessing.

In [1]:
import rubrix as rb
from datasets import load_dataset

# Download the preprocessed research titles dataset (the one pushed to the hub in
# Appendix B), not the GoEmotions dataset of the previous section
ds_rb = rb.read_datasets(
    load_dataset("rubrix/research_titles_multi-label", split="train", use_auth_token=True),
    task="TextClassification"
)



In [95]:
# Log the dataset downloaded in the previous cell to Rubrix to find good heuristics
# (`records` only exists if Appendix B was executed in the same kernel)
rb.log(ds_rb, name="research_titles")

  0%|          | 0/20972 [00:00<?, ?it/s]

20972 records logged to http://localhost:6900/ws/rubrix/research_titles


BulkResponse(dataset='research_titles', processed=20972, failed=0)

In [2]:
from rubrix.labeling.text_classification import Rule

# Heuristic rules as (query, label) pairs (can probably be improved)
rule_specs = [
    ("stock*", "Quantitative Finance"),
    ("*asset*", "Quantitative Finance"),
    ("trading", "Quantitative Finance"),
    ("finance", "Quantitative Finance"),
    ("pric*", "Quantitative Finance"),
    ("economy", "Quantitative Finance"),
    ("deep AND neural AND network*", "Computer Science"),
    ("convolutional", "Computer Science"),
    ("memor* AND (design* OR network*)", "Computer Science"),
    ("system* AND design*", "Computer Science"),
    ("allocat* AND *net*", "Computer Science"),
    ("program", "Computer Science"),
    ("scattering", "Physics"),
    ("astro*", "Physics"),
    ("material*", "Physics"),
    ("spin", "Physics"),
    ("magnetic", "Physics"),
    ("optical", "Physics"),
    ("ray", "Physics"),
    ("entangle*", "Physics"),
    ("*algebra*", "Mathematics"),
    ("manifold* AND (NOT learn*)", "Mathematics"),
    ("equation", "Mathematics"),
    ("spaces", "Mathematics"),
    ("operators", "Mathematics"),
    ("regression", "Statistics"),
    ("bayes*", "Statistics"),
    ("estimation", "Statistics"),
    ("mixture", "Statistics"),
    ("gaussian", "Statistics"),
    ("gene", "Quantitative Biology"),
]

# One Rule per pair, in the order listed above
rules = [Rule(query, label) for query, label in rule_specs]

In [3]:
from rubrix.labeling.text_classification import WeakMultiLabels

# Compute the weak labels for our dataset given the rules:
# the rules are applied to the records of the "research_titles" dataset logged above
weak_labels = WeakMultiLabels("research_titles", rules=rules)



Preparing rules:   0%|          | 0/31 [00:00<?, ?it/s]

Applying rules:   0%|          | 0/20972 [00:00<?, ?it/s]

Filling weak label matrix:   0%|          | 0/20972 [00:00<?, ?it/s]

In [4]:
# Check coverage/precision of our rules (one row per rule; precision is based on
# the correct/incorrect counts shown in the table)
weak_labels.summary()

Unnamed: 0,label,coverage,annotated_coverage,overlaps,correct,incorrect,precision
stock*,{Quantitative Finance},0.000954,0.000715,0.000334,3,0,1.0
*asset*,{Quantitative Finance},0.000477,0.000715,0.000286,3,0,1.0
trading,{Quantitative Finance},0.000954,0.000238,0.000191,1,0,1.0
finance,{Quantitative Finance},4.8e-05,0.000238,0.0,1,0,1.0
pric*,{Quantitative Finance},0.003433,0.003337,0.000715,9,5,0.642857
economy,{Quantitative Finance},0.000238,0.000238,0.0,1,0,1.0
deep AND neural AND network*,{Computer Science},0.009155,0.01025,0.002098,32,11,0.744186
convolutional,{Computer Science},0.010109,0.009297,0.002146,32,7,0.820513
memor* AND (design* OR network*),{Computer Science},0.001383,0.002145,0.000286,9,0,1.0
system* AND design*,{Computer Science},0.001144,0.002384,0.000238,9,1,0.9


In [5]:
from rubrix.labeling.text_classification import MajorityVoter

# Use the majority voter as the label model that turns the weak labels into predictions
label_model = MajorityVoter(weak_labels)

In [6]:
# Get the records with the label model's predictions as a pandas DataFrame;
# they serve as training data for the down-stream model
train_df = rb.DatasetForTextClassification(label_model.predict()).to_pandas()

In [7]:
def _to_multi_hot(prediction, labels):
    """Return one 0/1 flag per entry of `labels`; 1 if its predicted score exceeds 0.5.

    `prediction` is a sequence of (label, score) pairs. A label missing from
    `prediction` raises a KeyError.
    """
    # Build the label -> flag lookup once per record instead of once per label
    flags = {label: int(score > 0.5) for label, score in prediction}
    return [flags[label] for label in labels]


# Create labels in multi-label format, ordered like weak_labels.labels
train_df["label"] = train_df.prediction.map(
    lambda prediction: _to_multi_hot(prediction, weak_labels.labels)
)

In [8]:
from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Define our down-stream model: bag-of-words counts fed into one multinomial
# naive Bayes classifier per label (binary relevance)
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('clf', BinaryRelevance(MultinomialNB())),
]
classifier = Pipeline(pipeline_steps)

In [10]:
import numpy as np

# Stack the per-record label lists into a 2D array (one row per record)
label_matrix = np.array(train_df.label.tolist())

# Fit the down-stream classifier on the texts and their label matrix
classifier.fit(X=train_df.text, y=label_matrix)

Pipeline(steps=[('vect', CountVectorizer()),
                ('clf',
                 BinaryRelevance(classifier=MultinomialNB(),
                                 require_dense=[True, True]))])

In [11]:
# The manually annotated records make up the test set
test_texts = [rec.text for rec in weak_labels.records(has_annotation=True)]

# Get predictions for the test set
predictions = classifier.predict(X=test_texts)

In [12]:
from sklearn.metrics import classification_report

# Compute metrics: compare the down-stream predictions with the manual annotations.
# Classes are reported by column index
# (NOTE(review): presumably the order of weak_labels.labels — confirm)
print(classification_report(weak_labels.annotation(), predictions))

              precision    recall  f1-score   support

           0       0.81      0.23      0.36      1740
           1       0.77      0.59      0.67      1141
           2       0.88      0.66      0.75      1186
           3       0.50      0.01      0.02       109
           4       0.45      0.11      0.18        45
           5       0.55      0.67      0.60      1069

   micro avg       0.72      0.49      0.58      5290
   macro avg       0.66      0.38      0.43      5290
weighted avg       0.75      0.49      0.56      5290
 samples avg       0.58      0.52      0.53      5290



  _warn_prf(average, modifier, msg_start, len(result))


## APPENDIX A

We restrict the GoEmotions dataset to a small set of labels and down-sample the records that carry a single label, to move the focus to multi-label outputs.

In [16]:
import pandas as pd
import datasets

In [17]:
# Load the original "go_emotions" dataset (all of its splits)
go_emotions = datasets.load_dataset("go_emotions")

Downloading:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.67k [00:00<?, ?B/s]



  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# Test split as a pandas DataFrame, used below to inspect the label frequencies
df = go_emotions["test"].to_pandas()

In [19]:
def int2str(i):
    """Map a label id to its name via the "labels" feature of the train split."""
    label_feature = go_emotions["train"].features["labels"].feature
    return label_feature.int2str(int(i))

In [21]:
# Mark the rows with more than one label, and flag all other rows as single
idx_multi = df.labels.map(lambda x: len(x) > 1)
df["is_single"] = df.labels.map(lambda x: 0 if len(x) > 1 else 1)

# Collect the label ids of all multi-label rows to count how often each id occurs
label_freq = [int(l) for row_labels in df[idx_multi].labels for l in row_labels]
pd.Series(label_freq).value_counts();

In [22]:
def create(split: str) -> pd.DataFrame:
    """Build a reduced, shuffled version of a GoEmotions split.

    Only rows whose labels all belong to the kept label ids are considered.
    All of those rows with more than one label are kept, and three times as
    many of the remaining rows are sampled on top (fixed random state).

    Args:
        split: Name of the split in `go_emotions`, e.g. "train" or "test".

    Returns:
        The selected rows in shuffled order, with an extra "is_single" column
        (0 for rows with more than one label, 1 otherwise).
    """
    # Kept label ids — per the GoEmotions label list these are admiration (0),
    # annoyance (3), approval (4), curiosity (7), gratitude (15) and optimism (20);
    # verify with int2str
    kept_label_ids = {0, 3, 4, 7, 15, 20}

    df = go_emotions[split].to_pandas()
    df["is_single"] = df.labels.map(lambda x: 0 if len(x) > 1 else 1)

    idx_most_common = df.labels.map(lambda x: all(int(label) in kept_label_ids for label in x))
    df_multi = df[(df.is_single == 0) & idx_most_common]
    # weights="is_single" gives rows with more than one label a weight of 0,
    # so only the other rows can be drawn here
    df_single = df[idx_most_common].sample(3*len(df_multi), weights="is_single", axis=0, random_state=42)
    return pd.concat([df_multi, df_single]).sample(frac=1, random_state=42)

In [23]:
import rubrix as rb

def make_records(row, is_train: bool) -> rb.TextClassificationRecord:
    """Turn a DataFrame row into a multi-label Rubrix record.

    Training rows get no annotation; all other rows are annotated with the
    names of their labels.
    """
    if is_train:
        annotation = None
    else:
        annotation = [int2str(i) for i in row.labels]

    return rb.TextClassificationRecord(
        id=row.id,
        inputs=row.text,
        multi_label=True,
        annotation=annotation,
    )

In [24]:
# Records of the reduced train split, created without annotations (is_train=True)
train_recs = create("train").apply(make_records, axis=1, is_train=True)

In [25]:
# Records of the reduced test split, keeping their labels as annotations (is_train=False)
test_recs = create("test").apply(make_records, axis=1, is_train=False)

In [26]:
# Single list of records: unannotated train records first, then annotated test records
records = [*train_recs.to_list(), *test_recs.tolist()]

In [27]:
# Wrap the records in a Rubrix dataset and convert it with to_datasets() for the upload
ds_rb = rb.DatasetForTextClassification(records).to_datasets()

In [28]:
# Push as a private dataset to the hub; this is the dataset id loaded at the
# beginning of the go emotions section
ds_rb.push_to_hub("rubrix/go_emotions_multi-label", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

## APPENDIX B

The raw data is the `train.csv` file of the following Kaggle dataset: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset

In [4]:
import pandas as pd

In [5]:
# NOTE(review): machine-specific absolute path — download train.csv from the
# Kaggle page linked above and adjust the path before running this cell
df = pd.read_csv("/home/david/Downloads/topic_modeling_researc_articles/train.csv")

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the IDs with a fixed random state; make_record uses test_id to
# decide which records keep their annotation
_, test_id = train_test_split(df.ID, test_size=0.2, random_state=42)

In [8]:
# Topic names; make_record reads the DataFrame column of the same name for each of them
labels = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]

In [9]:
def make_record(row):
    """Turn a row of the research articles DataFrame into a multi-label Rubrix record.

    The title is the record's input and the ID its id. Only rows whose ID is
    among the held-out `test_id` values keep their annotation, i.e. the names
    of all topic columns equal to 1; all other rows get no annotation.
    """
    annotation = [label for label in labels if row[label] == 1]
    # `x in series` tests the Series *index*, not its values, so the membership
    # check has to run against the underlying values
    is_test = row.ID in test_id.values
    return rb.TextClassificationRecord(
        inputs=row.TITLE,
        annotation=annotation if is_test else None,
        multi_label=True,
        id=row.ID,
    )

In [10]:
# Create one record per row of the DataFrame.
# NOTE(review): make_record needs `rb`, which this appendix imports only in the
# next cell — on a fresh kernel run that import first
records = df.apply(make_record, axis=1)

In [11]:
import rubrix as rb

In [13]:
# Wrap the list of records in a Rubrix text classification dataset
dataset_rb = rb.DatasetForTextClassification(records.tolist())

In [15]:
# Convert with to_datasets() and push as a private dataset to the hub
dataset_rb.to_datasets().push_to_hub("rubrix/research_titles_multi-label", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]