# 🧐 Using Rubrix to find label errors with cleanlab 

In [None]:
# Optional dependency installation (uncomment to run).
# NOTE(review): the import cell below uses `cleanlab.pruning.get_noise_indices`;
# presumably this requires a cleanlab 1.x release, so an unpinned `-U` upgrade
# may break that import -- confirm and pin the version.
# NOTE(review): `rubrix` and `torch` are imported below as well but are not
# installed by these commands.
# !pip install -U transformers
# !pip install -U datasets
# !pip install -U cleanlab

In [7]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
from cleanlab.pruning import get_noise_indices
import rubrix as rb
import torch

In [2]:
# Initialise the Rubrix client; no arguments are passed.
rb.init()

# MNLI

In [57]:
# Checkpoint used for the MNLI predictions.
# Alternative checkpoint: "typeform/distilbert-base-uncased-mnli"
model_name = "roberta-large-mnli"

# The model and its tokenizer are loaded from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
# Load the mismatched validation split of MultiNLI.
dataset = datasets.load_dataset(path="multi_nli", split="validation_mismatched")

Using custom data configuration default
Reusing dataset multi_nli (/home/david/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [59]:
# Display the loaded split's summary (feature names and row count).
dataset

Dataset({
    features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
    num_rows: 9832
})

In [60]:
# get model predictions
def _mnli_probs(example):
    """Return the model's class probabilities for one premise/hypothesis pair.

    Args:
        example: A dataset row with "premise" and "hypothesis" entries.

    Returns:
        A dict with a single "prob" entry holding the softmax over the
        model's logits for this pair.
    """
    # truncation=True keeps over-long pairs within the model's maximum input
    # length instead of failing in the forward pass.
    encoded = tokenizer(
        example["premise"],
        example["hypothesis"],
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**encoded).logits
    return {"prob": torch.softmax(logits, dim=1)[0]}


probs_ds = dataset.map(_mnli_probs, remove_columns=dataset.column_names)

  0%|          | 0/9832 [00:00<?, ?ex/s]

In [61]:
# Gather the text pairs, the gold labels and the model probabilities in one frame.
data_df = pd.DataFrame(
    {
        "premise": dataset["premise"],
        "hypothesis": dataset["hypothesis"],
        "probs": probs_ds["prob"],
        "label": dataset["label"],
    }
)

In [74]:
# Reverse the class axis so that position i of "probs" lines up with dataset
# label i (0=entailment, 1=neutral, 2=contradiction, as used in make_rec below).
# The column is rebuilt from the raw predictions in probs_ds rather than from
# itself, so re-running this cell does not flip the order back.
data_df["probs"] = [[p[2], p[1], p[0]] for p in probs_ds["prob"]]

In [75]:
# Stack the per-row probability lists into one 2-D array (one row per example).
probs_matrix = np.array(data_df["probs"].tolist())

In [80]:
# Flag suspected label errors from the given labels and predicted probabilities.
# With sorted_index_method=None the result is used below as a boolean mask;
# 'normalized_margin' would order the label errors instead.
given_labels = data_df["label"].to_numpy()
label_errors = get_noise_indices(s=given_labels, psx=probs_matrix, sorted_index_method=None)
label_errors.sum()

481

In [77]:
# Peek at the first example: premise, hypothesis, probabilities, label.
data_df.iloc[0].tolist()

['Your contribution helped make it possible for us to provide our students with a quality education.',
 "Your contributions were of no help with our students' education.",
 [0.00016414420679211617, 0.0003369634796399623, 0.9994988441467285],
 2]

In [78]:
def make_rec(row):
    """Build a Rubrix text-classification record from one MNLI row.

    Args:
        row: Object exposing ``premise``, ``hypothesis``, ``probs`` and
            ``label`` attributes (here, a row of ``data_df``). ``probs`` is
            paired positionally with entailment / neutral / contradiction.

    Returns:
        The ``rb.TextClassificationRecord`` holding the text pair as inputs,
        the per-class predictions attributed to ``model_name`` and the dataset
        label as the annotation (agent ``"mnli"``).

    Raises:
        RuntimeError: If ``row.label`` is not 0, 1 or 2. Unknown labels are
            rejected rather than being silently annotated as "neutral".
    """
    label_names = ("entailment", "neutral", "contradiction")
    preds = list(zip(label_names, row.probs))

    # Label ids follow the position in label_names: 0, 1, 2.
    annot = dict(enumerate(label_names)).get(row.label)
    if annot is None:
        raise RuntimeError(f"Unexpected MNLI label {row.label!r}; expected 0, 1 or 2")

    return rb.TextClassificationRecord(
        inputs={"premise": row.premise, "hypothesis": row.hypothesis},
        prediction=preds,
        prediction_agent=model_name,
        annotation=annot,
        annotation_agent="mnli",
    )

# Build one Rubrix record per row flagged as a potential label error.
recs = data_df.loc[label_errors].apply(func=make_rec, axis=1)

In [79]:
# Send the flagged records to the "mnli_label_error_mm" Rubrix dataset.
rb.log(name="mnli_label_error_mm", records=recs.tolist())

BulkResponse(dataset='mnli_label_error_mm', processed=481, failed=0)

# MRPC

In [49]:
# Checkpoint used for the MRPC predictions.
model_name = "textattack/roberta-base-MRPC"

# The model and its tokenizer are loaded from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
# Load the validation split of the GLUE "mrpc" configuration.
dataset = datasets.load_dataset(path="glue", name="mrpc", split="validation")

Reusing dataset glue (/home/david/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [51]:
# Display the first example to see the available fields.
dataset[0]

{'sentence1': "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .",
 'sentence2': '" The foodservice pie business does not fit our long-term growth strategy .',
 'label': 1,
 'idx': 9}

In [52]:
# get model predictions
def _mrpc_probs(example):
    """Return the model's class probabilities for one sentence pair.

    Args:
        example: A dataset row with "sentence1" and "sentence2" entries.

    Returns:
        A dict with a single "prob" entry holding the softmax over the
        model's logits for this pair.
    """
    # truncation=True keeps over-long pairs within the model's maximum input
    # length instead of failing in the forward pass.
    encoded = tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**encoded).logits
    return {"prob": torch.softmax(logits, dim=1)[0]}


probs_ds = dataset.map(_mrpc_probs, remove_columns=dataset.column_names)

  0%|          | 0/408 [00:00<?, ?ex/s]

In [53]:
# Gather the sentence pairs, the gold labels and the model probabilities in one frame.
data_df = pd.DataFrame(
    {
        "sentence1": dataset["sentence1"],
        "sentence2": dataset["sentence2"],
        "probs": probs_ds["prob"],
        "label": dataset["label"],
    }
)

In [54]:
# Stack the per-row probability lists into one 2-D array (one row per example).
probs_matrix = np.array(data_df["probs"].tolist())

In [55]:
# Flag suspected label errors from the given labels and predicted probabilities.
# With sorted_index_method=None the result is used below as a boolean mask;
# 'normalized_margin' would order the label errors instead.
given_labels = data_df["label"].to_numpy()
label_errors = get_noise_indices(s=given_labels, psx=probs_matrix, sorted_index_method=None)

In [56]:
# Number of examples flagged as potential label errors.
label_errors.sum()

19

In [37]:
def make_rec(row):
    """Build a Rubrix text-classification record from one MRPC row.

    Args:
        row: Object exposing ``sentence1``, ``sentence2``, ``probs`` and
            ``label`` attributes (here, a row of ``data_df``). ``probs`` is
            paired positionally with "Not equivalent" / "Equivalent".

    Returns:
        The ``rb.TextClassificationRecord`` holding the sentence pair as
        inputs, the per-class predictions attributed to ``model_name`` and the
        dataset label as the annotation (agent ``"mrpc"``).

    Raises:
        RuntimeError: If ``row.label`` is neither 0 nor 1.
    """
    label_names = ("Not equivalent", "Equivalent")
    preds = list(zip(label_names, row.probs))

    # Label ids follow the position in label_names: 0, 1.
    annot = dict(enumerate(label_names)).get(row.label)
    if annot is None:
        # Name the offending value so a bad row can be traced.
        raise RuntimeError(f"Unexpected MRPC label {row.label!r}; expected 0 or 1")

    return rb.TextClassificationRecord(
        inputs={"sentence1": row.sentence1, "sentence2": row.sentence2},
        prediction=preds,
        prediction_agent=model_name,
        annotation=annot,
        annotation_agent="mrpc",
    )

# Build one Rubrix record per row flagged as a potential label error.
recs = data_df.loc[label_errors].apply(func=make_rec, axis=1)

In [38]:
# Send the flagged records to the "mrpc_label_error" Rubrix dataset.
rb.log(name="mrpc_label_error", records=recs.tolist())

BulkResponse(dataset='mrpc_label_error', processed=19, failed=0)