In [33]:
# !python3.10 -m pip install cleanlab
# !python3.10 -m pip install -U scikit-learn sentence-transformers datasets
# !python3.10 -m pip install -U "cleanlab[datalab]"
# !python3.10 -m pip install datasets
# !python3.10 -m pip install -U sentence-transformers

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m


https://huggingface.co/learn/cookbook/issues_in_text_dataset (ref)

In [9]:
from cleanlab import Datalab 

In [10]:
import random 
import numpy as np
import pandas as pd

np.random.seed(12345)
random.seed(12345)

In [20]:
from datasets import load_dataset

dataset = load_dataset("PolyAI/banking77", split='train')
data = pd.DataFrame(dataset[:1000])
data.head()

Unnamed: 0,text,label
0,I am still waiting on my card?,11
1,What can I do if my card still hasn't arrived ...,11
2,I have been waiting over a week. Is the card s...,11
3,Can I track my card while it is in the process...,11
4,"How do I know if I will get my card, or if it ...",11


In [49]:
raw_texts, labels = data['text'].values, data['label'].values
num_classes = len(set(labels))
len(raw_texts)

1000

In [50]:
raw_texts[:5]

array(['I am still waiting on my card?',
       "What can I do if my card still hasn't arrived after 2 weeks?",
       'I have been waiting over a week. Is the card still coming?',
       'Can I track my card while it is in the process of delivery?',
       'How do I know if I will get my card, or if it is lost?'],
      dtype=object)

In [62]:
from sentence_transformers import SentenceTransformer

transformer = SentenceTransformer("google/electra-small-discriminator")
text_embeddings = transformer.encode(raw_texts)

No sentence-transformers model found with name google/electra-small-discriminator. Creating a new one with MEAN pooling.


In [63]:
text_embeddings.shape

(1000, 256)

In [53]:
import re
import string
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

In [54]:
model = LogisticRegression(max_iter=400)

In [64]:
pred_probs = cross_val_predict(model, text_embeddings, labels, method='predict_proba')

In [65]:
pred_probs.shape

(1000, 7)

In [60]:
data_dict = {"texts": raw_texts, "labels": labels}

In [66]:
lab = Datalab(data_dict, label_name="labels")
lab.find_issues(pred_probs=pred_probs, features=text_embeddings)

Finding null issues ...
Finding label issues ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Finding outlier issues ...
Fitting OOD estimator based on provided features ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 62 issues found in the dataset.


In [67]:
lab.report()

Here is a summary of the different kinds of issues found in the data:

    issue_type  num_issues
       outlier          37
near_duplicate          14
         label          10
       non_iid           1

Dataset Information: num_examples: 1000, num_classes: 7


---------------------- outlier issues ----------------------

About this issue:
	Examples that are very different from the rest of the dataset 
    (i.e. potentially out-of-distribution or rare/anomalous instances).
    

Number of examples with this issue: 37
Overall dataset quality in terms of this issue: 0.3671

Examples representing most severe instances of this issue:
     is_outlier_issue  outlier_score
791              True       0.024866
601              True       0.031162
863              True       0.060738
355              True       0.064199
157              True       0.065075


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more example

In [68]:
label_issues = lab.get_issues("label")
label_issues.head()

Unnamed: 0,is_label_issue,label_score,given_label,predicted_label
0,False,0.903856,11,11
1,False,0.861113,11,11
2,False,0.653667,11,11
3,False,0.697994,11,11
4,False,0.435811,11,11


In [72]:
identified_label_issues = label_issues[label_issues["is_label_issue"] == True]
lowest_quality_labels = label_issues["label_score"].argsort()[:5].to_numpy()
lowest_quality_labels

array([379, 100, 300, 485, 413])

In [73]:
print(
    f"cleanlab found {len(identified_label_issues)} potential label errors in the dataset.\n"
    f"Here are indices of the top 5 most likely errors: \n {lowest_quality_labels}"
)

cleanlab found 10 potential label errors in the dataset.
Here are indices of the top 5 most likely errors: 
 [379 100 300 485 413]


In [74]:
data_with_suggested_labels = pd.DataFrame(
    {"text": raw_texts, "given_label": labels, "suggested_label": label_issues["predicted_label"]}
)
data_with_suggested_labels.iloc[lowest_quality_labels]

Unnamed: 0,text,given_label,suggested_label
379,Is there a specific source that the exchange r...,32,11
100,can you share card tracking number?,11,36
300,"If I need to cash foreign transfers, how does ...",32,46
485,Was I charged more than I should of been for a...,17,34
413,I was charged the wing amount for an item.,17,34


In [75]:
outlier_issues = lab.get_issues("outlier")
outlier_issues.sort_values("outlier_score").head()

Unnamed: 0,is_outlier_issue,outlier_score
791,True,0.024866
601,True,0.031162
863,True,0.060738
355,True,0.064199
157,True,0.065075


In [76]:
lowest_quality_outliers = outlier_issues["outlier_score"].argsort()[:5]

data.iloc[lowest_quality_outliers]

Unnamed: 0,text,label
791,withdrawal pending meaning?,46
601,$1 charge in transaction.,34
863,My atm withdraw is stillpending,46
355,explain the interbank exchange rate,32
157,"lost card found, want to put it back in app",13


In [77]:
duplicate_issues = lab.get_issues("near_duplicate")
duplicate_issues.sort_values("near_duplicate_score").head()

Unnamed: 0,is_near_duplicate_issue,near_duplicate_score,near_duplicate_sets,distance_to_nearest_neighbor
459,True,0.009548,[429],0.000566
429,True,0.009548,[459],0.000566
501,True,0.046052,"[412, 517]",0.002782
412,True,0.046052,[501],0.002782
698,True,0.054629,[607],0.003314


In [78]:
data.iloc[[459, 429]]

Unnamed: 0,text,label
459,I purchased something abroad and the incorrect...,17
429,I purchased something overseas and the incorre...,17


In [79]:
p_value = lab.get_info("non_iid")["p-value"]
p_value

0.0