# Tutorial: detecting label errors in a dataset

## Q: What are samples with label errors?

The figure below illustrates what samples with label errors look like.

In [None]:
# Set path
import sys
import os
# Remember the directory the kernel started in, so that re-running this cell
# always lands in the same parent directory instead of climbing one level
# further up on every execution.
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))
o_path = os.getcwd()
print(o_path)
# Set path so that modules from other folders can be loaded; skip the append
# when the entry is already present (e.g. on a re-run of this cell).
if o_path not in sys.path:
    sys.path.append(o_path)
from IPython.display import Image

#### The figure above shows 10 selected CIFAR-100N images with their ground-truth labels (first row of text) and human-annotated labels (second row of text). Red-circled text indicates that the image's ``human annotation`` differs from its ``ground-truth label``. Thus, the images with red-circled labels contain label errors.

### Don't worry about label errors in your data. Docta can help you detect them!

## Docta Experiment Image Classification

In [None]:
import torch

# Select GPU 0 when CUDA is present. On a machine without CUDA,
# torch.cuda.set_device(0) raises, so report the situation instead of
# failing with an opaque error in the very first compute cell.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "CUDA not available (no GPU selected)"
    print("Warning: CUDA is not available; no GPU was selected.")
# Bare last expression so the notebook still displays the device name.
device_name

In [None]:
# Execute the Docta image-classification script from the current working directory.
# NOTE(review): the cells below use names that are never defined in this notebook
# (np, cfg, dataset_raw, label_error, label_curation, index_to_class,
# ImageDataset10Classes) — presumably this script leaves them in the
# interactive namespace; confirm against the script.
%run ./tools/docta_image_classification.py

In [None]:
import matplotlib.pyplot as plt

# Column 1 of label_error and column 2 of label_curation are the values that are
# thresholded and stored as "*_confidence" further down, so the x-axis shows
# confidence scores rather than class labels.
fig, ax = plt.subplots()
ax.hist(label_error[:, 1], alpha=0.5, label='Label Error')
ax.hist(label_curation[:, 2], alpha=0.5, label='Label Curation')
ax.set_xlabel('Confidence')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Label Error and Label Curation')
ax.legend()
plt.show()

In [None]:
# A sample is treated as a label error only when both scores clear their
# thresholds: curation confidence (column 2) and error confidence (column 1).
# NOTE(review): combining the two masks assumes label_error and label_curation
# have the same number of rows in the same sample order — confirm.
curation_confident = label_curation[:, 2] > 0.3
error_confident = label_error[:, 1] > 0.95
sel = curation_confident & error_confident
print(f"Found {np.sum(sel)} label errors from {len(dataset_raw)} samples")

# generate cured labels: start from a copy of the raw labels and overwrite the
# selected samples (column 0 = sample index, column 1 = suggested class index)
cured_labels = np.array(dataset_raw.label)
selected_rows = label_curation[sel]
cured_labels[selected_rows[:, 0].astype(int)] = selected_rows[:, 1].astype(int)

save_path = cfg.save_path + f"{cfg.embedding_model.split('/')[-1]}_{cfg.crop}_cured_labels_{cfg.dataset_type}.pt"
torch.save(cured_labels, save_path)
print(f"Saved cured labels to {save_path}")


## A helper function for visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
def visualize_result(num_show, data, noisy_label, cured_labels, label_name):
    """Plot up to ``num_show`` samples whose noisy label differs from the cured label.

    Samples are scanned in index order. Every mismatch is drawn in a 3-column
    grid, titled with the original ("Expert") and cured ("Suggested") class names.

    Args:
        num_show: Maximum number of mismatching samples to draw.
        data: Dataset object; must support ``len()`` and expose its images as
            ``data.feature[i]`` in a form ``imshow`` accepts.
        noisy_label: Original labels, indexable by sample position.
        cured_labels: Cured labels, indexable by sample position.
        label_name: Sequence mapping a class index to its display name.
    """
    if num_show <= 0:
        # Nothing requested; also avoids asking matplotlib for a 0-row grid.
        return
    plt.rcParams["figure.figsize"] = (20,24)
    n_cols = 3
    # Round up so the grid still has room for every panel when num_show is not
    # a multiple of the column count (rounding down would overflow the grid).
    n_rows = int((num_show + n_cols - 1) // n_cols)
    cnt = 0
    # Iterate over the dataset that was passed in rather than a notebook-level
    # global, so the function works for any caller-supplied dataset.
    for i in range(len(data)):
        if noisy_label[i] != cured_labels[i]:
            ax = plt.subplot(n_rows, n_cols, cnt + 1)
            ax.imshow(data.feature[i])
            ax.set_title(f'Expert: {label_name[noisy_label[i]]}.\nSuggested: {label_name[cured_labels[i]]}.')
            ax.axis('off')
            cnt += 1
            if cnt == num_show:
                break
    plt.show()

## Visualize results

In [None]:
# Build the training dataset whose images and (noisy) labels are displayed.
dataset = ImageDataset10Classes(cfg, train=True)
feature = dataset.feature
noisy_label = dataset.label

# Class names ordered by class index, so label_name[k] is the name of class k.
label_name = [index_to_class[class_idx] for class_idx in range(len(index_to_class))]

# Show the first 18 samples whose label was changed by the curation step.
num_show = 18
visualize_result(
    num_show=num_show,
    data=dataset,
    noisy_label=noisy_label,
    cured_labels=cured_labels,
    label_name=label_name,
)


In [None]:
# Start from the identifying columns of the raw metadata; .copy() keeps the
# additions below from touching dataset_raw.image_metadata.
corrupt_labels_df = dataset_raw.image_metadata[["_id", "filename", "expert_classification"]].copy()
corrupt_labels_df["clip_model"] = cfg.embedding_model.split('/')[-1]

# Column 0 of each result array is used as the row label for .loc below.
# NOTE(review): this presumably relies on image_metadata having a default
# 0..n-1 index so that row labels equal sample positions — confirm.
error_rows = label_error[:, 0].astype(int)
curation_rows = label_curation[:, 0].astype(int)

# Error-detection confidence; rows without a detection result stay NaN.
corrupt_labels_df["label_error_confidence"] = np.nan
corrupt_labels_df.loc[error_rows, "label_error_confidence"] = label_error[:, 1]

# Suggested class name; rows without a curation result stay as an empty string.
corrupt_labels_df["label_curation"] = ""
corrupt_labels_df.loc[curation_rows, "label_curation"] = [
    index_to_class[class_idx] for class_idx in label_curation[:, 1].astype(int).tolist()
]

# Confidence of the suggested class; rows without a curation result stay NaN.
corrupt_labels_df["label_curation_confidence"] = np.nan
corrupt_labels_df.loc[curation_rows, "label_curation_confidence"] = label_curation[:, 2]

In [None]:
# Write the table as CSV (without the index column) under the cfg.save_path
# prefix; the file name encodes the model and run settings.
# NOTE(review): cfg.crop appears twice in the file name — the second occurrence
# may have been meant to be cfg.dataset_type, as in the cured-labels file name;
# confirm before changing, since it alters the output path.
corrupt_labels_csv_path = cfg.save_path + f"{cfg.embedding_model.split('/')[-1]}_{cfg.crop}_corrupt_labels_{cfg.crop}_hoc_{cfg.hoc_cfg.max_step}_simfeat_{cfg.detect_cfg.num_epochs}.csv"
corrupt_labels_df.to_csv(corrupt_labels_csv_path, index=False)

In [None]:
# Bare last expression: the notebook renders the assembled table as this cell's output.
corrupt_labels_df