# Labelling python notebook tool

This notebook allows for easy labelling of the artifact dataset.

In [5]:
import pandas as pd
import numpy as np
from utils_for_labelling_tool import get_images, plot_images
import ipympl
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))
from artifact_detector_model import MARKER_NAMES

### Data loading

In [None]:
# CSV holding the annotations collected so far.
annotation_file = "manual_annotations_full_new.csv"
# New annotations are written back to the same file that was read.
annotation_file_output = annotation_file
# Base table of candidate images (the original EMBED csv file).
base_csv_file = "/vol/biomedic3/data/EMBED/tables/mammo-net-csv/embed-non-negative.csv"

First, load the base dataset with the image ids, then filter out the images that have already been labelled.

In [None]:
# Load the annotations collected so far (the file with already labelled ids).
# On the very first labelling session the annotation file may not exist yet;
# in that case start from an empty table so nothing is filtered out below.
if os.path.exists(annotation_file):
    df_labelled = pd.read_csv(annotation_file)
else:
    print(f"{annotation_file} not found, starting with no existing annotations.")
    df_labelled = pd.DataFrame(columns=["image_path"])

# Load the base table and keep only the images that are not labelled yet.
# `.copy()` detaches the result from the full table, so columns can be added
# to it later without pandas raising SettingWithCopyWarning.
df_base = pd.read_csv(base_csv_file)
df_base = df_base[~df_base["image_path"].isin(df_labelled["image_path"].values)].copy()
# optional additional filtering
# df_base = df_base[df_base["spot_mag"] != 1]

Take a subset of the full dataset for labelling

In [None]:
# Number of images to draw for this labelling session.
num_samples = 60
num_classes = len(MARKER_NAMES)

if len(df_base) == 0:
    raise ValueError("No unlabelled images left in the base dataset.")

# Sample first, then add the label columns. This avoids building label arrays
# for every row of the base table and leaves df_base untouched, so this cell
# can be re-run to draw a fresh sample. The sample size is capped so that
# having fewer than `num_samples` unlabelled images left does not raise.
df_filtered = df_base.sample(min(num_samples, len(df_base))).copy()

# Make empty columns that will be filled in at labelling time
for marker in MARKER_NAMES:
    df_filtered[marker] = 0

# One label array per image, ordered like MARKER_NAMES (all zeros initially).
df_filtered["multilabel_markers"] = df_filtered.apply(
    lambda row: np.array([row[name] for name in MARKER_NAMES]), axis=1
)

# Load the image data for the sampled rows only.
df_filtered["images"] = get_images(df_filtered["image_path"])

### Labelling
Labelling tool in the form of an interactive plot. Clicking on an image sets that image's label for the marker selected by `label_index`. The order of the marker names, e.g. marker_names=['circle marker','triangle marker','breast implant', 'pacemaker','compression'], defines which index corresponds to which marker.

In [None]:
# Position in MARKER_NAMES of the marker being labelled in this pass;
# change this to change which marker label to update.
label_index = 2
print(f"You are updating label index {label_index} i.e. {MARKER_NAMES[label_index]}")

# Inputs handed to the interactive plot: the loaded images and the per-image
# label arrays (presumably updated by plot_images on click; see its definition
# in utils_for_labelling_tool).
images_to_label = df_filtered["images"]
current_labels = df_filtered["multilabel_markers"]
plot_options = dict(label_index=label_index, batch=2, n_samples_row=4)

plot_images(True, images_to_label, current_labels, **plot_options)

In [None]:
# Update the labelled dataset file with the new labels.
for i in range(len(MARKER_NAMES)):
    df_filtered[MARKER_NAMES[i]] = df_filtered["multilabel_markers"].apply(
        lambda x: x[i]
    )
df_filtered = df_filtered[
    [
        "image_path",
        "acc_anon",
        "empi_anon",
        "SeriesDescription",
    ] 
    +
    MARKER_NAMES
]
df_labelled_new = pd.concat((df_labelled, df_filtered))
df_labelled_new.to_csv(annotation_file_output, index=False)
print(f"Finished writing to {annotation_file_output}")

# Display statistics about current label file

In [None]:
# Read the annotation file back from disk so the statistics below describe
# what is actually stored in it.
df = pd.read_csv(filepath_or_buffer=annotation_file_output)

In [None]:
# Total number of rows in the annotation file.
df.shape[0]

In [None]:
# Absolute number of rows per label value, printed for each marker column.
for marker_name in MARKER_NAMES:
    label_counts = df[marker_name].value_counts()
    print(label_counts)

In [None]:
# Fraction of rows per label value, printed for each marker column.
for marker_name in MARKER_NAMES:
    label_fractions = df[marker_name].value_counts(normalize=True)
    print(label_fractions)