## Cleaning annotation datasets

Postprocess the annotations exported from LightTag and save them in a dataset format.

In [None]:
from datasets import load_dataset
import json

# Read the prefiltered annotation export into memory.
with open('data_lightag/pii-prefilter-final.json') as prefilter_file:
    labels = json.loads(prefilter_file.read())

In [44]:
# Read the non-prefiltered annotation export into memory.
with open('data_lightag/pii-first-final.json') as no_filter_file:
    labels_no_filter = json.loads(no_filter_file.read())

In [45]:
# Inspect the nesting of the export: top-level keys, per-example keys, per-annotation keys.
# NOTE: `len(labels)` is the number of top-level keys, not a number of annotations.
print(f"level 0 format: {labels.keys()},size # annot: {len(labels)} # samples: {len(labels['examples'])}")
print(f"level 1 format: {labels['examples'][0].keys()}, size: {len(labels['examples'][0])}")
print(labels["examples"][0]["annotations"][0].keys())

level 0 format: dict_keys(['id', 'examples', 'schema', 'dataset', 'relations', 'name', 'annotators_per_example']),size # annot: 7 # samples: 400
level 1 format: dict_keys(['content', 'seen_by', 'metadata', 'example_id', 'annotations', 'classifications']), size: 6
level 1 format: dict_keys(['content', 'seen_by', 'metadata', 'example_id', 'annotations', 'classifications']), size: 6
dict_keys(['end', 'tag', 'start', 'value', 'tag_id', 'correct', 'reviewed', 'example_id', 'annotated_by', 'definition_id', 'tagged_token_id'])


In [46]:
# verify that all annotations are reviewed
# Flatten the review flag of every annotation of every example, then check them all at once.
reviewed_flags = [
    annotation["reviewed"]
    for example in labels["examples"]
    for annotation in example["annotations"]
]
assert all(flag == True for flag in reviewed_flags)
count = len(reviewed_flags)
print(f"total # reviewed annotations: {count}")
print(f"total # annotations: {sum([len(ex['annotations']) for ex in labels['examples']])}")

total # reviewed annotations: 1293
total # annotations: 1293


In [47]:
# remove examples where seen_by is empty
# Keep only the examples with a non-empty `seen_by` list.
labels_no_filter["examples"] = [
    example for example in labels_no_filter["examples"] if example["seen_by"]
]
print(f"# samples kept: {len(labels_no_filter['examples'])}")

# samples kept: 224


In [48]:
# verify that all annotations are reviewed
# Flatten the review flag of every annotation of every example, then check them all at once.
reviewed_flags = [
    annotation["reviewed"]
    for example in labels_no_filter["examples"]
    for annotation in example["annotations"]
]
assert all(flag == True for flag in reviewed_flags)
count = len(reviewed_flags)
print(f"total # reviewed annotations: {count}")
print(f"total # annotations: {sum([len(ex['annotations']) for ex in labels_no_filter['examples']])}")

total # reviewed annotations: 215
total # annotations: 215


All samples in the current datasets are annotated and reviewed:
* 400 samples in the prefiltered dataset, with 1293 annotations
* 224 samples in the non-filtered dataset, with 215 annotations

In [51]:
# Keep direct handles on the example lists of both exports.
ds_prefiltered, ds_no_filter = labels["examples"], labels_no_filter["examples"]

In [54]:
# Peek at the fields available on a single example.
ds_prefiltered[0].keys()

dict_keys(['content', 'seen_by', 'metadata', 'example_id', 'annotations', 'classifications'])

In [60]:
# `ds_prefiltered[0]["annotations"]` is a list; show its first entry to see the annotation fields.
ds_prefiltered[0]["annotations"][0]

{'end': 52,
 'tag': 'EMAIL',
 'start': 38,
 'value': 'davis@dlib.net',
 'tag_id': '76b4fbc7-b129-40ea-a6d3-8988fe626973',
 'correct': True,
 'reviewed': True,
 'example_id': '04f66c12-f51a-4f75-9206-5f67989b30a0',
 'annotated_by': [{'annotator': 'mi.lappert@gmail.com',
   'timestamp': '2022-11-05T10:46:50.415+00:00',
   'annotator_id': 9}],
 'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
 'tagged_token_id': 'dc6eca22-a38b-4d99-a527-a694dffb04a5'}

In [87]:
def process_example(example, context_size=50):
    """Flatten one annotated example into a single dataset row.

    Args:
        example: dict with the keys "content", "metadata" (providing "lang",
            "licenses", "repository_name" and "path"), "example_id" and
            "annotations" (each annotation providing "tag", "value", "start"
            and "end").
        context_size: number of characters of `content` kept on each side of
            an annotated span when building its "context" snippet.

    Returns:
        dict with the keys "content", "language", "license", "path",
        "annotation_id" and "pii", where "pii" is a JSON-encoded list of
        {"tag", "value", "start", "end", "context"} entries.
    """
    content = example["content"]
    metadata = example["metadata"]

    pii_entries = []
    for annotation in example["annotations"]:
        span_start, span_end = annotation["start"], annotation["end"]
        # Clamp the context window to the bounds of the file content.
        context_start = max(0, span_start - context_size)
        context_end = min(len(content), span_end + context_size)
        pii_entries.append({
            "tag": annotation["tag"],
            "value": annotation["value"],
            "start": span_start,
            "end": span_end,
            "context": content[context_start:context_end],
        })

    return {
        "content": content,
        "language": metadata["lang"],
        # Only the first listed license is kept.
        "license": metadata["licenses"][0],
        "path": metadata["repository_name"] + "/" + metadata["path"],
        "annotation_id": example["example_id"],
        # we save the pii in json to avoid not matching sizes per column in `datasets`
        "pii": json.dumps(pii_entries),
    }

In [None]:
# let's remove example 11
# issue with sample 34 '251926c8-f8cb-443f-99e0-77254d63430d': it contains a "password" that is missing from the labels

In [136]:
# Sanity-check `process_example` on the first prefiltered example.
res = process_example(ds_prefiltered[0])
print(res.keys())
# `pii` is stored as a JSON string, so decode it before displaying it.
pii = json.loads(res["pii"])
pii

dict_keys(['content', 'language', 'license', 'path', 'pii'])


[{'tag': 'EMAIL',
  'value': 'davis@dlib.net',
  'start': 38,
  'end': 52,
  'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boost Software Lice'},
 {'tag': 'NAME',
  'value': 'Miguel Grinberg',
  'start': 55,
  'end': 70,
  'context': 'pyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boost Software License   See LICENSE.'},
 {'tag': 'NAME',
  'value': 'Davis E. King',
  'start': 23,
  'end': 36,
  'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boo'}]

In [137]:
# build a dataset
import datasets
import pandas as pd

def build_dataset(labels):
    """Convert an annotation export into a `datasets.Dataset`.

    Every entry of `labels["examples"]` is passed through `process_example`;
    the resulting rows are collected in a pandas DataFrame, which is then
    converted with `datasets.Dataset.from_pandas`.
    """
    rows = []
    for example in labels["examples"]:
        rows.append(process_example(example))
    frame = pd.DataFrame(rows)
    return datasets.Dataset.from_pandas(frame)

In [138]:
# Build one dataset per export: prefiltered first, then non-filtered.
dataset_filter, dataset_no_filter = build_dataset(labels), build_dataset(labels_no_filter)

In [139]:
# Display the summary (features and row count) of the prefiltered dataset.
dataset_filter

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 400
})

In [140]:
# Display the summary (features and row count) of the non-filtered dataset.
dataset_no_filter

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 224
})

In [None]:
# The labeling of this sample was confusing: it contains many UUID-style ids and the context
# doesn't help to decide which of them are secrets. There are many of them (137), which can
# make the evaluation wrong.
pii = json.loads(dataset_filter[11]["pii"])
pii

In [201]:
# Remove examples 11 and 34; they are replaced
# with the samples -1 and -6 from the no_filter dataset.
dataset_filter2 = dataset_filter.select(
    [idx for idx in range(len(dataset_filter)) if idx not in (11, 34)]
)
dataset_filter2

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 398
})

In [183]:
# Pick the two replacement samples from the non-filtered dataset by absolute index.
# NOTE(review): 223 and 218 are positions -1 and -6 only while the dataset has 224 rows — confirm if the data changes.
two_samples = dataset_no_filter.select([223, 218])
two_samples

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 2
})

In [202]:
# Append the two replacement samples to the pruned prefiltered dataset.
dataset_filter_clean = datasets.concatenate_datasets([dataset_filter2, two_samples])
dataset_filter_clean

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 400
})

In [211]:
# Display the summary of the cleaned prefiltered dataset.
dataset_filter_clean

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 400
})

In [212]:
# Drop the two samples (indices 223 and 218) that were moved into the prefiltered dataset.
dataset_no_filter_clean = dataset_no_filter.select(
    [idx for idx in range(len(dataset_no_filter)) if idx not in (223, 218)]
)

In [213]:
# Display the summary of the cleaned non-filtered dataset.
dataset_no_filter_clean

Dataset({
    features: ['content', 'language', 'license', 'path', 'pii'],
    num_rows: 222
})

In [218]:
# Export both cleaned datasets as JSON files under `pii-for-code/data/`.
dataset_filter_clean.to_json("pii-for-code/data/prefiltered.json")
dataset_no_filter_clean.to_json("pii-for-code/data/not-prefiltered.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2513550

In [229]:
# Reload the non-prefiltered split through `load_dataset` to check the export.
# NOTE(review): "pii-for-code" presumably resolves to the local folder written above — confirm.
ds = load_dataset("pii-for-code", data_dir="data/not-prefiltered")

Using custom data configuration pii-for-code-706a5ee0c620e8b4


Downloading and preparing dataset json/pii-for-code to /Users/loubnabenallal/.cache/huggingface/datasets/json/pii-for-code-706a5ee0c620e8b4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/json/pii-for-code-706a5ee0c620e8b4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [231]:
# Load the same split from the `loubnabnl/pii-for-code` repository id; this rebinds `ds`.
ds = load_dataset("loubnabnl/pii-for-code", data_dir="data/not-prefiltered")

Downloading readme:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Using custom data configuration loubnabnl--pii-for-code-3e83605526768f46


Downloading and preparing dataset json/loubnabnl--pii-for-code to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___json/loubnabnl--pii-for-code-3e83605526768f46/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___json/loubnabnl--pii-for-code-3e83605526768f46/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [232]:
# Display the loaded dataset to confirm its features and row count.
ds

DatasetDict({
    train: Dataset({
        features: ['content', 'language', 'license', 'path', 'pii'],
        num_rows: 222
    })
})