## LightTag data post-processing
Let's download the dataset from LightTag using their API, post-process the data, and push it to the Hub (downloading from the LightTag UI doesn't work properly).

In [2]:
import os

import pandas as pd
import requests

# LightTag workspace sub-domain used to build the API URLs; change it to your own workspace.
LIGHTTAG_DOMAIN = "bigcodepii"
# `domain` is kept as an alias because the URL below is built from it.
domain = LIGHTTAG_DOMAIN
SERVER = f'https://{domain}.lighttag.io/api/'
API_BASE = SERVER + 'v1/'
# Credentials come from the environment so real secrets are never saved with the notebook.
# The "XX" placeholders remain the fallback when the variables are not set.
MY_USER = os.environ.get("LIGHTTAG_USER", "XX")
MY_PWD = os.environ.get("LIGHTTAG_PASSWORD", "XX")

In [3]:
# Exchange the username/password for an API token.
# A timeout is passed explicitly: without one, `requests` waits indefinitely
# when the server does not answer.
response = requests.post(
    SERVER + 'auth/token/login/',
    json={"username": MY_USER, "password": MY_PWD},
    timeout=30,
)
assert response.status_code == 200, "Couldn't authenticate"

In [4]:
# Decode the login payload and keep the API token it carries
auth_details = response.json()
token = auth_details['key']
# Stop here unless the account is flagged as a manager
is_manager = auth_details['is_manager']
assert is_manager == 1, "not a manager"

In [5]:
# One shared session that sends the token with every request
session = requests.session()
auth_header = {"Authorization": f"Token {token}"}
session.headers.update(auth_header)
# Quick check that the token works: list the projects
session.get(API_BASE + 'projects/').json()

[{'id': '87c47b2c-d503-4967-b314-c04d1e7f8be7',
  'slug': 'default',
  'url': 'http://bigcodepii.lighttag.io/api/v1/projects/default/',
  'name': 'default'}]

In [6]:
# Fetch the task definitions of the `default` project and display the one at index 1
task_definitions_url = API_BASE + 'projects/default/task_definitions/'
task_definitions = session.get(task_definitions_url).json()
task_definitions[1]

{'id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
 'name': 'PII labeling pre-filtered data',
 'slug': 'pii-labeling-pre-filtered-data',
 'url': 'http://bigcodepii.lighttag.io/api/v1/projects/default/task_definitions/pii-labeling-pre-filtered-data/',
 'allow_suggestions': True,
 'annotators_per_example': 1,
 'async_status': 'done',
 'archived': False,
 'priority': 1,
 'active': True,
 'guidelines': '## Task Overview\n\nWelcome to our annotation task. In this task we\'ll present you with one code file at a time and ask you to tag specific entities. We\'ll be using this data to evaluate PII detection tools on source code from different programming languages.   \n\n1. Please highlight the entire span for each tags where applicable. For example: For tag `NAME`, if the text presented has John Doe, please highlight John Doe as one span, instead of highlighting John and Doe separately.\n2. If you think a word that should be highlighted, but unsure about which tag to go to, use `AMBIGUOUS` instead

In [7]:
# Keep the URL of the task definition at index 1; the `download/` endpoint is built from it
# NOTE(review): the original comment called this the "test set" task definition — confirm
test_td = task_definitions[1]['url']
test_td

'http://bigcodepii.lighttag.io/api/v1/projects/default/task_definitions/pii-labeling-pre-filtered-data/'

In [8]:
# Call the task definition's `download/` endpoint and parse the JSON response
data = session.get(test_td+'download/').json()

In [9]:
data.keys()

dict_keys(['id', 'examples', 'schema', 'dataset', 'relations', 'name', 'annotators_per_example'])

In [10]:
# Work on the "examples" entry of the download; later cells overwrite its annotations in place
dataset = data["examples"]

In [11]:
dataset[0].keys()

dict_keys(['content', 'seen_by', 'comments', 'metadata', 'example_id', 'annotations', 'classifications'])

In [59]:
# Sanity check: every annotation must have reviewed == True
count = 0
for example in dataset:
    annotations = example["annotations"]
    assert all(annotation["reviewed"] == True for annotation in annotations)
    count += len(annotations)
total_annotations = sum(len(ex['annotations']) for ex in dataset)
print(f"total # reviewed annotations: {count}")
print(f"total # annotations: {total_annotations}")

total # reviewed annotations: 1318
total # annotations: 1318


In [39]:
# Locate the position of one specific example; `i` is left at that position for the next cell
target_example_id = "251926c8-f8cb-443f-99e0-77254d63430d"
for i, e in enumerate(dataset):
    if e["example_id"] != target_example_id:
        continue
    print(i)
    break

34


In [40]:
dataset[i]["annotations"]

[{'end': 33732,
  'tag': 'PASSWORD',
  'start': 33724,
  'value': 'password',
  'tag_id': 'd7945153-264b-42d8-8db8-e98fa49397a7',
  'correct': True,
  'reviewed': True,
  'example_id': '251926c8-f8cb-443f-99e0-77254d63430d',
  'annotated_by': [{'annotator': None,
    'timestamp': None,
    'annotator_id': None}],
  'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
  'tagged_token_id': '38e747c4-76f3-468e-a660-33c98f79c24a'},
 {'end': 33372,
  'tag': 'PASSWORD',
  'start': 33364,
  'value': 'wrongpwd',
  'tag_id': 'd7945153-264b-42d8-8db8-e98fa49397a7',
  'correct': True,
  'reviewed': True,
  'example_id': '251926c8-f8cb-443f-99e0-77254d63430d',
  'annotated_by': [{'annotator': 'shamik.bose89@gmail.com',
    'timestamp': '2022-11-05T16:51:02.317+00:00',
    'annotator_id': 10}],
  'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
  'tagged_token_id': 'ad3fcf9d-7c96-4db9-adf4-3c5e4852540e'}]

In [61]:
# Annotations rejected by reviewers stay in the data with "correct": False.
# Show both groups for one example before removing the rejected ones.
example = dataset[362]
accepted_annotations = [a for a in example["annotations"] if a["correct"]]
rejected_annotations = [a for a in example["annotations"] if not a["correct"]]
print(accepted_annotations)
print("\n")
print(rejected_annotations)

[{'end': 20662, 'tag': 'USERNAME', 'start': 20654, 'value': 'rheineke', 'tag_id': 'c1f50281-3cff-4eee-85d4-212d99963238', 'correct': True, 'reviewed': True, 'example_id': '00f9b923-ab9d-4373-a6e0-d6172f7999fd', 'annotated_by': [{'annotator': 'christopher.akiki@gmail.com', 'timestamp': '2022-11-08T14:47:37.582428+00:00', 'annotator_id': 4}], 'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399', 'tagged_token_id': 'e0ad7e43-6233-4346-98b0-f72f9b6b35ea'}, {'end': 55, 'tag': 'EMAIL', 'start': 32, 'value': 'reece.heineke@gmail.com', 'tag_id': '76b4fbc7-b129-40ea-a6d3-8988fe626973', 'correct': True, 'reviewed': True, 'example_id': '00f9b923-ab9d-4373-a6e0-d6172f7999fd', 'annotated_by': [{'annotator': 'loubnabenallal1999@gmail.com', 'timestamp': '2022-11-05T17:58:17.881+00:00', 'annotator_id': 1}], 'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399', 'tagged_token_id': 'ad00e862-d92a-4cc0-92ea-a0e97b70f5f7'}, {'end': 31, 'tag': 'NAME', 'start': 18, 'value': 'Reece Heineke', 'tag_id': '

In [74]:
# Remove annotations where "correct" is False and keep a record of what was dropped.
# `dataset` is modified in place: each example keeps only its correct annotations.
list_rejections = []
for i, example in enumerate(dataset):
    annotations = example["annotations"]
    # only keep correct annotations (after review)
    correct_annotations = [a for a in annotations if a["correct"]]
    list_rejections.extend(a for a in annotations if not a["correct"])
    # update dataset
    dataset[i]["annotations"] = correct_annotations

kept_total = sum(len(ex['annotations']) for ex in dataset)
print(f"kept {kept_total} annotations, rejected {len(list_rejections)} annotations")

kept 1016 annotations, rejected 302 annotations


In [27]:
# remove annotations where "correct" is False
# NOTE(review): this cell repeats the filtering cell right before it. When the cells run
# in order, `dataset` already holds only correct annotations at this point, so this pass
# collects no rejections. `rejected` and `kept` are assigned but never read in this cell.
rejected = 0
kept = 0
list_rejections = []
for i, example in enumerate(dataset):
    # only keep correct annotations (after review)
    correct_annotations = [a for a in example["annotations"] if a["correct"]]
    list_rejections += [a for a in example['annotations'] if not a['correct']]
    #update dataset
    dataset[i]["annotations"] = correct_annotations
    
print(f"kept {sum([len(ex['annotations']) for ex in dataset])} annotations, rejected {len(list_rejections)} annotations")

kept 1016 annotations, rejected 302 annotations


### Prepare the dataset for `datasets` library

In [29]:
import json
import datasets
import pandas as pd
from copy import deepcopy

def convert_key(example):
    """Return a deep copy of ``example`` with "API_KEY"/"SSH_KEY" tags renamed to "KEY".

    The input dict is never modified; any other tag is left as it is.
    """
    merged = deepcopy(example)
    is_key_tag = merged["tag"] in ("API_KEY", "SSH_KEY")
    if is_key_tag:
        merged["tag"] = "KEY"
    return merged

def process_example(example, context_window=50):
    """Flatten one annotated example into a row for the `datasets` library.

    Args:
        example: dict with "content", "example_id", "annotations" and a
            "metadata" dict providing "lang", "licenses", "repository_name"
            and "path". Each annotation provides "tag", "value", "start"
            and "end".
        context_window: number of characters kept on each side of an
            annotation in its "context" field. Defaults to 50.

    Returns:
        dict with keys "content", "language", "license", "path",
        "annotation_id", "pii" and "pii_modified". The last two are JSON
        strings holding one entry per annotation; in "pii_modified" the
        entries have gone through `convert_key`.
    """
    content = example["content"]
    metadata = example["metadata"]
    new_sample = {
        "content": content,
        "language": metadata["lang"],
        # only the first listed license is kept
        "license": metadata["licenses"][0],
        "path": metadata["repository_name"] + "/" + metadata["path"],
        "annotation_id": example["example_id"],
    }
    pii_entries = []
    pii_modified_entries = []
    for annotation in example["annotations"]:
        pii = {"tag": annotation["tag"],
               "value": annotation["value"],
               "start": annotation["start"],
               "end": annotation["end"]}
        # surrounding text, clamped to the bounds of the file content
        start = max(0, pii["start"] - context_window)
        end = min(len(content), pii["end"] + context_window)
        pii["context"] = content[start:end]
        pii_entries.append(pii)
        # column with one Key tag for both API Keys and SSH keys
        pii_modified_entries.append(convert_key(pii))
    # we save the pii in json to avoid not matching sizes per column in `datasets`
    new_sample["pii"] = json.dumps(pii_entries)
    new_sample["pii_modified"] = json.dumps(pii_modified_entries)
    return new_sample

In [30]:
# small test: convert a single example and display its decoded "pii" column
res = process_example(dataset[360])
print(res.keys())
pii = json.loads(res["pii"])
pii

dict_keys(['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'])


[{'tag': 'API_KEY',
  'value': '476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com',
  'start': 6842,
  'end': 6914,
  'context': 'onScheme;\n\n                   options.ClientId = "476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com";\n                   options.ClientSecret = "rSHv'},
 {'tag': 'API_KEY',
  'value': '99eb0b9d-ca40-476e-b5ac-6f4c32bfb530',
  'start': 7336,
  'end': 7372,
  'context': ' false };\n                    options.ClientId = "99eb0b9d-ca40-476e-b5ac-6f4c32bfb530";\n                    options.CallbackPath = "/si'},
 {'tag': 'USERNAME',
  'value': 'aspnet',
  'start': 7783,
  'end': 7789,
  'context': '         // And\n            // https://github.com/aspnet/Docs/issues/2384#issuecomment-297980490\n         '},
 {'tag': 'USERNAME',
  'value': 'openiddict',
  'start': 7692,
  'end': 7702,
  'context': ' env)\n        {\n            // https://github.com/openiddict/openiddict-core/issues/518\n            // And\n   '},

In [31]:
pii = json.loads(res["pii_modified"])
pii

[{'tag': 'KEY',
  'value': '476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com',
  'start': 6842,
  'end': 6914,
  'context': 'onScheme;\n\n                   options.ClientId = "476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com";\n                   options.ClientSecret = "rSHv'},
 {'tag': 'KEY',
  'value': '99eb0b9d-ca40-476e-b5ac-6f4c32bfb530',
  'start': 7336,
  'end': 7372,
  'context': ' false };\n                    options.ClientId = "99eb0b9d-ca40-476e-b5ac-6f4c32bfb530";\n                    options.CallbackPath = "/si'},
 {'tag': 'USERNAME',
  'value': 'aspnet',
  'start': 7783,
  'end': 7789,
  'context': '         // And\n            // https://github.com/aspnet/Docs/issues/2384#issuecomment-297980490\n         '},
 {'tag': 'USERNAME',
  'value': 'openiddict',
  'start': 7692,
  'end': 7702,
  'context': ' env)\n        {\n            // https://github.com/openiddict/openiddict-core/issues/518\n            // And\n   '},
 {'tag'

In [32]:
# build a dataset
def build_dataset(data):
    """Convert an iterable of raw examples into a `datasets.Dataset`.

    Every example goes through `process_example`; the resulting rows are
    gathered in a pandas DataFrame and converted with `Dataset.from_pandas`.
    """
    rows = list(map(process_example, data))
    return datasets.Dataset.from_pandas(pd.DataFrame(rows))

In [549]:
hf_dataset = build_dataset(dataset)

In [34]:
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [35]:
hf_dataset.to_json("prefiltered_v2.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3861956

In [36]:
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

### Manual reviewing of the dataset

In [51]:
i = -1

In [None]:
# Advance the review cursor `i` by one and list the tag and value of every
# annotation of that row (re-run this cell to step through the dataset)
i += 1
pii = json.loads(hf_dataset[i]["pii"])
for annot in pii:
    print(i, annot["tag"], annot["value"])

* Example 34 has a wrong password, example 158 has a wrong IP address, example 206 has a wrong password, and example 241 has many API keys that don't look like tokens and are probably passwords.

In [550]:
# Attach each row's position as a new "id" column
row_ids = list(range(len(hf_dataset)))
hf_dataset = hf_dataset.add_column("id", row_ids)
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [551]:
def update_pii(example):
    """Apply the manual-review corrections to one dataset row.

    Rows are selected by their "id" value:
      * 34 and 206: the first annotation (a wrong password) is dropped;
      * 158: the annotation at index 3 (a wrong IP address) is dropped;
      * 241: annotations whose value is "AIzaasdf" are retagged "PASSWORD".
    Every other row is returned as an unchanged copy.

    The same correction is applied to the "pii" and "pii_modified" columns
    separately, each decoded from its own JSON string, so "pii_modified"
    keeps its own tags for the untouched annotations instead of being
    overwritten with the content of "pii".

    Args:
        example: row dict with "id", "pii" and "pii_modified" (JSON strings).

    Returns:
        A deep copy of ``example`` with the corrected columns.
    """
    new_example = deepcopy(example)
    row_id = example["id"]
    if row_id not in (34, 206, 158, 241):
        return new_example

    for column in ("pii", "pii_modified"):
        # read from the row itself rather than from a notebook-global dataset
        annotations = json.loads(example[column])
        if row_id in (34, 206):
            # remove wrong password
            annotations = annotations[1:]
        elif row_id == 158:
            # remove wrong IP_ADDRESS
            annotations = annotations[:3] + annotations[4:]
        else:
            # row 241: this value is tagged as a key but is treated as a password
            for annotation in annotations:
                if annotation["value"] == "AIzaasdf":
                    annotation["tag"] = "PASSWORD"
        new_example[column] = json.dumps(annotations)
    return new_example

In [552]:
hf_dataset_2 = hf_dataset.map(update_pii, features=hf_dataset.features)

  0%|          | 0/400 [00:00<?, ?ex/s]

In [553]:
hf_dataset_2 = hf_dataset_2.remove_columns(["id"])

In [558]:
hf_dataset_2

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [557]:
# Spot-check one corrected row. The index is not named `id`, so the built-in
# `id()` is not shadowed in the notebook namespace.
example_idx = 34
pii = json.loads(hf_dataset_2[example_idx]["pii"])
for annot in pii:
    print(example_idx, annot["tag"], annot["value"])

34 PASSWORD wrongpwd


In [None]:
# remove the file at index 11: it has many ambiguous keys
# remove the file at index 51: it has many incorrect names

In [569]:
# Keep every row except the ones at positions 11 and 51
excluded_rows = (11, 51)
hf_dataset_3 = hf_dataset_2.select(
    (row for row in range(len(hf_dataset_2)) if row not in excluded_rows)
)



In [565]:
# Load a second set of labels from a local JSON file (path relative to the working directory)
# NOTE(review): presumably a LightTag download of another task saved to disk — confirm
with open('data_lightag/pii-first-final.json') as f:
    labels_no_filter = json.load(f)

def build_dataset_n(labels):
    """Build a `datasets.Dataset` from a dict that stores its examples under "examples".

    Delegates to `build_dataset` so both entry points share a single
    conversion path instead of two copies of the same code.
    """
    return build_dataset(labels["examples"])

# Keep only the examples whose "seen_by" entry is non-empty
seen_examples = [
    example for example in labels_no_filter["examples"] if example["seen_by"]
]
labels_no_filter["examples"] = seen_examples
print(f"# samples kept: {len(labels_no_filter['examples'])}")

dataset_no_filter = build_dataset_n(labels_no_filter)

# samples kept: 224


In [568]:
# Take rows 223 and 218 of the second dataset; a later cell concatenates them with the cleaned set
# NOTE(review): the reason for choosing these two indices is not recorded here
two_samples = dataset_no_filter.select([223, 218])
two_samples

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 2
})

In [571]:
ds_clean = datasets.concatenate_datasets([hf_dataset_3, two_samples])
ds_clean

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [573]:
ds_clean.to_json("prefiltered_final.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3724333

In [574]:
ds_clean.push_to_hub("dummy_data_clean")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]