## LighTag data post-processing
Let's download the dataset from LightTag using their API and postprocess the data + push it to the hub (downloading from the LightTag's UI doesn't work properly)

In [2]:
import requests
import pandas as pd

LIGHTTAG_DOMAIN = 'demo'  #should be your lighttag domain
domain = "bigcodepii"
SERVER = f'https://{domain}.lighttag.io/api/'
API_BASE = SERVER +'v1/'
MY_USER="XX"
MY_PWD="XX"

In [3]:
response = requests.post(SERVER+'auth/token/login/',
              json={"username":MY_USER,"password":MY_PWD})
assert response.status_code ==200, "Couldn't authenticate"

In [4]:
auth_details = response.json()
token = auth_details['key']
assert auth_details['is_manager'] ==1, "not a manager" # Check you are a manager

In [5]:
session = requests.session()
session.headers.update({"Authorization":"Token {token}".format(token=token)})
#Try it out
session.get(API_BASE+'projects/').json()

[{'id': '87c47b2c-d503-4967-b314-c04d1e7f8be7',
  'slug': 'default',
  'url': 'http://bigcodepii.lighttag.io/api/v1/projects/default/',
  'name': 'default'}]

In [6]:
task_definitions = (session.get(API_BASE+'projects/default/task_definitions/').json())
task_definitions[1]

{'id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
 'name': 'PII labeling pre-filtered data',
 'slug': 'pii-labeling-pre-filtered-data',
 'url': 'http://bigcodepii.lighttag.io/api/v1/projects/default/task_definitions/pii-labeling-pre-filtered-data/',
 'allow_suggestions': True,
 'annotators_per_example': 1,
 'async_status': 'done',
 'archived': False,
 'priority': 1,
 'active': True,
 'guidelines': '## Task Overview\n\nWelcome to our annotation task. In this task we\'ll present you with one code file at a time and ask you to tag specific entities. We\'ll be using this data to evaluate PII detection tools on source code from different programming languages.   \n\n1. Please highlight the entire span for each tags where applicable. For example: For tag `NAME`, if the text presented has John Doe, please highlight John Doe as one span, instead of highlighting John and Doe separately.\n2. If you think a word that should be highlighted, but unsure about which tag to go to, use `AMBIGUOUS` instead

In [7]:
test_td = task_definitions[1]['url'] # Get the url for the test set task definition
test_td

'http://bigcodepii.lighttag.io/api/v1/projects/default/task_definitions/pii-labeling-pre-filtered-data/'

In [8]:
data = session.get(test_td+'download/').json()

In [9]:
data.keys()

dict_keys(['id', 'examples', 'schema', 'dataset', 'relations', 'name', 'annotators_per_example'])

In [10]:
dataset = data["examples"]

In [11]:
dataset[0].keys()

dict_keys(['content', 'seen_by', 'comments', 'metadata', 'example_id', 'annotations', 'classifications'])

In [59]:
# verify that all annotations are reviewed
count = 0
for example in dataset:
    for annotation in example["annotations"]:
        assert annotation["reviewed"] == True
        count += 1
print(f"total # reviewed annotations: {count}")
print(f"total # annotations: {sum([len(ex['annotations']) for ex in dataset])}")

total # reviewed annotations: 1318
total # annotations: 1318


In [39]:
for i, e in enumerate(dataset):
    if e["example_id"] == "251926c8-f8cb-443f-99e0-77254d63430d":
        print(i)
        break

34


In [40]:
dataset[i]["annotations"]

[{'end': 33732,
  'tag': 'PASSWORD',
  'start': 33724,
  'value': 'password',
  'tag_id': 'd7945153-264b-42d8-8db8-e98fa49397a7',
  'correct': True,
  'reviewed': True,
  'example_id': '251926c8-f8cb-443f-99e0-77254d63430d',
  'annotated_by': [{'annotator': None,
    'timestamp': None,
    'annotator_id': None}],
  'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
  'tagged_token_id': '38e747c4-76f3-468e-a660-33c98f79c24a'},
 {'end': 33372,
  'tag': 'PASSWORD',
  'start': 33364,
  'value': 'wrongpwd',
  'tag_id': 'd7945153-264b-42d8-8db8-e98fa49397a7',
  'correct': True,
  'reviewed': True,
  'example_id': '251926c8-f8cb-443f-99e0-77254d63430d',
  'annotated_by': [{'annotator': 'shamik.bose89@gmail.com',
    'timestamp': '2022-11-05T16:51:02.317+00:00',
    'annotator_id': 10}],
  'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399',
  'tagged_token_id': 'ad3fcf9d-7c96-4db9-adf4-3c5e4852540e'}]

In [61]:
# example annotations: those rejected by reviewers are kept with the tag correct: False, let's remove them
example = dataset[362]
print([a for a in example["annotations"] if a["correct"]])
print("\n")
print([a for a in example["annotations"] if not a["correct"]])

[{'end': 20662, 'tag': 'USERNAME', 'start': 20654, 'value': 'rheineke', 'tag_id': 'c1f50281-3cff-4eee-85d4-212d99963238', 'correct': True, 'reviewed': True, 'example_id': '00f9b923-ab9d-4373-a6e0-d6172f7999fd', 'annotated_by': [{'annotator': 'christopher.akiki@gmail.com', 'timestamp': '2022-11-08T14:47:37.582428+00:00', 'annotator_id': 4}], 'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399', 'tagged_token_id': 'e0ad7e43-6233-4346-98b0-f72f9b6b35ea'}, {'end': 55, 'tag': 'EMAIL', 'start': 32, 'value': 'reece.heineke@gmail.com', 'tag_id': '76b4fbc7-b129-40ea-a6d3-8988fe626973', 'correct': True, 'reviewed': True, 'example_id': '00f9b923-ab9d-4373-a6e0-d6172f7999fd', 'annotated_by': [{'annotator': 'loubnabenallal1999@gmail.com', 'timestamp': '2022-11-05T17:58:17.881+00:00', 'annotator_id': 1}], 'definition_id': 'a6e0c132-cd58-45d1-8dd5-5af8cdc6d399', 'tagged_token_id': 'ad00e862-d92a-4cc0-92ea-a0e97b70f5f7'}, {'end': 31, 'tag': 'NAME', 'start': 18, 'value': 'Reece Heineke', 'tag_id': '

In [74]:
# remove annotations where "correct" is False
rejected = 0
kept = 0
list_rejections = []
for i, example in enumerate(dataset):
    # only keep correct annotations (after review)
    correct_annotations = [a for a in example["annotations"] if a["correct"]]
    list_rejections += [a for a in example['annotations'] if not a['correct']]
    #update dataset
    dataset[i]["annotations"] = correct_annotations
    
print(f"kept {sum([len(ex['annotations']) for ex in dataset])} annotations, rejected {len(list_rejections)} annotations")

kept 1016 annotations, rejected 302 annotations


In [27]:
# remove annotations where "correct" is False
rejected = 0
kept = 0
list_rejections = []
for i, example in enumerate(dataset):
    # only keep correct annotations (after review)
    correct_annotations = [a for a in example["annotations"] if a["correct"]]
    list_rejections += [a for a in example['annotations'] if not a['correct']]
    #update dataset
    dataset[i]["annotations"] = correct_annotations
    
print(f"kept {sum([len(ex['annotations']) for ex in dataset])} annotations, rejected {len(list_rejections)} annotations")

kept 1016 annotations, rejected 302 annotations


### Prepare the dataset for `datasets` library

In [51]:
import json
import datasets
import pandas as pd
from copy import deepcopy

def convert_key(example):
    sample = deepcopy(example)
    if sample["tag"] in ["API_KEY", "SSH_KEY"]:
        sample["tag"] = "KEY"
    return sample

def process_example(example):
    new_sample = {}
    new_sample["content"] = example["content"]
    new_sample["language"] = example["metadata"]["lang"]
    new_sample["license"] = example["metadata"]["licenses"][0]
    new_sample["path"] = example["metadata"]["repository_name"] + "/" + example["metadata"]["path"]
    new_sample["annotation_id"] = example["example_id"]
    new_sample["pii"] = []
    new_sample["pii_modified"] = []
    for annotation in example["annotations"]:
        pii = {"tag": annotation["tag"],
                "value": annotation["value"],
                "start": annotation["start"],
                "end": annotation["end"]}
        start = max(0, pii["start"] - 50)
        end = min(len(new_sample["content"]), pii["end"] + 50)
        pii["context"] = new_sample["content"][start:end]
        # column with one Key tag for both API Keys and SSH keys
        modified_pii = convert_key(pii)
        new_sample["pii"].append(pii)
        new_sample["pii_modified"].append(modified_pii)
    # we save the pii in json to avoid not matching sizes per column in `datasets`
    new_sample["pii"] = json.dumps(new_sample["pii"])
    new_sample["pii_modified"] = json.dumps(new_sample["pii_modified"])
    return new_sample

In [30]:
# small test
res = process_example(dataset[360])
print(res.keys())
pii = json.loads(res["pii"])
pii

dict_keys(['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'])


[{'tag': 'API_KEY',
  'value': '476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com',
  'start': 6842,
  'end': 6914,
  'context': 'onScheme;\n\n                   options.ClientId = "476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com";\n                   options.ClientSecret = "rSHv'},
 {'tag': 'API_KEY',
  'value': '99eb0b9d-ca40-476e-b5ac-6f4c32bfb530',
  'start': 7336,
  'end': 7372,
  'context': ' false };\n                    options.ClientId = "99eb0b9d-ca40-476e-b5ac-6f4c32bfb530";\n                    options.CallbackPath = "/si'},
 {'tag': 'USERNAME',
  'value': 'aspnet',
  'start': 7783,
  'end': 7789,
  'context': '         // And\n            // https://github.com/aspnet/Docs/issues/2384#issuecomment-297980490\n         '},
 {'tag': 'USERNAME',
  'value': 'openiddict',
  'start': 7692,
  'end': 7702,
  'context': ' env)\n        {\n            // https://github.com/openiddict/openiddict-core/issues/518\n            // And\n   '},

In [31]:
pii = json.loads(res["pii_modified"])
pii

[{'tag': 'KEY',
  'value': '476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com',
  'start': 6842,
  'end': 6914,
  'context': 'onScheme;\n\n                   options.ClientId = "476611152863-ltgqfk9jhq1vsenin5039n58ogkraltb.apps.googleusercontent.com";\n                   options.ClientSecret = "rSHv'},
 {'tag': 'KEY',
  'value': '99eb0b9d-ca40-476e-b5ac-6f4c32bfb530',
  'start': 7336,
  'end': 7372,
  'context': ' false };\n                    options.ClientId = "99eb0b9d-ca40-476e-b5ac-6f4c32bfb530";\n                    options.CallbackPath = "/si'},
 {'tag': 'USERNAME',
  'value': 'aspnet',
  'start': 7783,
  'end': 7789,
  'context': '         // And\n            // https://github.com/aspnet/Docs/issues/2384#issuecomment-297980490\n         '},
 {'tag': 'USERNAME',
  'value': 'openiddict',
  'start': 7692,
  'end': 7702,
  'context': ' env)\n        {\n            // https://github.com/openiddict/openiddict-core/issues/518\n            // And\n   '},
 {'tag'

In [32]:
# build a dataset
def build_dataset(data):
    df = pd.DataFrame([process_example(ex) for ex in data])
    dataset = datasets.Dataset.from_pandas(df)
    return dataset

In [549]:
hf_dataset = build_dataset(dataset)

In [34]:
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [35]:
hf_dataset.to_json("prefiltered_v2.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3861956

In [36]:
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

### Manual reviewing of the dataset

In [51]:
i = -1

In [None]:
i += 1
pii = json.loads(hf_dataset[i]["pii"])
for annot in pii:
    print(i, annot["tag"], annot["value"])

* example 34 has a wrong password, example 158 has a wrong IP address, 206 has a wrong passwor and 241 has many API keys that don't look like tokens and are propably passwords

In [550]:
# add id column to dataset as a new column
hf_dataset = hf_dataset.add_column("id", [i for i in range(len(hf_dataset))])
hf_dataset

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [551]:
def update_pii(example):
    new_example = deepcopy(example)
    if example["id"] in [34, 206]:
        pii = json.loads(example["pii"])
        # remove wrong password
        new_example["pii"] = json.dumps(pii[1:])
        new_example["pii_modified"] = json.dumps(pii[1:])

    elif example["id"] == 158:
        pii = json.loads(hf_dataset[158]["pii"])
        # remove wrong IP_ADDRESS
        pii = pii[:3] + pii[4:]
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)

    elif example["id"] == 241:
        pii = json.loads(example["pii"])
        for id, e in enumerate(pii):
            if e["value"] == "AIzaasdf":
                pii[id]["tag"] = "PASSWORD"
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
    return new_example

In [552]:
hf_dataset_2 = hf_dataset.map(update_pii, features=hf_dataset.features)

  0%|          | 0/400 [00:00<?, ?ex/s]

In [553]:
hf_dataset_2 = hf_dataset_2.remove_columns(["id"])

In [558]:
hf_dataset_2

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [557]:
id = 34
pii = json.loads(hf_dataset_2[id]["pii"])
for annot in pii:
    print(id, annot["tag"], annot["value"])

34 PASSWORD wrongpwd


In [None]:
# remove file at index 11 many ambiguous keys
# remove file 51 many incorrect names

In [569]:
hf_dataset_3 = hf_dataset_2.select((i for i in range(len(hf_dataset_2)) if i!=11 and i!=51))



In [52]:
with open('data_lightag/pii-first-final.json') as f:
    labels_no_filter = json.load(f)

def build_dataset_n(labels):
    import pandas as pd
    examples = labels["examples"]
    df = pd.DataFrame([process_example(ex) for ex in examples])
    dataset = datasets.Dataset.from_pandas(df)
    return dataset

# remove examples where seen_by is empty
L = []
for example in labels_no_filter["examples"]:
    if example["seen_by"]:
        L.append(example)
labels_no_filter["examples"] = L
print(f"# samples kept: {len(labels_no_filter['examples'])}")

dataset_no_filter = build_dataset_n(labels_no_filter)

# samples kept: 224


In [568]:
two_samples = dataset_no_filter.select([223, 218])
two_samples

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 2
})

In [None]:

ds_clean = datasets.concatenate_datasets([hf_dataset_3, two_samples])
ds_clean

In [573]:
ds_clean.to_json("prefiltered_final.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3724333

In [574]:
ds_clean.push_to_hub("dummy_data_clean")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

### More cleaning after we ran detect-secrets and spotted mis annotations

In [1]:
import json
from datasets import load_dataset

ds = load_dataset("bigcode/pii-for-code", use_auth_token=True, split="train")

Using custom data configuration bigcode--pii-for-code-72a7cff2b59e4251
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--pii-for-code-72a7cff2b59e4251/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


In [3]:
i = 317
pii = json.loads(ds[i]["pii"])
print("pii", pii)
print(ds[i]["content"][4098:])

pii []
eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.m1U63blB0MLej_WfB7yC2FTMnCziif9X8yzwDEfJXAg
 *      responses:
 *        "204":
 *          description: No content
 *        "404":
 *          $ref: '#/components/responses/NotFound'
 */

/**
 * @swagger
 * paths:
 *  /auth/refresh-tokens:
 *    post:
 *      summary: Refresh auth tokens
 *      tags: [Auth]
 *      requestBody:
 *        required: true
 *        content:
 *          application/json:
 *            schema:
 *              type: object
 *              required:
 *                - refreshToken
 *              properties:
 *                refreshToken:
 *                  type: string
 *              example:
 *                refreshToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.m1U63blB0MLej_WfB7yC2FTMnCziif9X8yzwDEfJXAg
 *      responses:
 *   

In [4]:
i = 315
pii = json.loads(ds[i]["pii"])
print("pii", pii)
print(ds[i]["content"][17731:])

pii [{'tag': 'NAME', 'value': 'Satoshi Nakamoto', 'start': 22, 'end': 38, 'context': '// Copyright (c) 2010 Satoshi Nakamoto\n// Original Code: Copyright (c) 2009-2014 The Bit'}]
12tvKAXCxZjSmdNbao16dKXC8tRWfcF5oc\"   (string) bitmark address\n"
            "           ,...\n"
            "         ]\n"
            "       }\n"
            "     }\n"
            "     ,...\n"
            "  ],\n"
            "}\n"

            "\nExamples:\n"
            + HelpExampleCli("decoderawtransaction", "\"hexstring\"")
            + HelpExampleRpc("decoderawtransaction", "\"hexstring\"")
        );

    vector<unsigned char> txData(ParseHexV(params[0], "argument"));
    CDataStream ssData(txData, SER_NETWORK, PROTOCOL_VERSION);
    CTransaction tx;
    try {
        ssData >> tx;
    }
    catch (std::exception &e) {
        throw JSONRPCError(RPC_DESERIALIZATION_ERROR, "TX decode failed");
    }

    Object result;
    TxToJSON(tx, 0, result);

    return result;
}

Value decodescript(const A

In [5]:
#318
i=318
pii = json.loads(ds[i]["pii"])
print("pii", pii)
print(ds[i]["content"][2308:])

pii [{'tag': 'PASSWORD', 'value': 'aa7097a2-f2fb-11e7-a565-0a580a28057d', 'start': 1535, 'end': 1571, 'context': "_at: '2018-01-06T16:11:04.393590+00:00',\n    id: 'aa7097a2-f2fb-11e7-a565-0a580a28057d',\n    label: 'fix: Remove break-word behavior on "}, {'tag': 'SSH_KEY', 'value': 'eff634a68a01d081c0bdc51752dfa0709781f0e4', 'start': 2308, 'end': 2348, 'context': "break-word behavior on coverage\\n',\n        sha: 'eff634a68a01d081c0bdc51752dfa0709781f0e4'\n      }\n    },\n    started_at: '2018-01-06T16:07"}, {'tag': 'EMAIL', 'value': 'dcramer@gmail.com', 'start': 2012, 'end': 2029, 'context': "  revision: {\n        author: {\n          email: 'dcramer@gmail.com',\n          id: '659dc21c-81db-11e7-988a-0a580a28"}, {'tag': 'PASSWORD', 'value': '63e820d4-81db-11e7-a6df-0a580a28004e', 'start': 3093, 'end': 3129, 'context': "00',\n    full_name: 'gh/getsentry/zeus',\n    id: '63e820d4-81db-11e7-a6df-0a580a28004e',\n    latest_build: null,\n    name: 'zeus',\n    o"}, {'tag': 'PASSWORD'

In [7]:
i=326
pii = json.loads(ds[i]["pii"])
print("pii", pii)
print(ds[i]["content"][1141:])

pii []
qua3H8VlBuvExmhX2b7wKgYO',
        'redirect' => 'http://www.gunny.site/callback',
    ],
];


In [10]:
i=377
pii = json.loads(ds[i]["pii"])
print("pii", pii)
print(ds[i]["content"][:])

pii [{'tag': 'SSH_KEY', 'value': '4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb', 'start': 42, 'end': 106, 'context': 'cask "tip" do\n  version "2.0.0"\n  sha256 "4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb"\n\n  url "https://github.com/tanin47/tip/releases/'}]
cask "tip" do
  version "2.0.0"
  sha256 "4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb"

  url "https://github.com/tanin47/tip/releases/download/v#{version}/Tip.zip"
  name "Tip"
  desc "Programmable tooltip that can be used with any app"
  homepage "https://github.com/tanin47/tip"

  app "Tip.app"

  zap trash: "~/Library/Application Scripts/tanin.tip"
end



In [12]:
ds[0].keys()

dict_keys(['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'])

In [13]:
# add new column with id= row number
ds = ds.add_column("id", [i for i in range(len(ds))])
ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [18]:
print(ds[208]["content"][:])
# delete 208 and replcae it with something else

#include "googletest/googletest/include/gtest/gtest.h"

#include "environment.h"

#include "json/json_spirit_reader_template.h"
#include "json/json_spirit_utils.h"
#include "json/json_spirit_writer_template.h"

#include "base58.h"
#include "util.h"

using namespace json_spirit;
extern Array read_json(const std::string& filename);

// Goal: test low-level base58 encoding functionality
TEST(base58_tests, base58_EncodeBase58)
{
    Array tests = read_json("base58_encode_decode.json");

    for (Value& tv : tests) {
        Array       test    = tv.get_array();
        std::string strTest = write_string(tv, false);
        if (test.size() < 2) // Allow for extra stuff (useful for comments)
        {
            ADD_FAILURE() << "Bad test: " << strTest;
            continue;
        }
        std::vector<unsigned char> sourcedata   = ParseHex(test[0].get_str());
        std::string                base58string = test[1].get_str();
        EXPECT_EQ(EncodeBase58(&sourcedata[0], &sourcedata[so

In [None]:

res315 = """{'tag': 'KEY',
      'value': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.',
      'start': 4098,
      'end': 4227},
     {'tag': 'KEY',
      'value': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.',
      'start': 4894,
      'end': 5023}]}"""
res317 = """{'tag': 'KEY',
      'value': '12tvKAXCxZjSmdNbao16dKXC8tRWfcF5oc\\',
      'start': 17731,
      'end': 17766},
     {'tag': 'KEY',
      'value': '1LtvqCaApEdUGFkpKMM4MstjcaL4dKg8SP\\\\\\',
      'start': 9587,
      'end': 9624},
     {'tag': 'KEY',
      'value': '1LtvqCaApEdUGFkpKMM4MstjcaL4dKg8SP\\\\\\',
      'start': 9737,
      'end': 9774},
     {'tag': 'KEY',
      'value': '1PGFqEzfmQch1gKD3ra4k18PNj3tTUUSqg\\\\\\',
      'start': 9544,
      'end': 9581},
     {'tag': 'KEY',
      'value': '1PGFqEzfmQch1gKD3ra4k18PNj3tTUUSqg\\\\\\',
      'start': 9694,
      'end': 9731}]}"""
# key res318 should be removed
res318 = """{'tag': 'KEY',
      'value': 'eff634a68a01d081c0bdc51752dfa0709781f0e4',
      'start': 2308,
      'end': 2348,
      'context': "break-word behavior on coverage\\n',\n        sha: 'eff634a68a01d081c0bdc51752dfa0709781f0e4'\n      }\n    },\n    started_at: '2018-01-06T16:07"}],
    'FP': []}"""
res326 = """{'tag': 'KEY',
      'value': 'qua3H8VlBuvExmhX2b7wKgYO',
      'start': 1141,
      'end': 1165}]}"""
res377 = """{'tag': 'KEY',
      'value': '4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb',
      'start': 42,
      'end': 106,
      'context': 'cask "tip" do\n  version "2.0.0"\n  sha256 "4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb"\n\n  url "https://github.com/tanin47/tip/releases/'}],
    'FP': []}"""

In [92]:
def update_pii(example):
    from copy import deepcopy
    new_example = deepcopy(example)
    pii = json.loads(example["pii"])
    if example["id"] == 232:
        # remove key
        new_pii = [key for key in pii if key["value"] != "43e0352fee07fa5b92dd22e557cb1d050ccde0cf97273e02f694930695b15134"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}")
    
    elif example["id"] == 239:
        # add key
        key_1 = {"tag": "KEY", "value": example["content"][5026:5054], "start": 5026, "end": 5054}
        key_2 = {"tag": "KEY", "value": example["content"][10032:10060], "start": 10032, "end": 10060}
        pii.append(key_1)
        pii.append(key_2)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}")

    elif example["id"] == 269:
        # remove key
        new_pii = [key for key in pii if key["value"] != "c9eb8a1102d0a68cafc93f22df73445b8f69706f3322285f9a2f623a28df0176"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}")

    elif example["id"] == 315:
        # add missing keys
        key_1 = {"tag": "KEY", "value": example["content"][17731:17766], "start": 17731, "end": 17766}
        key_2 = {"tag": "KEY", "value": example["content"][9587:9624], "start": 9587, "end": 9624}
        key_3 = {"tag": "KEY", "value": example["content"][9737:9774], "start": 9737, "end": 9774}
        key_4 = {"tag": "KEY", "value": example["content"][9544:9581], "start": 9544, "end": 9581}
        key_5 = {"tag": "KEY", "value": example["content"][9694:9731], "start": 9694, "end": 9731}
        pii.append(key_1)
        pii.append(key_2)
        pii.append(key_3)
        pii.append(key_4)
        pii.append(key_5)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}")

    elif example["id"] == 317:
        # add 5 missing keys
        key_1 = {"tag": "KEY", "value": example["content"][4098:4227], "start": 4098, "end": 4227}
        key_2 = {"tag": "KEY", "value": example["content"][4894:5023], "start": 4894, "end": 5023}
        pii.append(key_1)
        pii.append(key_2)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)

    elif example["id"] == 318:
        # remove key
        new_pii = [key for key in pii if key["value"] != "eff634a68a01d081c0bdc51752dfa0709781f0e4"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}")
    
    elif example["id"] == 326:
        # add missing key
        key_1 = {"tag": "KEY", "value": example["content"][1141:1165], "start": 1141, "end": 1165}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}")

    elif example["id"] == 377:
        # remove key
        new_pii = [key for key in pii if key["value"] != "4d986a461d1b24bb5776fb49063b9a1891939f336b306a6bc75f58d0a4e98bcb"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}")
    
    return new_example

In [45]:
new_ds = ds.map(update_pii)

  0%|          | 0/400 [00:00<?, ?ex/s]

new pii for 232: [{'tag': 'USERNAME', 'value': 'WebStorm', 'start': 214, 'end': 222, 'context': 'bStorm-#{version}-custom-jdk-bundled.dmg"\n  name \'WebStorm\'\n  homepage \'http://www.jetbrains.com/webstorm/\'\n'}]
new pii for 239: [{'tag': 'PASSWORD', 'value': 'AIzaasdf', 'start': 7889, 'end': 7897, 'context': 'n/json")\n\n        client = googlemaps.Client(key="AIzaasdf")\n        b = client._get("/bar", {}, extract_bod'}, {'tag': 'PASSWORD', 'value': 'AIzaasdf', 'start': 6861, 'end': 6869, 'context': 'n/json\')\n\n        client = googlemaps.Client(key="AIzaasdf")\n        with self.assertRaises(googlemaps.excep'}, {'tag': 'PASSWORD', 'value': 'AIzaasdf', 'start': 2938, 'end': 2946, 'context': 'i/geocode/json?"\n                            "key=AIzaasdf&address=Sesame+St.",\n                            '}, {'tag': 'PASSWORD', 'value': 'AIzaasdf', 'start': 2107, 'end': 2115, 'context': 'on/json")\n        client = googlemaps.Client(key="AIzaasdf",\n                                  

In [46]:
new_ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [57]:
one_sample = dataset_no_filter.select([17])
hf_ds = new_ds.select((i for i in range(len(new_ds)) if i!=208))



In [59]:
hf_ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 399
})

In [60]:
ds_clean = datasets.concatenate_datasets([hf_ds, one_sample])
ds_clean

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [65]:
ds_clean.push_to_hub("dummy_data2")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/454 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [66]:
ds_clean.to_json("prefiltered_data.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3724499

In [67]:
dd = load_dataset("pii-for-code/data")

Using custom data configuration data-490d13ba0a302c92


Downloading and preparing dataset json/data to /Users/loubnabenallal/.cache/huggingface/datasets/json/data-490d13ba0a302c92/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/json/data-490d13ba0a302c92/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [70]:
dd["train"][208]

{'content': '"use strict";\n\nvar _interopRequireDefault = require("@babel/runtime/helpers/builtin/interopRequireDefault");\n\nObject.defineProperty(exports, "__esModule", {\n  value: true\n});\nexports.default = void 0;\n\nvar _react = _interopRequireDefault(require("react"));\n\nvar _createSvgIcon = _interopRequireDefault(require("./utils/createSvgIcon"));\n\nvar _default = (0, _createSvgIcon.default)(_react.default.createElement(_react.default.Fragment, null, _react.default.createElement("path", {\n  d: "M9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4zm2 2H5V5h14v14zm2-16H3v18h18V3z"\n})), \'InsertChartOutlinedSharp\');\n\nexports.default = _default;',
 'language': 'JavaScript',
 'license': 'MIT',
 'path': 'GJCHOWDARY/chowdary_react/node_modules/@material-ui/icons/InsertChartOutlinedSharp.js',
 'annotation_id': '5157a4f9-7197-402f-807d-f944390b0334',
 'pii': '[]',
 'pii_modified': '[]',
 'id': 209.0}

### More cleaning

In [93]:
import json
from datasets import load_dataset

ds = load_dataset("bigcode/pii-for-code", use_auth_token=True, split="train")

Using custom data configuration bigcode--pii-for-code-a9a3a14b9b71dff0
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--pii-for-code-a9a3a14b9b71dff0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


In [128]:
ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [129]:
# add id column to dataset as a new column
ds = ds.remove_columns(["id"])
ds = ds.add_column("id", [i for i in range(len(ds))])
ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [None]:


# remove key
res73 = """'FN': [{'tag': 'KEY',
      'value': '6622386f8d83dc9efefb8c03a4dbfc18e7928d89ffc2ec3e2feb9473e8f410c9',
      'start': 57,
      'end': 121,
      'context': "ockrattler' do\n  version '4.15,2018.11'\n  sha256 '6622386f8d83dc9efefb8c03a4dbfc18e7928d89ffc2ec3e2feb9473e8f410c9'\n\n  # eclecticlightdotcom.files.wordpress.com was"}],
    'FP': []}}}"""


res365 = """'FP': [{'tag': 'KEY',
      'value': '23fedee89bcadec0487bf990c2c714d1',
      'start': 3116,
      'end': 3148}]}}}"""

# remove key
res137 = """'FN': [{'tag': 'KEY',
      'value': '57ed642b-1ee3-47b3-be6d-b7297d500409',
      'start': 3842,
      'end': 3878,
      'context': 'nal UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409");\n\n    /**\n     * The default scheduler store ve'}],
    'FP': []}}}"""

#remove key
res188 = """'FN': [{'tag': 'KEY',
      'value': '546d57b6c88c2be7517759c016c0bf0313dfcc14adfcb43967f3c5d24657f366',
      'start': 61,
      'end': 125,
      'context': 'owser-for-sqlite" do\n  version "3.12.2"\n  sha256 "546d57b6c88c2be7517759c016c0bf0313dfcc14adfcb43967f3c5d24657f366"\n\n  url "https://github.com/sqlitebrowser/sqliteb'}]"""

#remove key
res221 = """'FN': [{'tag': 'KEY',
      'value': '76d8ae334545bbdf2db49414c25d2cfd8685e7b6187f119b28e93ad9c5118e9d',
      'start': 54,
      'end': 118,
      'context': 'ion https://git-lfs.github.com/spec/v1\noid sha256:76d8ae334545bbdf2db49414c25d2cfd8685e7b6187f119b28e93ad9c5118e9d\nsize 4727\n'}]"""

res316 = """    'FP': [{'tag': 'EMAIL',
      'value': 'fake@example.com',
      'start': 1796,
      'end': 1812},
     {'tag': 'EMAIL', 'value': 'fake@example.com', 'start': 2904, 'end': 2920},
     {'tag': 'EMAIL',
      'value': 'fake@example.com',
      'start': 5893,
      'end': 5909}]}"""

res317 = """    'FP': [{'tag': 'EMAIL',
      'value': 'git@github.com',
      'start': 3233,
      'end': 3247}]}"""

res346 = """    'FP': [{'tag': 'EMAIL',
      'value': 'oli@dwslab.de',
      'start': 1971,
      'end': 1984}]}"""

res355 = """'FP': [{'tag': 'EMAIL',
      'value': 'license@prestashop.com',
      'start': 435,
      'end': 457}]}"""

res362 = """    'FP': [{'tag': 'EMAIL',
      'value': 'bootrax@rackspace.com',
      'start': 3785,
      'end': 3806}]}"""

In [117]:
print(ds[365]["content"][3106:])

Token || '23fedee89bcadec0487bf990c2c714d1',
        });


        // Sample Options:
        var defaultOptions = {
            "product": {
                "layout": "horizontal",
                    "variantId": "all",
                    "width": "100%",
                    "contents": {
                    "img": false,
                        "imgWithCarousel": true,
                        "variantTitle": false,
                        "description": true,
                        "buttonWithQuantity": false,
                        "quantity": false
                },
                "styles": {
                    "product": {
                        "text-align": "left",
                            "@media (min-width: 601px)": {
                            "max-width": "100%",
                                "margin-left": "0",
                                "margin-bottom": "50px"
                        }
                    },
                    "title": {
               

In [132]:
def update_pii(example):
    from copy import deepcopy
    new_example = deepcopy(example)
    pii = json.loads(example["pii"])
    if example["id"] == 73:
        new_pii = [key for key in pii if key["value"] != "6622386f8d83dc9efefb8c03a4dbfc18e7928d89ffc2ec3e2feb9473e8f410c9"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 137:
        new_pii = [key for key in pii if key["value"] != "57ed642b-1ee3-47b3-be6d-b7297d500409"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 188:
        new_pii = [key for key in pii if key["value"] != "546d57b6c88c2be7517759c016c0bf0313dfcc14adfcb43967f3c5d24657f366"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")

    elif example["id"] == 221:
        new_pii = [key for key in pii if key["value"] != "76d8ae334545bbdf2db49414c25d2cfd8685e7b6187f119b28e93ad9c5118e9d"]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 355:
        key_1 = {"tag": "EMAIL", "value": example["content"][435:457], "start": 435, "end": 457}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 365:
        # add missing keys
        key_1 = {"tag": "KEY", "value": example["content"][3116:3148], "start": 3116, "end": 3148}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 316:
        # add missing keys
        key_1 = {"tag": "EMAIL", "value": example["content"][1796:1812], "start": 1796, "end": 1812}
        key_2 = {"tag": "EMAIL", "value": example["content"][2904:2920], "start": 2904, "end": 2920}
        key_3 = {"tag": "EMAIL", "value": example["content"][5893:5909], "start": 5893, "end": 5909}
        pii.append(key_1)
        pii.append(key_2)
        pii.append(key_3)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")
    
    elif example["id"] == 317:
        # add missing keys
        key_1 = {"tag": "EMAIL", "value": example["content"][3233:3247], "start": 3233, "end": 3247}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")
    
    elif example["id"] == 346:
        key_1 = {"tag": "EMAIL", "value": example["content"][1971:1984], "start": 1971, "end": 1984}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 362:
        key_1 = {"tag": "EMAIL", "value": example["content"][3785:3806], "start": 3785, "end": 3806}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    return new_example

In [133]:
new_ds = ds.map(update_pii, load_from_cache_file=False)

  0%|          | 0/400 [00:00<?, ?ex/s]

new pii for 73: []

new pii for 137: []

new pii for 188: []

new pii for 221: []

new pii for 316: [{'tag': 'KEY', 'value': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.', 'start': 4098, 'end': 4227}, {'tag': 'KEY', 'value': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI1ZWJhYzUzNDk1NGI1NDEzOTgwNmMxMTIiLCJpYXQiOjE1ODkyOTg0ODQsImV4cCI6MTU4OTMwMDI4NH0.', 'start': 4894, 'end': 5023}, {'tag': 'EMAIL', 'value': 'fake@example.com', 'start': 1796, 'end': 1812}, {'tag': 'EMAIL', 'value': 'fake@example.com', 'start': 2904, 'end': 2920}, {'tag': 'EMAIL', 'value': 'fake@example.com', 'start': 5893, 'end': 5909}]

new pii for 317: [{'tag': 'PASSWORD', 'value': 'aa7097a2-f2fb-11e7-a565-0a580a28057d', 'start': 1535, 'end': 1571, 'context': "_at: '2018-01-06T16:11:04.393590+00:00',\n    id: 'aa7097a2-f2fb-11e7-a565-0a580a28057d',\n    label: 'fix: Remove break-word behavior on "}, {'tag': 'EMAIL', 'value': 'dcram

In [None]:

res16 = """    'FP': [{'tag': 'IP_ADDRESS',
      'value': '72.44.36.12',
      'start': 1193,
      'end': 1204}]}"""

res60 = """   'IP_ADDRESS': {'FN': [{'tag': 'IP_ADDRESS',
      'value': '224.250.0.2',
      'start': 1600,
      'end': 1611,
      'context': ';239.192.0.1:7500\n//       link-local;224.250.0.1,224.250.0.2;224.250.0.3:8000\n//       ;[fe80::1%en0]:7500\nint'},
     {'tag': 'IP_ADDRESS',
      'value': '224.250.0.3:8000',
      'start': 1612,
      'end': 1628,
      'context': ':7500\n//       link-local;224.250.0.1,224.250.0.2;224.250.0.3:8000\n//       ;[fe80::1%en0]:7500\nint zmq::pgm_socket_'},
     {'tag': 'IP_ADDRESS',
      'value': '239.192.0.1:7500',
      'start': 1551,
      'end': 1567,
      'context': ' & multicast group decls>:<IP port>\n//  e.g. eth0;239.192.0.1:7500\n//       link-local;224.250.0.1,224.250.0.2;224.2'}],
    'FP': [{'tag': 'IP_ADDRESS',
      'value': '239.192.0.1',
      'start': 1551,
      'end': 1562},
     {'tag': 'IP_ADDRESS',
      'value': '224.250.0.3',
      'start': 1612,
      'end': 1623}]}"""

res70 = """    'FP': [{'tag': 'IP_ADDRESS',
      'value': '87.248.113.14',
      'start': 6973,
      'end': 6986}]}"""

res189 = """    'FP': [{'tag': 'IP_ADDRESS',
      'value': 'FF02::FB',
      'start': 3326,
      'end': 3334},
     {'tag': 'IP_ADDRESS',
      'value': '224.0.0.251',
      'start': 3536,
      'end': 3547}]}"""

res214 = """    'FP': [{'tag': 'IP_ADDRESS',
      'value': '127.0.0.1',
      'start': 293,
      'end': 302}]}"""

res254 = """    'FP': [{'tag': 'IP_ADDRESS',
      'value': '192.168.0.9',
      'start': 1514,
      'end': 1525}"""

In [134]:
def update_pii_2(example):
    from copy import deepcopy
    new_example = deepcopy(example)
    pii = json.loads(example["pii"])
    if example["id"] == 16:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][1193:1204], "start": 1193, "end": 1204}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")
    
    elif example["id"] == 70:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][6973:6986], "start": 6973, "end": 6986}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 189:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][3326:3334], "start": 3326, "end": 3334}
        key_2 = {"tag": "IP_ADDRESS", "value": example["content"][3536:3547], "start": 3536, "end": 3547}
        pii.append(key_1)
        pii.append(key_2)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 214:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][293:302], "start": 293, "end": 302}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")
    
    elif example["id"] == 254:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][1514:1525], "start": 1514, "end": 1525}
        pii.append(key_1)
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")
    
    elif example["id"] == 60:
        # remove key
        # get key that has the value "224.250.0.3:8000"
        key_1 = [key for key in pii if key["value"] == "224.250.0.3:8000"][0]
        key_2 = [key for key in pii if key["value"] == "239.192.0.1:7500"][0]
        key_1["value"] =  "224.250.0.3"
        key_2["value"] = "239.192.0.1"
        key_1["end"] = 1623
        key_2["end"] = 1562
        new_pii = [x for x in pii if x["value"] not in ["224.250.0.3:8000", "239.192.0.1:7500"]]
        new_pii.extend((key_1, key_2))
        print(f"new pii for {example['id']}: {new_pii}\n")
    return new_example        

In [135]:
new_ds_2 = new_ds.map(update_pii_2, load_from_cache_file=False)

  0%|          | 0/400 [00:00<?, ?ex/s]

new pii for 16: [{'tag': 'IP_ADDRESS', 'value': '72.44.36.12', 'start': 1336, 'end': 1347, 'context': '  "72.44.36.12".convert_from_ec2_to_ip.should == "72.44.36.12"\n    end\n    it "should be able to parse the date'}, {'tag': 'IP_ADDRESS', 'value': '72.44.36.12', 'start': 1289, 'end': 1300, 'context': 'an error if another string is returned" do\n      "72.44.36.12".convert_from_ec2_to_ip.should == "72.44.36.12"\n '}, {'tag': 'IP_ADDRESS', 'value': '72.44.36.12', 'start': 1193, 'end': 1204}]

new pii for 60: [{'tag': 'IP_ADDRESS', 'value': '224.250.0.1', 'start': 1588, 'end': 1599, 'context': '/  e.g. eth0;239.192.0.1:7500\n//       link-local;224.250.0.1,224.250.0.2;224.250.0.3:8000\n//       ;[fe80::1%e'}, {'tag': 'IP_ADDRESS', 'value': '224.250.0.2', 'start': 1600, 'end': 1611, 'context': ';239.192.0.1:7500\n//       link-local;224.250.0.1,224.250.0.2;224.250.0.3:8000\n//       ;[fe80::1%en0]:7500\nint'}, {'tag': 'IP_ADDRESS', 'value': '224.250.0.3', 'start': 1612, 'end': 1623, 'co

In [136]:
new_ds_2.to_json("data_final.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3723704

In [None]:
#more filtering
res45 = """'FP': [{'tag': 'IP_ADDRESS', 'value': '1.2.10.1', 'start': 31, 'end': 39}"""
res161 = """'FP': [{'tag': 'IP_ADDRESS',
      'value': '47.04.66.11',
      'start': 855,
      'end': 866},
     {'tag': 'IP_ADDRESS', 'value': '42.06.16.1', 'start': 900, 'end': 910},
     {'tag': 'IP_ADDRESS', 'value': '73.27.21.6', 'start': 1136, 'end': 1146},
     {'tag': 'IP_ADDRESS', 'value': '98.53.38.14', 'start': 1150, 'end': 1161},
     {'tag': 'IP_ADDRESS', 'value': '1.27.36.39', 'start': 1168, 'end': 1178},
     {'tag': 'IP_ADDRESS', 'value': '13.1.22.22', 'start': 1208, 'end': 1218},
     {'tag': 'IP_ADDRESS', 'value': '7.93.31.27', 'start': 1413, 'end': 1423},
     {'tag': 'IP_ADDRESS', 'value': '1.15.66.46', 'start': 1430, 'end': 1440},
     {'tag': 'IP_ADDRESS', 'value': '1.58.25.53', 'start': 1450, 'end': 1460},
     {'tag': 'IP_ADDRESS', 'value': '2.39.19.66', 'start': 1745, 'end': 1755},
     {'tag': 'IP_ADDRESS', 'value': '1.6.34.41', 'start': 1766, 'end': 1775},
     {'tag': 'IP_ADDRESS', 'value': '1.23.89.48', 'start': 1782, 'end': 1792},
     {'tag': 'IP_ADDRESS', 'value': '1.59.28.59', 'start': 1804, 'end': 1814},
     {'tag': 'IP_ADDRESS', 'value': '58.06.81.17', 'start': 2235, 'end': 2246},
     {'tag': 'IP_ADDRESS', 'value': '11.44.29.6', 'start': 2250, 'end': 2260},
     {'tag': 'IP_ADDRESS',
      'value': '99.08.41.13',
      'start': 2279,
      'end': 2290}]}"""

res297 = """'FP': [{'tag': 'IP_ADDRESS',
      'value': '20.0.0.2',
      'start': 27297,
      'end': 27305},
     {'tag': 'IP_ADDRESS',
      'value': '60c0:a800::6',
      'start': 27337,
      'end': 27349},
     {'tag': 'IP_ADDRESS', 'value': '20.0.0.7', 'start': 27771, 'end': 27779},
     {'tag': 'IP_ADDRESS',
      'value': '60c0:a800::7',
      'start': 27811,
      'end': 27823},
     {'tag': 'IP_ADDRESS', 'value': '20.0.0.3', 'start': 28238, 'end': 28246},
     {'tag': 'IP_ADDRESS',
      'value': '60c0:a800::4',
      'start': 28278,
      'end': 28290},
     {'tag': 'IP_ADDRESS', 'value': '20.0.0.6', 'start': 29843, 'end': 29851},
     {'tag': 'IP_ADDRESS',
      'value': '60c0:a800::3',
      'start': 29883,
      'end': 29895},
     {'tag': 'IP_ADDRESS', 'value': '20.0.0.4', 'start': 30324, 'end': 30332},
     {'tag': 'IP_ADDRESS',
      'value': '60c0:a800::8',
      'start': 30364,
      'end': 30376}]},"""

In [151]:
new_ds_2

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id'],
    num_rows: 400
})

In [150]:
print(new_ds_2["content"][297][27507:])

False, ip_version)
        counters_sanity_check.append(1)

    def test_rules_priority_forwarded(self, setup, direction, ptfadapter, counters_sanity_check, ip_version):
        """Verify that we respect rule priorities in the forwarding case."""
        src_ip = "20.0.0.7" if ip_version == "ipv4" else "60c0:a800::7"
        pkt = self.tcp_packet(setup, direction, ptfadapter, ip_version, src_ip=src_ip)

        self._verify_acl_traffic(setup, direction, ptfadapter, pkt, False, ip_version)
        counters_sanity_check.append(20)

    def test_rules_priority_dropped(self, setup, direction, ptfadapter, counters_sanity_check, ip_version):
        """Verify that we respect rule priorities in the drop case."""
        src_ip = "20.0.0.3" if ip_version == "ipv4" else "60c0:a800::4"
        pkt = self.tcp_packet(setup, direction, ptfadapter, ip_version, src_ip=src_ip)

        self._verify_acl_traffic(setup, direction, ptfadapter, pkt, True, ip_version)
        counters_sanity_check.append(7)

In [None]:

res52 = """    'FN': [{'tag': 'IP_ADDRESS',
      'value': '0.0.0.0',
      'start': 42191,
      'end': 42198,
      'context': 'ev_version.IsValid())\n    prev_version = Version("0.0.0.0");\n  Version current_version(version_info.Version'}],"""

res89 = """    'FN': [{'tag': 'IP_ADDRESS',
      'value': '4.0.0.0',
      'start': 942,
      'end': 949,
      'context': 'm.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")]\n    [global::System.Diagnostics.DebuggerNonUse'}]"""

res129 = """
     {'tag': 'IP_ADDRESS',
      'value': '1.1.1.1',
      'start': 23855,
      'end': 23862,
      'context': 'ample"\n    primary     = true\n    dns_servers = ["1.1.1.1", "8.8.8.8"]\n\n    ip_configuration {\n      name  '},
     {'tag': 'IP_ADDRESS',
      'value': '8.8.8.8',
      'start': 34255,
      'end': 34262,
      'context': 'imary"\n    primary     = true\n    dns_servers = ["8.8.8.8"]\n\n    ip_configuration {\n      name      = "inte'},
     {'tag': 'IP_ADDRESS',
      'value': '8.8.8.8',
      'start': 23866,
      'end': 23873,
      'context': 'primary     = true\n    dns_servers = ["1.1.1.1", "8.8.8.8"]\n\n    ip_configuration {\n      name      = "inte'},
     {'tag': 'IP_ADDRESS',
      'value': '8.8.8.8',
      'start': 22803,
      'end': 22810,
      'context': 'ample"\n    primary     = true\n    dns_servers = ["8.8.8.8"]\n\n    ip_configuration {\n      name      = "inte'}"""

res150 = """[{'tag': 'IP_ADDRESS',
      'value': '4.0.0.0',
      'start': 988,
      'end': 995,
      'context': 'm.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")]\n  [global::System.Diagnostics.DebuggerNonUserC'}]"""

res327 = """    'FN': [{'tag': 'IP_ADDRESS',
      'value': '4.0.0.0',
      'start': 2269,
      'end': 2276,
      'context': 'GeneratedCodeAttribute("PresentationBuildTasks", "4.0.0.0")]\n        public void InitializeComponent() {\n  '},
     {'tag': 'IP_ADDRESS',
      'value': '4.0.0.0',
      'start': 2920,
      'end': 2927,
      'context': 'GeneratedCodeAttribute("PresentationBuildTasks", "4.0.0.0")]\n        [System.ComponentModel.EditorBrowsable'}]"""

res314 = """    'FN': [{'tag': 'KEY',
      'value': '12tvKAXCxZjSmdNbao16dKXC8tRWfcF5oc\\',
      'start': 17731,
      'end': 17766},
     {'tag': 'KEY',
      'value': '1LtvqCaApEdUGFkpKMM4MstjcaL4dKg8SP\\\\\\',
      'start': 9587,
      'end': 9624},
     {'tag': 'KEY',
      'value': '1LtvqCaApEdUGFkpKMM4MstjcaL4dKg8SP\\\\\\',
      'start': 9737,
      'end': 9774},
     {'tag': 'KEY',
      'value': '1PGFqEzfmQch1gKD3ra4k18PNj3tTUUSqg\\\\\\',
      'start': 9544,
      'end': 9581},
     {'tag': 'KEY',
      'value': '1PGFqEzfmQch1gKD3ra4k18PNj3tTUUSqg\\\\\\',
      'start': 9694,
      'end': 9731}]"""

In [157]:
def update_pii_3(example):
    from copy import deepcopy
    new_example = deepcopy(example)
    pii = json.loads(example["pii"])
    if example["id"] == 297:
        # add missing keys
        key_1 = {"tag": "IP_ADDRESS", "value": example["content"][27297:27305], "start": 27297, "end": 27305}
        key_2 = {"tag": "IP_ADDRESS", "value": example["content"][27337:27349], "start": 27337, "end": 27349}
        key_3 = {"tag": "IP_ADDRESS", "value": example["content"][27771:27779], "start": 27771, "end": 27779}
        key_4 = {"tag": "IP_ADDRESS", "value": example["content"][27811:27823], "start": 27811, "end": 27823}
        key_5 = {"tag": "IP_ADDRESS", "value": example["content"][28238:28246], "start": 28238, "end": 28246}
        key_6 = {"tag": "IP_ADDRESS", "value": example["content"][28278:28290], "start": 28278, "end": 28290}
        key_7 = {"tag": "IP_ADDRESS", "value": example["content"][29843:29851], "start": 29843, "end": 29851}
        key_8 = {"tag": "IP_ADDRESS", "value": example["content"][29883:29895], "start": 29883, "end": 29895}
        key_9 = {"tag": "IP_ADDRESS", "value": example["content"][30324:30332], "start": 30324, "end": 30332}
        key_10 = {"tag": "IP_ADDRESS", "value": example["content"][30364:30376], "start": 30364, "end": 30376}
        pii.extend([key_1, key_2, key_3, key_4, key_5, key_6, key_7, key_8, key_9, key_10])
        new_example["pii"] = json.dumps(pii)
        new_example["pii_modified"] = json.dumps(pii)
        print(f"new pii for {example['id']}: {pii}\n")

    elif example["id"] == 52:
        # remove key
        new_pii = [key for key in pii if (key["start"], key["end"]) != (42191, 42198)]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 89:
        # remove key
        new_pii = [key for key in pii if (key["start"], key["end"]) != (942, 949)]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 129:
        # remove all keys in there
        indexes = [(23855, 23862), (34255, 34262), (23866, 23873), (22803, 22810)]
        new_pii = [key for key in pii if (key["start"], key["end"]) not in indexes]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 150:
        # remove key
        new_pii = [key for key in pii if (key["start"], key["end"]) != (988, 995)]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 327:
        # remove key
        new_pii = [key for key in pii if (key["start"], key["end"]) not in [(2269, 2276), (2920, 2927)]]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    
    elif example["id"] == 314:
        # remove key
        indexes = [(17731, 17766), (9587, 9624), (9737, 9774), (9544, 9581), (9694, 9731)]
        new_pii = [key for key in pii if (key["start"], key["end"]) not in indexes]
        new_example["pii"] = json.dumps(new_pii)
        new_example["pii_modified"] = json.dumps(new_pii)
        print(f"new pii for {example['id']}: {new_pii}\n")
    return new_example        

In [158]:
new_ds_3 = new_ds_2.map(update_pii_3, load_from_cache_file=False)

  0%|          | 0/400 [00:00<?, ?ex/s]

new pii for 52: []

new pii for 89: []

new pii for 129: [{'tag': 'IP_ADDRESS', 'value': '10.0.0.0/16', 'start': 14710, 'end': 14721, 'context': 'ource_group.test.name}"\n  address_space       = ["10.0.0.0/16"]\n  location            = "${azurerm_resource_gro'}, {'tag': 'USERNAME', 'value': 'adminuser', 'start': 27671, 'end': 27680, 'context': ' instances           = 1\n  admin_username      = "adminuser"\n  admin_password      = "P@ssword1234!"\n\n  disab'}, {'tag': 'PASSWORD', 'value': 'P@ssword1234!', 'start': 35135, 'end': 35148, 'context': 'rname      = "adminuser"\n  admin_password      = "P@ssword1234!"\n\n  disable_password_authentication = false\n\n  so'}, {'tag': 'USERNAME', 'value': 'adminuser', 'start': 41753, 'end': 41762, 'context': ' instances           = 1\n  admin_username      = "adminuser"\n  admin_password      = "P@ssword1234!"\n\n  disab'}, {'tag': 'IP_ADDRESS', 'value': '10.0.0.0/24', 'start': 15018, 'end': 15029, 'context': 'ual_network.test.name}"\n  address_pr

In [159]:
new_ds_3.to_json("new_ds.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3720260