- Join mimesis, faker and dessi data to one dataset  
- Provide binary labels (personal/non-personal)  
- Ensure that classes with unique values are still unique

In [1]:
import pandas as pd
import numpy as np
import random
random.seed(42)

In [2]:
dessi_train = pd.read_csv("../dessi_unique/train.csv")
dessi_train_labels = pd.read_csv("../dessi_unique/train_labels.csv")
dessi_dev = pd.read_csv("../dessi_unique/dev.csv")
dessi_dev_labels = pd.read_csv("../dessi_unique/dev_labels.csv")
dessi_test = pd.read_csv("../dessi_unique/test.csv")
dessi_test_labels = pd.read_csv("../dessi_unique/test_labels.csv")

mimesis_train = pd.read_csv("../mimesis/train.csv")
mimesis_train_labels = pd.read_csv("../mimesis/train_labels.csv")
mimesis_train_classes = pd.read_csv("../mimesis/train_classes.csv")
mimesis_dev = pd.read_csv("../mimesis/dev.csv")
mimesis_dev_labels = pd.read_csv("../mimesis/dev_labels.csv")
mimesis_dev_classes = pd.read_csv("../mimesis/dev_classes.csv")
mimesis_test = pd.read_csv("../mimesis/test.csv")
mimesis_test_labels = pd.read_csv("../mimesis/test_labels.csv")
mimesis_test_classes = pd.read_csv("../mimesis/test_classes.csv")

faker_train = pd.read_csv("../faker/train.csv")
faker_train_labels = pd.read_csv("../faker/train_labels.csv")
faker_train_classes = pd.read_csv("../faker/train_classes.csv")
faker_dev = pd.read_csv("../faker/dev.csv")
faker_dev_labels = pd.read_csv("../faker/dev_labels.csv")
faker_dev_classes = pd.read_csv("../faker/dev_classes.csv")
faker_test = pd.read_csv("../faker/test.csv")
faker_test_labels = pd.read_csv("../faker/test_labels.csv")
faker_test_classes = pd.read_csv("../faker/test_classes.csv")

  mimesis_train = pd.read_csv("../mimesis/train.csv")
  faker_train = pd.read_csv("../faker/train.csv")


In [3]:
all_dessi_labels = pd.concat([dessi_train_labels, dessi_dev_labels, dessi_test_labels])

In [4]:
[a for a in all_dessi_labels["label"].value_counts().keys()]

['Phone_number',
 'Other_data',
 'NIN',
 'Date',
 'Geolocation',
 'Gender',
 'NIN,Date',
 'Phone_number,NIN',
 'NIN,Phone_number',
 'SWIFT/BIC',
 'Date,NIN',
 'NIN,Person',
 'NIN,Email',
 'Person',
 'Person,NIN',
 'Phone_number,Email',
 'Email,Phone_number',
 'Email,NIN',
 'Passport',
 'Organization,Person',
 'Person,Organization',
 'Person,Email',
 'IBAN',
 'Email,Person',
 'Religion',
 'Sexuality',
 'Nationality',
 'Email,Address',
 'Address,Email',
 'Address,Geolocation',
 'Geolocation,Address',
 'CCN',
 'ID_Card',
 'Race',
 'Organization',
 'Address',
 'GPE',
 'Email',
 'Address,Phone_number',
 'Organization,Phone_number',
 'Address,Person,Phone_number',
 'Person,Phone_number',
 'Organization,Address',
 'Address,Person',
 'Person,Date']

In [5]:
NON_PERSONAL = ["GPE", "Organization", "Date", "SWIFT/BIC"]
DROP = ["Other_data", "Address", "Person"]  # "Person" is dropped it can contain a full name but also only a surename

In [6]:
for a in all_dessi_labels["label"].unique():
    for d in DROP:
        if d in a and d != a:
            DROP.append(a)
DROP

['Other_data',
 'Address',
 'Person',
 'Person,Email',
 'NIN,Person',
 'Organization,Person',
 'Person,Organization',
 'Email,Person',
 'Person,NIN',
 'Geolocation,Address',
 'Email,Address',
 'Address,Email',
 'Address,Geolocation',
 'Address,Person',
 'Address,Person',
 'Address,Phone_number',
 'Address,Person,Phone_number',
 'Address,Person,Phone_number',
 'Address,Person,Phone_number',
 'Address,Person,Phone_number',
 'Person,Phone_number',
 'Organization,Address',
 'Person,Date']

In [7]:
for a in all_dessi_labels["label"].unique():
    if a not in DROP:
        for d in NON_PERSONAL:
            if d in a and d != a:
                print("Possible non-personal: ", a)
NON_PERSONAL

Possible non-personal:  NIN,Date
Possible non-personal:  Date,NIN
Possible non-personal:  Organization,Phone_number


['GPE', 'Organization', 'Date', 'SWIFT/BIC']

NIN,Date and Date,NIN are personal as they contain NIN, do not append it to NON-PERSONAL list

In [8]:
PERSONAL = set([a for a in all_dessi_labels["label"].value_counts().keys()]) - set(DROP) - set(NON_PERSONAL)
PERSONAL

{'CCN',
 'Date,NIN',
 'Email',
 'Email,NIN',
 'Email,Phone_number',
 'Gender',
 'Geolocation',
 'IBAN',
 'ID_Card',
 'NIN',
 'NIN,Date',
 'NIN,Email',
 'NIN,Phone_number',
 'Nationality',
 'Organization,Phone_number',
 'Passport',
 'Phone_number',
 'Phone_number,Email',
 'Phone_number,NIN',
 'Race',
 'Religion',
 'Sexuality'}

In [9]:
def prepare_dessi(data, labels):
    # label columns as personal, non-personal or drop it
    dropping = []
    classes = []
    for i in range(data.shape[1]):
        if labels.loc[i].values[0] in DROP:
            dropping.append(i)
            continue
        elif labels.loc[i].values[0] in PERSONAL:
            classes.append(labels.loc[i].values[0])
            labels.loc[i] = "personal"
        else:
            classes.append(labels.loc[i].values[0])
            labels.loc[i] = "non-personal"
    data = data.iloc[:, [i for i in range(data.shape[1]) if i not in dropping]]
    labels.drop(dropping, inplace=True)
    labels = labels.reset_index(drop=True)
    return data, labels, pd.DataFrame(classes).rename(columns={0: "class"})

In [10]:
dessi_train_prepared, dessi_train_labels_prepared, dessi_train_classes_prepared = prepare_dessi(dessi_train.copy(), dessi_train_labels.copy())
dessi_dev_prepared, dessi_dev_labels_prepared, dessi_dev_classes_prepared = prepare_dessi(dessi_dev.copy(), dessi_dev_labels.copy())
dessi_test_prepared, dessi_test_labels_prepared, dessi_test_classes_prepared = prepare_dessi(dessi_test.copy(), dessi_test_labels.copy())

In [11]:
def add_split_type(data, split_type):
    if data.shape[1] == 1:
        split_df = pd.DataFrame(data.shape[0] * [split_type])
        split_df.columns = ["split_type"]
        return pd.concat([data, split_df], axis=1).reset_index(drop=True)
    else:
        split_df = pd.DataFrame(data.shape[1] * [split_type]).T
        split_df.columns = data.columns
        return pd.concat([data, split_df]).reset_index(drop=True)
    
def add_split_type_equal(data):
    if data.shape[1] == 1:
        split_labels = round(data.shape[0] * 0.6) * ["train"] + round(data.shape[0] * 0.2) * ["dev"] + round(data.shape[0] * 0.2) * ["test"]
        if len(split_labels) > data.shape[0]:
            split_labels = split_labels[:data.shape[0]]
        elif len(split_labels) < data.shape[0]:
            split_labels += ["train"] * (data.shape[0] - len(split_labels))
        split_df = pd.DataFrame(split_labels)
        split_df.columns = ["split_type"]
        return pd.concat([data, split_df], axis=1).reset_index(drop=True)
    else:
        split_labels = round(data.shape[1] * 0.6) * ["train"] + round(data.shape[1] * 0.2) * ["dev"] + round(data.shape[1] * 0.2) * ["test"]
        if len(split_labels) > data.shape[1]:
            split_labels = split_labels[:data.shape[1]]
        elif len(split_labels) < data.shape[1]:
            split_labels += ["train"] * (data.shape[1] - len(split_labels))
        split_df = pd.DataFrame([split_labels])
        split_df.columns = data.columns
        return pd.concat([data, split_df]).reset_index(drop=True)

Add data of other datasets with respecting unique values for some classes

In [12]:
mimesis_check_unique = {# map mimesis attribute to dessi attribute to check uniqueness
    "email": ["Email", "personal"], 
    "phone_number": ["Phone_number", "personal"],
    "credit_card_number": ["CCN", "personal"]
}
faker_check_unique = {# map faker attribute to dessi and mimesis attribute to check uniqueness
    "address": [None, "address", "personal"],
    "iban": ["IBAN", None, "personal"],
    "swift": ["SWIFT/BIC", None, "non-personal"],
    "credit_card_number": ["CCN", "credit_card_number", "personal"],
    "email": ["Email", "email", "personal"],
    "name": [None, "full_name", "personal"],
    "phone_number": ["Phone_number", "phone_number", "personal"],
    "ssn": ["NIN", None, "personal"],
    "passport_number": ["Passport", None, "personal"],
    "current_location": ["Geolocation", None, "personal"],    
}

In [13]:
all_data = pd.concat([add_split_type(dessi_train_prepared, "train"), add_split_type(dessi_dev_prepared, "dev"), add_split_type(dessi_test_prepared, "test")], axis=1)
all_labels = pd.concat([add_split_type(dessi_train_labels_prepared, "train"), add_split_type(dessi_dev_labels_prepared, "dev"), 
                             add_split_type(dessi_test_labels_prepared, "test")]).reset_index(drop=True)
all_classes = pd.concat([add_split_type(dessi_train_classes_prepared, "train"), add_split_type(dessi_dev_classes_prepared, "dev"), 
                         add_split_type(dessi_test_classes_prepared, "test")]).reset_index(drop=True)

mimesis_data = pd.concat([add_split_type(mimesis_train, "train"), add_split_type(mimesis_dev, "dev"), add_split_type(mimesis_test, "test")], axis=1)
mimesis_labels = pd.concat([add_split_type(mimesis_train_labels, "train"), add_split_type(mimesis_dev_labels, "dev"), add_split_type(mimesis_test_labels, "test")]).reset_index(drop=True)
mimesis_classes = pd.concat([add_split_type(mimesis_train_classes, "train"), add_split_type(mimesis_dev_classes, "dev"), add_split_type(mimesis_test_classes, "test")]).reset_index(drop=True)

faker_data = pd.concat([add_split_type(faker_train, "train"), add_split_type(faker_dev, "dev"), add_split_type(faker_test, "test")], axis=1)
faker_labels = pd.concat([add_split_type(faker_train_labels, "train"), add_split_type(faker_dev_labels, "dev"), add_split_type(faker_test_labels, "test")]).reset_index(drop=True)
faker_classes = pd.concat([add_split_type(faker_train_classes, "train"), add_split_type(faker_dev_classes, "dev"), add_split_type(faker_test_classes, "test")]).reset_index(drop=True)

dessi_all = pd.concat([dessi_train_prepared, dessi_dev_prepared, dessi_test_prepared], axis=1)
dessi_all_classes = pd.concat([dessi_train_classes_prepared, dessi_dev_classes_prepared, dessi_test_classes_prepared]).reset_index(drop=True)
mimesis_all = pd.concat([mimesis_train, mimesis_dev, mimesis_test], axis=1)
mimesis_all_classes = pd.concat([mimesis_train_classes, mimesis_dev_classes, mimesis_test_classes]).reset_index(drop=True)

keep = []
for a in range(mimesis_data.shape[1]):
    if "mixed" in mimesis_classes.iloc[a].values[0]:
        val = mimesis_classes.iloc[a].values[0][:-6]
    else:
        val = mimesis_classes.iloc[a].values[0][:-3]
    if val not in mimesis_check_unique.keys():
        keep.append(a)
all_dataset = pd.concat([pd.DataFrame(all_data.shape[1] * ["dessi"]), pd.DataFrame(mimesis_data.iloc[:, keep].shape[1] * ["mimesis"])]).reset_index(drop=True).rename(columns={0: "dataset"})
all_data = pd.concat([all_data, mimesis_data.iloc[:, keep]], axis=1)
all_labels = pd.concat([all_labels, mimesis_labels.iloc[keep]]).reset_index(drop=True)
all_classes = pd.concat([all_classes, mimesis_classes.iloc[keep]]).reset_index(drop=True)

In [14]:
keep = []
for a in range(faker_data.shape[1]):
    if "mixed" in faker_classes.iloc[a].values[0] or "fr_FR" in faker_classes.iloc[a].values[0] or "de_DE" in faker_classes.iloc[a].values[0]:
        val = faker_classes.iloc[a].values[0][:-6]
    else:
        val = faker_classes.iloc[a].values[0][:-3]
    if val not in faker_check_unique.keys():
        keep.append(a)
all_dataset = pd.concat([all_dataset, pd.DataFrame(faker_data.iloc[:, keep].shape[1] * ["faker"]).rename(columns={0: "dataset"})]).reset_index(drop=True)
all_data = pd.concat([all_data, faker_data.iloc[:, keep]], axis=1)
all_labels = pd.concat([all_labels, faker_labels.iloc[keep]]).reset_index(drop=True)
all_classes = pd.concat([all_classes, faker_classes.iloc[keep]]).reset_index(drop=True)

In [15]:
for val in mimesis_check_unique.keys():
    check_set = set(dessi_all.iloc[:,[a for a in dessi_all_classes["class"].index if mimesis_check_unique[val][0] 
                      in dessi_all_classes["class"].loc[a]]].values.flatten())
    for lan in ["en", "de", "fr", "mixed"]:
        cols = []
        for a in range(mimesis_data.shape[1]):
            if f"{val}_{lan}" == mimesis_classes.iloc[a].values[0]:
                cols.append(a)
        value_set = set(mimesis_data.iloc[:100, cols].values.flatten())
        add_values = value_set - check_set
        try:
            add_values = sorted(add_values)
        except TypeError as e:
            add_values = sorted([str(a) for a in add_values])
        random.shuffle(add_values)
        add_values = add_values[:int(np.floor(len(add_values)/100))*100]
        add_df = pd.DataFrame(np.array(add_values).reshape(100, int(len(add_values)/100)))
        add_df.columns = mimesis_data.columns[cols[:add_df.shape[1]]]
        add_df = add_split_type_equal(add_df)
        all_data = pd.concat([all_data, add_df], axis=1)
        labels_add = pd.DataFrame([mimesis_check_unique[val][1]]*add_df.shape[1])
        labels_add.columns = ["label"]
        labels_add = add_split_type_equal(labels_add)
        all_labels = pd.concat([all_labels, labels_add], ignore_index=True)
        classes_add = pd.DataFrame([f"{val}_{lan}"]*add_df.shape[1])
        classes_add.columns = ["class"]
        classes_add = add_split_type_equal(classes_add)
        all_classes = pd.concat([all_classes, classes_add], ignore_index=True)
        all_dataset = pd.concat([all_dataset, pd.DataFrame(["mimesis"]*add_df.shape[1]).rename(columns={0: "dataset"})], ignore_index=True)
        
for val in faker_check_unique.keys():
    if faker_check_unique[val][0] != None:
        check_set = set(dessi_all.iloc[:,[a for a in dessi_all_classes["class"].index if faker_check_unique[val][0] 
                      in dessi_all_classes["class"].loc[a]]].values.flatten())
    else:
        check_set = set()
    if faker_check_unique[val][1] != None:
        s1 = set([a for a in mimesis_all_classes["class"] if faker_check_unique[val][1] in a])
        check_set2 = set(mimesis_all.iloc[:, mimesis_all_classes.index[mimesis_all_classes["class"].isin(s1)]].values.flatten())
        check_set = check_set.union(check_set2)
    for lan in ["en", "de_DE", "fr_FR", "mixed"]:
        cols = []
        for a in range(faker_data.shape[1]):
            if f"{val}_{lan}" == faker_classes.iloc[a].values[0]:
                cols.append(a)
        value_set = set(faker_data.iloc[:100, cols].values.flatten())
        add_values = value_set - check_set
        try:
            add_values = sorted(add_values)
        except TypeError as e:
            add_values = sorted([str(a) for a in add_values])
        random.shuffle(add_values)
        add_values = add_values[:int(np.floor(len(add_values)/100))*100]
        add_df = pd.DataFrame(np.array(add_values).reshape(100, int(len(add_values)/100)))
        add_df.columns = faker_data.columns[cols[:add_df.shape[1]]]
        add_df = add_split_type_equal(add_df)
        all_data = pd.concat([all_data, add_df], axis=1)
        labels_add = pd.DataFrame([faker_check_unique[val][2]]*add_df.shape[1])
        labels_add.columns = ["label"]
        labels_add = add_split_type_equal(labels_add)
        all_labels = pd.concat([all_labels, labels_add], ignore_index=True)
        classes_add = pd.DataFrame([f"{val}_{lan}"]*add_df.shape[1])
        classes_add.columns = ["class"]
        classes_add = add_split_type_equal(classes_add)
        all_classes = pd.concat([all_classes, classes_add], ignore_index=True)
        all_dataset = pd.concat([all_dataset, pd.DataFrame(["faker"]*add_df.shape[1]).rename(columns={0: "dataset"})], ignore_index=True)

In [16]:
def get_shuffled_splitted_data(data, labels, classes, dataset, split_type):
    data = data.iloc[:, [a for a in range(data.shape[1]) if data.iloc[100,a] == split_type]]
    labels = labels.loc[labels["split_type"] == split_type]
    classes = classes.loc[classes["split_type"] == split_type]
    dataset = dataset.loc[classes.loc[classes["split_type"] == split_type].index]
    data = data.sample(frac=1, axis=1, random_state=42).reset_index(drop=True)
    labels = labels.sample(frac=1, random_state=42).reset_index(drop=True)
    classes = classes.sample(frac=1, random_state=42).reset_index(drop=True)
    dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
    return data.iloc[:100,:], labels.iloc[:,:1], classes.iloc[:,:1], dataset

train_all, train_all_labels, train_all_classes, train_all_dataset = get_shuffled_splitted_data(all_data, all_labels, all_classes, all_dataset, "train")
train_all.to_csv("train.csv", index=False)
train_all_labels.to_csv("train_labels_personal.csv", index=False)
train_all_classes.to_csv("train_classes.csv", index=False)
train_all_dataset.to_csv("train_dataset.csv", index=False)

dev_all, dev_all_labels, dev_all_classes, dev_all_dataset = get_shuffled_splitted_data(all_data, all_labels, all_classes, all_dataset, "dev")
dev_all.to_csv("dev.csv", index=False)
dev_all_labels.to_csv("dev_labels_personal.csv", index=False)
dev_all_classes.to_csv("dev_classes.csv", index=False)
dev_all_dataset.to_csv("dev_dataset.csv", index=False)

test_all, test_all_labels, test_all_classes, test_all_dataset = get_shuffled_splitted_data(all_data, all_labels, all_classes, all_dataset, "test")
test_all.to_csv("test.csv", index=False)
test_all_labels.to_csv("test_labels_personal.csv", index=False)
test_all_classes.to_csv("test_classes.csv", index=False)
test_all_dataset.to_csv("test_dataset.csv", index=False)

Create Multiclass Labels

In [17]:
mapping_multiclass = {
    "ccn": "credit_card_number",
    "current_location": "longitude_and_latitude",
    "geolocation": "longitude_and_latitude",
    "name": "full_name",
    "isbn13": "isbn",
    "nin": "national_identification_number",   #rename some classes so that GPT can understand them
    "ssn": "national_identification_number",
    "pyfloat": "float_number",
    "pyint": "integer_number",
    "swift": "SWIFT/BIC code",
    "swift/bic": "SWIFT/BIC code",
    "address": "full_address",
    "ean": "EAN_code",
    "occupation": "job",
    "organization": "company",
    "organization,phone_number": "company,phone_number",
    "passport": "passport_number",
    "religion": "religion/worldview",   #values in these columns contain both classes
    "worldview": "religion/worldview",
    "academic_degree": "academic_degree/title",          
    "title": "academic_degree/title",
    "blood_type": "blood_group",
    "sex": "gender"
}

def convert_classes(data_classes):
    new_classes = []
    for i in data_classes["class"]:
        if "mixed" in i or "de_DE" in i or "fr_FR" in i:
            new_classes.append(i[:-6].lower())
        elif "_en" in i or "_de" in i or "_fr" in i:
            new_classes.append(i[:-3].lower())
        else:
            new_classes.append(i.lower())
    for i in range(len(new_classes)):
        for a in mapping_multiclass.keys():
            if (a == new_classes[i]) or (a in new_classes[i] and "," in new_classes[i]):
                new_classes[i] = new_classes[i].replace(a, mapping_multiclass[a])
    return pd.DataFrame(new_classes).rename(columns={0: "label"})

In [18]:
convert_classes(train_all_classes).to_csv("train_labels_multi.csv", index=False)
convert_classes(dev_all_classes).to_csv("dev_labels_multi.csv", index=False)
convert_classes(test_all_classes).to_csv("test_labels_multi.csv", index=False)