# Create binary PII labels (pii/non-pii) for the DeSSI-MF dataset

In [1]:
import pandas as pd

Load the train, dev and test splits whose classes will be mapped to pii/non-pii

In [2]:
def load_all_data(split):
    """Read the three CSV files that belong to one split.

    Parameters
    ----------
    split : str
        File-name prefix of the split; the files ``<split>.csv``,
        ``<split>_classes.csv`` and ``<split>_dataset.csv`` are read
        relative to the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        The frames read from the three files, in the order listed above.
    """
    csv_paths = (f"{split}.csv", f"{split}_classes.csv", f"{split}_dataset.csv")
    # Files are read in the listed order, so a missing file fails at the same point.
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

# Each call reads <split>.csv, <split>_classes.csv and <split>_dataset.csv, in that order.
train, train_classes, train_dataset = load_all_data(split="train")
dev, dev_classes, dev_dataset = load_all_data(split="dev")
test, test_classes, test_dataset = load_all_data(split="test")

  all_data_train = pd.read_csv(f"{split}.csv")


Map all labels to pii/non-pii

In [3]:
# Stack the class files in train -> dev -> test order under one fresh 0..n-1 index;
# later cells rely on this order to split the rows back into the three sets.
all_classes = pd.concat(
    [train_classes, dev_classes, test_classes],
    ignore_index=True,
)

In [4]:
# Collect the distinct base class names, i.e. the raw names from the first column
# with their language/locale suffix removed.
#
# The suffix has to be matched at the END of the name. A plain substring test also
# fires on names that merely contain the marker (for example "_de" inside
# "academic_degree") and would then cut off characters that belong to the name.
# NOTE(review): the previous code looked for the bare substring "mixed" but removed
# six characters, so the suffix is assumed to be "_mixed" - confirm against the data.
language_suffix_pattern = r"(?:_mixed|_de_DE|_fr_FR|_en|_fr|_de)$"

# Vectorised: at most one trailing suffix is stripped from every entry.
data_classes = set(
    all_classes.iloc[:, 0].str.replace(language_suffix_pattern, "", regex=True)
)
data_classes

{'CCN',
 'Date',
 'Date,NIN',
 'Email',
 'Email,NIN',
 'Email,Phone_number',
 'GPE',
 'Gender',
 'Geolocation',
 'IBAN',
 'ID_Card',
 'NIN',
 'NIN,Date',
 'NIN,Email',
 'NIN,Phone_number',
 'Nationality',
 'Organization',
 'Organization,Phone_number',
 'Passport',
 'Phone_number',
 'Phone_number,Email',
 'Phone_number,NIN',
 'Race',
 'Religion',
 'SWIFT/BIC',
 'Sexuality',
 'academic_degree',
 'address',
 'answer',
 'blood_group',
 'blood_type',
 'city',
 'color',
 'company',
 'cpu',
 'credit_card_number',
 'credit_card_provider',
 'currency',
 'current_location',
 'date',
 'dish',
 'drink',
 'duration',
 'ean',
 'email',
 'first_name',
 'float_number',
 'full_name',
 'gender',
 'graphics',
 'iban',
 'integer_number',
 'isbn',
 'isbn13',
 'job',
 'language',
 'last_name',
 'manufacturer',
 'measure_unit',
 'name',
 'nationality',
 'occupation',
 'passport_number',
 'phone_model',
 'phone_number',
 'political_views',
 'programming_language',
 'pyfloat',
 'pyint',
 'resolution',
 'sex',


In [5]:
# Hand-written lookup table: base class name (language suffix already removed)
# -> binary label, either "pii" or "non-pii". Keys are case-sensitive, so e.g.
# "Date" and "date" are separate entries.
manual_mapping = {
    "CCN": "pii",
    "Date": "non-pii",
    "Date,NIN": "pii",
    "Email": "pii",
    "Email,NIN": "pii",
    "Email,Phone_number": "pii",
    "GPE": "non-pii",
    "Gender": "non-pii",
    "Geolocation": "pii",
    "IBAN": "pii",
    "ID_Card": "pii",
    "NIN": "pii",
    "NIN,Date": "pii",
    "NIN,Email": "pii",
    "NIN,Phone_number": "pii",
    "Nationality": "non-pii",
    "Organization": "non-pii",
    "Organization,Phone_number": "pii",
    "Passport": "pii",
    "Phone_number": "pii",
    "Phone_number,Email": "pii",
    "Phone_number,NIN": "pii",
    "Race": "non-pii",
    "Religion": "non-pii",
    "SWIFT/BIC": "non-pii",
    "Sexuality": "non-pii",
    "academic_degree": "non-pii",
    "address": "pii",
    "answer": "non-pii",
    "blood_type": "non-pii",
    "blood_group": "non-pii",
    "city": "non-pii",
    "color": "non-pii",
    "company": "non-pii",
    "cpu": "non-pii",
    "credit_card_number": "pii",
    "credit_card_provider": "non-pii",
    "currency": "non-pii",
    "current_location": "pii",
    "date": "non-pii",
    "dish": "non-pii",
    "drink": "non-pii",
    "duration": "non-pii",
    "ean": "non-pii",
    "email": "pii",
    "first_name": "non-pii",
    "float_number": "non-pii",
    "full_name": "pii",
    "gender": "non-pii",
    "graphics": "non-pii",
    "iban": "pii",
    "integer_number": "non-pii",
    "isbn": "non-pii",
    "isbn13": "non-pii",
    "job": "non-pii",
    "language": "non-pii",
    "last_name": "non-pii",
    "manufacturer": "non-pii",
    "measure_unit": "non-pii",
    "name": "pii",
    "nationality": "non-pii",
    "occupation": "non-pii",
    "passport_number": "pii",
    "phone_model": "non-pii",
    "phone_number": "pii",
    "political_views": "non-pii",
    "programming_language": "non-pii",
    "pyfloat": "non-pii",
    "pyint": "non-pii",
    "resolution": "non-pii",
    "sex": "non-pii",
    "ssn": "pii",
    "swift": "non-pii",
    "system_quality_attribute": "non-pii",
    "title": "non-pii",
    "url": "non-pii",
    "user_agent": "non-pii",
    "version": "non-pii",
    "word": "non-pii",
    "worldview": "non-pii",
}

In [6]:
# Build `all_labels`: a copy of `all_classes` whose first column holds the binary
# label ("pii" / "non-pii") looked up in `manual_mapping` instead of the raw class.
#
# The language/locale suffix has to be matched at the END of the name. A plain
# substring test also fires on names that merely contain the marker (for example
# "_de" inside "academic_degree") and would then cut off part of the name itself.
# NOTE(review): the previous code looked for the bare substring "mixed" but removed
# six characters, so the suffix is assumed to be "_mixed" - confirm against the data.
language_suffix_pattern = r"(?:_mixed|_de_DE|_fr_FR|_en|_fr|_de)$"
base_classes = all_classes.iloc[:, 0].str.replace(language_suffix_pattern, "", regex=True)

# Series.map() would silently write NaN for an unknown class; keep failing loudly,
# as the plain dict lookup did, and name every offending class at once.
unmapped_classes = set(base_classes) - set(manual_mapping)
if unmapped_classes:
    raise KeyError(
        f"no pii/non-pii mapping for classes: {sorted(map(str, unmapped_classes))}"
    )

all_labels = all_classes.copy()
# One vectorised assignment instead of a per-row .iloc write; going through a NumPy
# array makes the write purely positional, independent of index labels.
all_labels.iloc[:, 0] = base_classes.map(manual_mapping).to_numpy()

In [7]:
# `all_labels` has its rows in train -> dev -> test order, so the row counts of the
# per-split class frames give the slice boundaries.
n_train_rows = train_classes.shape[0]
n_dev_rows = dev_classes.shape[0]
dev_end = n_train_rows + n_dev_rows

# The column read as "class" now carries the binary label, so rename it.
column_renames = {"class": "label"}
train_labels = all_labels.iloc[:n_train_rows].rename(columns=column_renames)
dev_labels = all_labels.iloc[n_train_rows:dev_end].rename(columns=column_renames)
test_labels = all_labels.iloc[dev_end:].rename(columns=column_renames)

In [8]:
# Write one label file per split, without the index column.
labels_by_output_file = {
    "train_labels_pii.csv": train_labels,
    "dev_labels_pii.csv": dev_labels,
    "test_labels_pii.csv": test_labels,
}
for output_file, split_labels in labels_by_output_file.items():
    split_labels.to_csv(output_file, index=False)