Collect datasets that are based on real-world data and are personal-related.  
Search for specific personal-related categories (credit card, insurance, ...).

In [1]:
import os
import pandas as pd
import json
import openml

# OpenML

In [2]:
# Preview a single OpenML dataset (id 42371): print its name and description
# and display the features together with the target.
dataset = openml.datasets.get_dataset(42371)
name = str(dataset.name)
X, y, _, _ = dataset.get_data()
print("Dataset Name: " + name)
# str() keeps the cell from failing with a TypeError if the description is not a string
print("Description: " + str(dataset.description))
# axis=1 puts the target next to the features as an extra column (same as the
# export loop below) instead of stacking it underneath them as extra rows;
# pd.concat silently drops y when it is None
pd.concat([X, y], axis=1)

Dataset Name: CSM
Description: Conventional and Social Media Movies (CSM) - Dataset 2014 and 2015 Data Set



12 features categorized as conventional and social media features. Both conventional features, collected from movies databases on Web as well as social media features(YouTube,Twitter).


Unnamed: 0,Year,Ratings,Genre,Gross,Budget,Screens,Sequel,Sentiment,Views,Likes,Dislikes,Comments,Aggregate.Followers
0,2014.0,6.3,8,9130.0,4000000.0,45.0,1,0.0,3280543.0,4632.0,425.0,636.0,1120000.0
1,2014.0,7.1,1,192000000.0,50000000.0,3306.0,2,2.0,583289.0,3465.0,61.0,186.0,12350000.0
2,2014.0,6.2,1,30700000.0,28000000.0,2872.0,1,0.0,304861.0,328.0,34.0,47.0,483000.0
3,2014.0,6.3,1,106000000.0,110000000.0,3470.0,2,0.0,452917.0,2429.0,132.0,590.0,568000.0
4,2014.0,4.7,8,17300000.0,3500000.0,2310.0,2,0.0,3145573.0,12163.0,610.0,1082.0,1923800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,2015.0,6.4,4,1210000.0,50000000.0,66.0,1,4.0,3701061.0,9325.0,641.0,1859.0,
227,2015.0,5.5,15,21000000.0,37000000.0,2815.0,1,13.0,7119456.0,18803.0,1128.0,2290.0,
228,2015.0,5.4,8,10200000.0,35000000.0,2777.0,1,7.0,3450614.0,6823.0,325.0,409.0,
229,2015.0,5.4,1,12300000.0,3000000.0,,1,10.0,66872.0,400.0,67.0,201.0,


In [3]:
# Download every listed OpenML dataset and store it in a folder named after the
# dataset: "<name>/data.csv" (features plus target) and "<name>/metadata.json".
ids = [45929, 46105, 46103, 46087, 43743,       #personal data
       46382, 46381, 46351, 45934, 42371]               #non-personal data
# the loop variable is not called `id`, so the builtin id() stays usable in later cells
for dataset_id in ids:
    dataset = openml.datasets.get_dataset(dataset_id)
    name = str(dataset.name)
    os.makedirs(name, exist_ok=True)
    X, y, _, _ = dataset.get_data()
    # features and target side by side, written without the row index
    pd.concat([X, y], axis=1).to_csv(os.path.join(name, "data.csv"), index=False)
    metadata_dict = {"Dataset Name: " : name,
                     "Description: ": str(dataset.description),
                     "Features: ": str(list(dataset.features.values()))
                     }
    with open(os.path.join(name, "metadata.json"), 'w') as file:
        json.dump(metadata_dict, file, indent=4)

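Automatically label every column as personal-related if it comes from one of the first five datasets (the ones marked as personal data above); the columns of all other datasets are labelled as non-personal.  
In this context, personal-related data is data that can be used in combination with other data to identify a person.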

In [75]:
def create_json_labels_personal():
    """Write an initial ``data.csv-labels_personal.json`` into every dataset folder.

    Every sub-directory of the current working directory that contains a
    ``data.csv`` is treated as a dataset folder; directories without that file
    are skipped. All columns of a dataset, plus an additional ``"overall"``
    entry, receive the same label: ``"personal"`` if the folder name is in the
    hard-coded list below, ``"non-personal"`` otherwise.

    The generated files are only a starting point and are meant to be relabelled
    by hand afterwards. Calling the function again overwrites existing label
    files, including any manual changes made to them.
    """
    csv_file = "data.csv"
    personal_folders = ["Amazon_Prime_Fiction", "DATASETBANK", "FitBit_HeartRate",
                        "TVS_Loan_Default", "Oilst_Customers_Dataset"]
    folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
    for folder in folders:
        csv_path = os.path.join(folder, csv_file)
        if not os.path.isfile(csv_path):
            # not a dataset folder, nothing to label
            continue
        # only the column names are needed; low_memory=False parses each column in
        # one pass, which avoids mixed-dtype warnings on large files
        df = pd.read_csv(csv_path, low_memory=False)
        #first label everything as personal automatically and then relabel them afterwards manually in the json file
        if folder in personal_folders:
            lab = "personal"
        else:
            lab = "non-personal"
        columns_personal = {c: lab for c in df.columns}
        columns_personal["overall"] = lab
        with open(os.path.join(folder, f"{csv_file}-labels_personal.json"), 'w') as file:
            json.dump(columns_personal, file, indent=4)

#line to execute function is commented so that it is not always executed, labeling process only happens once
#create_json_labels_personal()

  df = pd.read_csv(path + "/" + csv_file)


**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [4]:
# Convert each "<data file>-labels_personal.json" into a one-column
# "labels_personal.csv" (header "label", one row per JSON entry, in JSON order).
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # directory without a data CSV: not a dataset folder, skip it instead of
        # failing with an IndexError
        continue
    csv_file = data_files[0]
    path = os.path.join(path_folder, csv_file + "-labels_personal.json")
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # one-row frame (columns = JSON keys), transposed so every key becomes a row
    labels = pd.DataFrame(labels_json, index=[0]).T.rename(columns={0: "label"})
    # index=False drops the key names; only the label values are written
    labels.to_csv(os.path.join(path_folder, "labels_personal.csv"), index=False)

Concatenate all datasets and labels for CASSED

In [5]:
# Build the combined files from all dataset folders:
#   all_datasets.csv                  - first 100 rows of every dataset, side by side
#   all_datasets_labels_personal.csv  - the per-column labels, stacked in the same order
#   all_datasets_names.csv            - the folder name repeated once per data column
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
dfs_to_join = []
labels_to_join = []
dataset_name = []
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # directory without a data CSV: not a dataset folder, skip it instead of
        # failing with an IndexError
        continue
    csv_file = data_files[0]
    csv_file_label = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels_personal' in f][0]
    df_add = pd.read_csv(path_folder + "/" + csv_file)
    dfs_to_join.append(df_add.iloc[:100, :])
    add_dfs_labels = pd.read_csv(path_folder + "/" + csv_file_label)
    # the last label row is dropped: it is the "overall" entry, which has no data column
    labels_to_join.append(add_dfs_labels.iloc[:-1])
    dataset_name += [folder] * df_add.shape[1]
# concatenate once after the loop instead of re-copying the growing frames on
# every iteration; pd.concat raises on an empty list, hence the fallbacks
all_dfs = pd.concat(dfs_to_join, axis=1) if dfs_to_join else pd.DataFrame()
all_dfs_labels = pd.concat(labels_to_join).reset_index(drop=True) if labels_to_join else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_personal.csv", index=False)
pd.DataFrame(dataset_name).rename(columns={0: "dataset"}).to_csv("all_datasets_names.csv", index=False)

  df_add = pd.read_csv(path_folder + "/" + csv_file)


## PII

Automatically label the column as pii if it only contains unique values.  
Afterwards, look through all .json files and check the column labeling.  
Only columns that can contain personally identifiable information without being combined with other information are marked as pii.

In [6]:
def create_json_labels_pii():
    """Write an initial ``<data file>-labels_pii.json`` into every dataset folder.

    For every sub-directory of the current working directory, the first ``.csv``
    file whose name does not contain ``labels`` is taken as the data file;
    directories without such a file are skipped. Every column of that file, plus
    an additional ``"overall"`` entry, is labelled ``"non-pii"``.

    The generated files are only a starting point and are meant to be relabelled
    by hand afterwards. Calling the function again overwrites existing label
    files, including any manual changes made to them.

    NOTE(review): the surrounding notebook text describes labelling columns with
    only unique values as pii; this function does not do that, it assigns
    "non-pii" to every column - confirm which behaviour is intended.
    """
    folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
    for folder in folders:
        path = folder
        data_files = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f]
        if not data_files:
            # not a dataset folder, nothing to label
            continue
        csv_file = data_files[0]
        csv_path = path + "/" + csv_file
        # guess the delimiter from the header line: whichever of "," and ";"
        # occurs more often wins, ";" is used on a tie
        with open(csv_path, 'r') as file:
            first_line = file.readline()
        sep = "," if first_line.count(',') > first_line.count(';') else ";"
        # only the column names are needed; low_memory=False parses each column in
        # one pass, which avoids mixed-dtype warnings on large files
        df = pd.read_csv(csv_path, sep=sep, low_memory=False)
        columns_personal = {c: "non-pii" for c in df.columns}
        columns_personal["overall"] = "non-pii"
        with open(f'{folder}/{csv_file}-labels_pii.json', 'w') as file:
            json.dump(columns_personal, file, indent=4)

#line to execute function is commented so that it is not always executed, labeling process only happens once
#create_json_labels_pii()

  df = pd.read_csv(path + "/" + csv_file, sep=sep)


**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [6]:
# Convert each "<data file>-labels_pii.json" into a one-column "labels_pii.csv"
# (header "label", one row per JSON entry, in JSON order).
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # directory without a data CSV: not a dataset folder, skip it instead of
        # failing with an IndexError
        continue
    csv_file = data_files[0]
    path = os.path.join(path_folder, csv_file + "-labels_pii.json")
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # one-row frame (columns = JSON keys), transposed so every key becomes a row
    labels = pd.DataFrame(labels_json, index=[0]).T.rename(columns={0: "label"})
    # index=False drops the key names; only the label values are written
    labels.to_csv(os.path.join(path_folder, "labels_pii.csv"), index=False)

Concatenate all datasets and labels for CASSED

In [7]:
# Build the combined files from all dataset folders:
#   all_datasets.csv             - first 100 rows of every dataset, side by side
#   all_datasets_labels_pii.csv  - the per-column pii labels, stacked in the same order
# NOTE(review): "all_datasets.csv" is also written by the personal-label
# concatenation cell, so this cell overwrites that file - confirm this is intended.
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
dfs_to_join = []
labels_to_join = []
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # directory without a data CSV: not a dataset folder, skip it instead of
        # failing with an IndexError
        continue
    csv_file = data_files[0]
    csv_file_label = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels_pii' in f][0]

    # guess the delimiter from the header line: whichever of "," and ";" occurs
    # more often wins, ";" is used on a tie
    with open(path_folder + "/" + csv_file, 'r') as file:
        first_line = file.readline()
    sep = "," if first_line.count(',') > first_line.count(';') else ";"
    df_add = pd.read_csv(path_folder + "/" + csv_file, sep=sep)

    dfs_to_join.append(df_add.iloc[:100, :])
    add_dfs_labels = pd.read_csv(path_folder + "/" + csv_file_label)
    # the last label row is dropped: it is the "overall" entry, which has no data column
    labels_to_join.append(add_dfs_labels.iloc[:-1])
# concatenate once after the loop instead of re-copying the growing frames on
# every iteration; pd.concat raises on an empty list, hence the fallbacks
all_dfs = pd.concat(dfs_to_join, axis=1) if dfs_to_join else pd.DataFrame()
all_dfs_labels = pd.concat(labels_to_join).reset_index(drop=True) if labels_to_join else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_pii.csv", index=False)

  df_add = pd.read_csv(path_folder + "/" + csv_file, sep=sep)
