Collect ten random datasets from OpenML

In [1]:
import os
import pandas as pd
import json
import openml
import random

# OpenML

Get the ten most suitable datasets out of the twenty:  
- 41710, 41767, 41818 are all FOREX datasets, one is enough   
- 44326 is an image dataset  
- 802, 44682, 45652, 260, 754 lack a description needed to classify the columns  

In [2]:
# Fetch the catalogue of all datasets known to OpenML as a DataFrame.
dataset_list = openml.datasets.list_datasets(output_format='dataframe')

# Limit NumberOfFeatures to 25 to avoid intensive manual labeling.
has_few_features = dataset_list["NumberOfFeatures"] <= 25
dataset_filtered = dataset_list.loc[has_few_features]

# Draw 20 candidate ids with a fixed seed so the sample is reproducible ...
random.seed(42)
candidate_ids = dataset_filtered['did'].tolist()
random_dataset_ids = random.sample(candidate_ids, 20)

# ... then drop the ids that were rejected after manual inspection.
rejected_ids = {41710, 41767, 41818, 44326, 802, 44682, 45652, 719, 260, 754}
random_dataset_ids = list(set(random_dataset_ids) - rejected_ids)
random_dataset_ids

[41729, 44962, 43557, 42055, 854, 1016, 44058, 46075, 222, 287]

In [3]:
# Download every selected dataset and store it in a folder named after the
# dataset (created in the current working directory):
#   <name>/data.csv       - the dataset's table
#   <name>/metadata.json  - name, description and feature list as strings
ids = random_dataset_ids
# The loop variable is deliberately not called `id`: at notebook top level that
# would rebind the built-in id() for every cell executed afterwards.
for dataset_id in ids:
    dataset = openml.datasets.get_dataset(dataset_id)
    name = str(dataset.name)
    os.makedirs(name, exist_ok=True)
    # No target is passed to get_data(); presumably y is None in that case and
    # X already holds all columns (pd.concat drops None entries) - TODO confirm.
    X, y, _, _ = dataset.get_data()
    pd.concat([X, y], axis=1).to_csv(f"{name}/data.csv", index=False)
    # The keys (including the trailing ": ") are part of the stored file format.
    metadata_dict = {"Dataset Name: " : name,
                     "Description: ": str(dataset.description),
                     "Features: ": str(list(dataset.features.values()))
                     }
    with open(f"{name}/metadata.json", 'w') as file:
        json.dump(metadata_dict, file, indent=4)

Automatically label a column as personal-related if it is from the first ten datasets.  
In this context, personal-related data is data that can be used in combination with other data to identify a person.

In [57]:
def create_json_labels_personal():
    """Write an initial personal-data label file for every dataset folder.

    Scans the sub-directories of the current working directory. For each one
    that contains a ``data.csv``, writes ``<folder>/data.csv-labels_personal.json``
    mapping every column name, plus the extra key ``"overall"``, to
    ``"non-personal"``. The labels are only a starting point and are meant to
    be relabeled manually in the JSON file afterwards.

    Directories without a ``data.csv`` (for example Jupyter's
    ``.ipynb_checkpoints``) are skipped instead of raising FileNotFoundError.

    An existing label file is overwritten, so manual adjustments are lost if
    the function is run again.
    """
    csv_file = "data.csv"
    # First label everything as non-personal automatically; relabeling happens manually.
    lab = "non-personal"
    for folder in os.listdir("."):
        csv_path = os.path.join(folder, csv_file)
        # isfile() is False for plain files in "." and for folders lacking data.csv.
        if not os.path.isfile(csv_path):
            continue
        df = pd.read_csv(csv_path)
        columns_personal = {c: lab for c in df.columns}
        columns_personal["overall"] = lab
        with open(os.path.join(folder, f"{csv_file}-labels_personal.json"), 'w') as file:
            json.dump(columns_personal, file, indent=4)
            
#line to execute function is commented so that it is not always executed, labeling process only happens once
#create_json_labels_personal()

**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [2]:
# Convert each dataset's "<csv>-labels_personal.json" into "labels_personal.csv":
# a single "label" column with one row per JSON entry, in JSON key order
# (the column names themselves are not written).
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_csvs = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_csvs:
        # Not a dataset folder (e.g. .ipynb_checkpoints): skip instead of raising IndexError.
        continue
    csv_file = data_csvs[0]
    path = path_folder + "/" + csv_file + "-labels_personal.json"
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # One-row frame (keys as columns) -> transpose -> keys as rows, single column "label".
    labels_df = pd.DataFrame(labels_json, index=[0]).T.rename(columns={0: "label"})
    labels_df.to_csv(path_folder + "/labels_personal.csv", index=False)

Concatenate all datasets and labels for CASSED

In [3]:
# Build the combined inputs for CASSED from all dataset folders:
#   all_datasets.csv                  - first 100 rows of every dataset, side by side
#   all_datasets_labels_personal.csv  - the labels of all datasets stacked vertically
#   all_datasets_names.csv            - for every data column, the folder it came from
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
data_frames = []
label_frames = []
dataset_name = []
for folder in folders:
    path_folder = folder
    csv_names = [f for f in os.listdir(path_folder) if f.endswith('.csv')]
    data_csvs = [f for f in csv_names if 'labels' not in f]
    if not data_csvs:
        # Not a dataset folder (e.g. .ipynb_checkpoints): skip instead of raising IndexError.
        continue
    label_csvs = [f for f in csv_names if 'labels_personal' in f]
    if not label_csvs:
        raise FileNotFoundError(f"no labels_personal csv found in folder '{path_folder}'")
    csv_file = data_csvs[0]
    csv_file_label = label_csvs[0]
    df_add = pd.read_csv(path_folder + "/" + csv_file)
    data_frames.append(df_add.iloc[:100, :])
    add_dfs_labels = pd.read_csv(path_folder + "/" + csv_file_label)
    # Drop the last label row (the dataset-level "overall" entry) so only column labels remain.
    label_frames.append(add_dfs_labels.iloc[:-1])
    dataset_name += [folder] * df_add.shape[1]
# Concatenate once after the loop instead of growing the frames per iteration.
all_dfs = pd.concat(data_frames, axis=1) if data_frames else pd.DataFrame()
all_dfs_labels = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_personal.csv", index=False)
pd.DataFrame(dataset_name).rename(columns={0: "dataset"}).to_csv("all_datasets_names.csv", index=False)

## PII

Automatically label a column as PII if it contains only unique values.  
Afterwards, look through all .json files and check the column labeling.  
Only columns that can contain personally identifiable information on their own, without being combined with other information, are marked as PII. 

In [8]:

def create_json_labels_pii():
    """Write an initial PII label file for every dataset folder.

    Scans the sub-directories of the current working directory. In each one,
    the first ``.csv`` file whose name does not contain ``labels`` is taken as
    the data file. Its separator is guessed from the header line: ``,`` if the
    header has more commas than semicolons, otherwise ``;``.

    Writes ``<folder>/<csv_file>-labels_pii.json`` mapping every column name to
    ``"non-pii"`` and the extra key ``"overall"`` to ``"pii"``. No per-column
    uniqueness check is performed here; the labels are a starting point to be
    adjusted manually in the JSON file.

    Directories without a data CSV (for example Jupyter's
    ``.ipynb_checkpoints``) are skipped instead of raising IndexError.

    An existing label file is overwritten, so manual adjustments are lost if
    the function is run again.
    """
    folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
    for folder in folders:
        data_csvs = [f for f in os.listdir(folder) if f.endswith('.csv') and 'labels' not in f]
        if not data_csvs:
            continue
        csv_file = data_csvs[0]
        csv_path = folder + "/" + csv_file
        # Only the header line is inspected to choose between "," and ";".
        with open(csv_path, 'r') as file:
            first_line = file.readline()
        sep = "," if first_line.count(',') > first_line.count(';') else ";"
        df = pd.read_csv(csv_path, sep=sep)
        columns_pii = {c: "non-pii" for c in df.columns}
        columns_pii["overall"] = "pii"
        with open(f'{folder}/{csv_file}-labels_pii.json', 'w') as file:
            json.dump(columns_pii, file, indent=4)
#line to execute function is commented so that it is not always executed, labeling process only happens once
#create_json_labels_pii()

**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [4]:
# Convert each dataset's "<csv>-labels_pii.json" into "labels_pii.csv":
# a single "label" column with one row per JSON entry, in JSON key order
# (the column names themselves are not written).
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_csvs = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_csvs:
        # Not a dataset folder (e.g. .ipynb_checkpoints): skip instead of raising IndexError.
        continue
    csv_file = data_csvs[0]
    path = path_folder + "/" + csv_file + "-labels_pii.json"
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # One-row frame (keys as columns) -> transpose -> keys as rows, single column "label".
    labels_df = pd.DataFrame(labels_json, index=[0]).T.rename(columns={0: "label"})
    labels_df.to_csv(path_folder + "/labels_pii.csv", index=False)

Concatenate all datasets and labels for CASSED

In [5]:
# Build the combined PII inputs for CASSED from all dataset folders:
#   all_datasets.csv             - first 100 rows of every dataset, side by side
#   all_datasets_labels_pii.csv  - the labels of all datasets stacked vertically
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
data_frames = []
label_frames = []
for folder in folders:
    path_folder = folder
    csv_names = [f for f in os.listdir(path_folder) if f.endswith('.csv')]
    data_csvs = [f for f in csv_names if 'labels' not in f]
    if not data_csvs:
        # Not a dataset folder (e.g. .ipynb_checkpoints): skip instead of raising IndexError.
        continue
    label_csvs = [f for f in csv_names if 'labels_pii' in f]
    if not label_csvs:
        raise FileNotFoundError(f"no labels_pii csv found in folder '{path_folder}'")
    csv_file = data_csvs[0]
    csv_file_label = label_csvs[0]

    # Guess the separator from the header line: "," only if commas outnumber semicolons.
    with open(path_folder + "/" + csv_file, 'r') as file:
        first_line = file.readline()
    sep = "," if first_line.count(',') > first_line.count(';') else ";"
    df_add = pd.read_csv(path_folder + "/" + csv_file, sep=sep)

    data_frames.append(df_add.iloc[:100, :])
    add_dfs_labels = pd.read_csv(path_folder + "/" + csv_file_label)
    # Drop the last label row (the dataset-level "overall" entry) so only column labels remain.
    label_frames.append(add_dfs_labels.iloc[:-1])
# Concatenate once after the loop instead of growing the frames per iteration.
all_dfs = pd.concat(data_frames, axis=1) if data_frames else pd.DataFrame()
all_dfs_labels = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_pii.csv", index=False)