Collect datasets that are based on real-world data and are personal-related.  
Search for specific personal-related categories (credit card, insurance, ...).

In [1]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import json

# Kaggle

In [2]:
# Authenticate with the Kaggle API; a token is needed!
# `api` is the notebook-wide client that download_dataset() uses.
api = KaggleApi()
api.authenticate()

In [3]:
def download_dataset(link, path):
    """Download one Kaggle dataset plus its metadata into a local folder.

    Parameters
    ----------
    link : str
        Dataset reference handed to the Kaggle client (the calls in this
        notebook use the form "owner/dataset-slug").
    path : str
        Target directory; forwarded unchanged to both API calls.

    Relies on the notebook-level, already authenticated ``api`` client.
    """
    target_dir = path
    # Fetch the dataset files and unpack the archive in place.
    api.dataset_download_files(
        link,
        path=target_dir,
        unzip=True,
    )
    # Request the dataset's metadata for the same target directory.
    api.dataset_metadata(
        link,
        path=target_dir,
    )

There are multiple options to filter/search for datasets; see the Kaggle API docs. The filter options are the same as on the website.

In [15]:
# Each entry: (Kaggle dataset reference, target folder, extracted files to delete afterwards).
# Entries are processed strictly in the order listed.
DATASETS = [
    # 5 Datasets with PII Information in single columns
    ("kewagbln/absenteeism-at-work-uci-ml-repositiory", "absenteeism/", []),
    ('yasserh/titanic-dataset', 'titanic/', []),
    ("mohansacharya/graduate-admissions", "graduate_admissions/", [
        "graduate_admissions/Admission_Predict_Ver1.1.csv",
    ]),
    ("redwankarimsony/heart-disease-data", "heart_disease/", []),
    ("naserabdullahalam/phishing-email-dataset", "phishing_email/", [
        "phishing_email/Enron.csv",
        "phishing_email/Ling.csv",
        "phishing_email/Nazario.csv",
        "phishing_email/Nigerian_Fraud.csv",
        "phishing_email/SpamAssasin.csv",
        "phishing_email/phishing_email.csv",
    ]),

    # 5 Datasets without PII Information in single columns but the whole dataset contains PII as combined information
    ("priyamchoksi/adult-census-income-dataset", "adult_census/", []),
    ("volodymyrgavrysh/bank-marketing-campaigns-dataset", "bank_marketing/", []),
    ("dskagglemt/student-performance-data-set", "student_performance/", [
        "student_performance/student-por.csv",
    ]),
    ("alakaaay/diabetes-uci-dataset", "diabetes/", []),
    ("uciml/indian-liver-patient-records", "indian_liver/", []),

    # 5 Datasets without PII Information in single columns and the whole dataset does not contain PII in combination
    ("rummagelabs/pixar-movies", "pixar/", []),
    ("jakewright/house-price-data", "house_price/", []),
    ("rowhitswami/all-indian-companies-registration-data-1900-2019", "indian_companies/", []),
    ("imtkaggleteam/agriculture-dataset-karnataka", "agriculture/", []),
    ("mohitkumar282/used-car-dataset", "used_car/", []),
]

for kaggle_ref, target_folder, unwanted_files in DATASETS:
    download_dataset(kaggle_ref, target_folder)
    # Delete the extra extracted files so that a single data CSV stays in the folder.
    for unwanted in unwanted_files:
        os.remove(unwanted)

Dataset URL: https://www.kaggle.com/datasets/kewagbln/absenteeism-at-work-uci-ml-repositiory
Dataset URL: https://www.kaggle.com/datasets/yasserh/titanic-dataset
Dataset URL: https://www.kaggle.com/datasets/mohansacharya/graduate-admissions
Dataset URL: https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data
Dataset URL: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset
Dataset URL: https://www.kaggle.com/datasets/priyamchoksi/adult-census-income-dataset
Dataset URL: https://www.kaggle.com/datasets/volodymyrgavrysh/bank-marketing-campaigns-dataset
Dataset URL: https://www.kaggle.com/datasets/dskagglemt/student-performance-data-set
Dataset URL: https://www.kaggle.com/datasets/alakaaay/diabetes-uci-dataset
Dataset URL: https://www.kaggle.com/datasets/uciml/indian-liver-patient-records
Dataset URL: https://www.kaggle.com/datasets/rummagelabs/pixar-movies
Dataset URL: https://www.kaggle.com/datasets/jakewright/house-price-data
Dataset URL: https://www.kagg

## Personal 

Automatically label every column as personal-related if it comes from one of the first ten datasets.  
In this context, personal-related data is data that can be combined with other data to identify a person.

In [29]:
def create_json_labels_personal(
    non_personal_folders=("pixar", "house_price", "indian_companies", "agriculture", "used_car"),
):
    """Write an initial personal / non-personal label file for every dataset folder.

    For each sub-directory of the current working directory that contains a
    data CSV (a ``.csv`` file without ``labels`` in its name), a JSON file
    ``<folder>/<csv name>-labels_personal.json`` is written. It maps every
    column name, plus the extra key ``"overall"``, to one label for the whole
    dataset.

    Parameters
    ----------
    non_personal_folders : iterable of str, optional
        Name fragments of folders whose datasets are labeled "non-personal"
        (substring match on the folder name). Every other dataset is labeled
        "personal". The default lists the five folders of the third download
        group, so that the first ten datasets are labeled personal.

    Notes
    -----
    Existing label files are overwritten, so manual adjustments are lost when
    this is run again.
    """
    folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
    for folder in folders:
        data_files = [f for f in os.listdir(folder) if f.endswith('.csv') and 'labels' not in f]
        if not data_files:
            # Not a dataset folder (e.g. a notebook checkpoint directory): nothing to label.
            continue
        csv_file = data_files[0]
        csv_path = os.path.join(folder, csv_file)

        # Guess the delimiter from the header line; a tie falls back to ";".
        with open(csv_path, 'r') as file:
            first_line = file.readline()
        sep = "," if first_line.count(',') > first_line.count(';') else ";"

        # Only the column names are needed, so parse just the header row.
        columns = pd.read_csv(csv_path, sep=sep, nrows=0).columns

        if any(a in folder for a in non_personal_folders):
            lab = "non-personal"
        else:
            lab = "personal"

        columns_personal = {c: lab for c in columns}
        columns_personal["overall"] = lab
        with open(f'{folder}/{csv_file}-labels_personal.json', 'w') as file:
            json.dump(columns_personal, file, indent=4)

# The call below is intentionally commented out: labeling only has to happen once, so it should not run on every execution.
#create_json_labels_personal()

  df = pd.read_csv(path + "/" + csv_file, sep=sep)


**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [7]:
# Convert every "<csv name>-labels_personal.json" into "<folder>/labels_personal.csv":
# a single "label" column with one row per JSON entry, in the order stored in the JSON file.
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # Not a dataset folder (e.g. a notebook checkpoint directory): skip instead of failing on [0].
        continue
    csv_file = data_files[0]
    path = os.path.join(path_folder, csv_file + "-labels_personal.json")
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # Only the label values are written; the column names (JSON keys) are dropped.
    labels_frame = pd.DataFrame({"label": list(labels_json.values())})
    labels_frame.to_csv(os.path.join(path_folder, "labels_personal.csv"), index=False)

Concatenate all datasets and labels for CASSED

In [8]:
# Build the combined CASSED inputs for the personal labels:
#   all_datasets.csv                 - first 100 rows of every dataset, placed side by side
#   all_datasets_labels_personal.csv - one label per column, in the same column order
#   all_datasets_names.csv           - source folder of every column, in the same column order
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
sample_frames = []
label_frames = []
dataset_name = []
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # Not a dataset folder (e.g. a notebook checkpoint directory): skip instead of failing on [0].
        continue
    csv_file = data_files[0]
    csv_file_label = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels_personal' in f][0]

    # Guess the delimiter from the header line; a tie falls back to ";".
    csv_path = os.path.join(path_folder, csv_file)
    with open(csv_path, 'r') as file:
        first_line = file.readline()
    sep = "," if first_line.count(',') > first_line.count(';') else ";"
    df_add = pd.read_csv(csv_path, sep=sep)

    add_dfs_labels = pd.read_csv(os.path.join(path_folder, csv_file_label))
    # The last row of the label file is dropped here (the generator writes the dataset-level "overall" entry last).
    column_labels = add_dfs_labels.iloc[:-1]
    if len(column_labels) != df_add.shape[1]:
        # Manually edited label files can drift out of sync; misaligned labels would be silently wrong.
        raise ValueError(
            f"{folder}: {len(column_labels)} column labels for {df_add.shape[1]} columns in {csv_file}"
        )

    sample_frames.append(df_add.iloc[:100, :])
    label_frames.append(column_labels)
    dataset_name += [folder] * df_add.shape[1]

# Concatenate once at the end instead of growing the frames inside the loop.
all_dfs = pd.concat(sample_frames, axis=1) if sample_frames else pd.DataFrame()
all_dfs_labels = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_personal.csv", index=False)
pd.DataFrame(dataset_name).rename(columns={0: "dataset"}).to_csv("all_datasets_names.csv", index=False)

  df_add = pd.read_csv(path_folder + "/" + csv_file, sep=sep)


## PII

Automatically label a column as PII if it contains only unique values.  
Afterwards, look through all .json files and check the column labeling.  
Only columns that can contain personally identifiable information on their own, without being combined with other information, are marked as PII.

In [None]:
def create_json_labels_pii():
    """Write an initial PII label file for every dataset folder.

    For each sub-directory of the current working directory that contains a
    data CSV (a ``.csv`` file without ``labels`` in its name), a JSON file
    ``<folder>/<csv name>-labels_pii.json`` is written. Every column name is
    mapped to "non-pii" and the extra key ``"overall"`` is mapped to "pii".

    Notes
    -----
    NOTE(review): no value-based heuristic (such as a uniqueness check) is
    applied here; the file is only a starting template that has to be
    reviewed by hand. Confirm whether an automatic rule was intended.

    Existing label files are overwritten, so manual adjustments are lost when
    this is run again.
    """
    folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
    for folder in folders:
        data_files = [f for f in os.listdir(folder) if f.endswith('.csv') and 'labels' not in f]
        if not data_files:
            # Not a dataset folder (e.g. a notebook checkpoint directory): nothing to label.
            continue
        csv_file = data_files[0]
        csv_path = os.path.join(folder, csv_file)

        # Guess the delimiter from the header line; a tie falls back to ";".
        with open(csv_path, 'r') as file:
            first_line = file.readline()
        sep = "," if first_line.count(',') > first_line.count(';') else ";"

        # Only the column names are needed, so parse just the header row.
        columns = pd.read_csv(csv_path, sep=sep, nrows=0).columns

        columns_pii = {c: "non-pii" for c in columns}
        columns_pii["overall"] = "pii"
        with open(f'{folder}/{csv_file}-labels_pii.json', 'w') as file:
            json.dump(columns_pii, file, indent=4)
            
# The call below is intentionally commented out: labeling only has to happen once, so it should not run on every execution.
#create_json_labels_pii()

**SOME LABELS WERE ADJUSTED MANUALLY AFTERWARDS IN THE JSON FILES**

Convert the labels into a suitable csv format

In [9]:
# Convert every "<csv name>-labels_pii.json" into "<folder>/labels_pii.csv":
# a single "label" column with one row per JSON entry, in the order stored in the JSON file.
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # Not a dataset folder (e.g. a notebook checkpoint directory): skip instead of failing on [0].
        continue
    csv_file = data_files[0]
    path = os.path.join(path_folder, csv_file + "-labels_pii.json")
    with open(path, 'r') as file:
        labels_json = json.load(file)
    # Only the label values are written; the column names (JSON keys) are dropped.
    labels_frame = pd.DataFrame({"label": list(labels_json.values())})
    labels_frame.to_csv(os.path.join(path_folder, "labels_pii.csv"), index=False)

Concatenate all datasets and labels for CASSED

In [10]:
# Build the combined CASSED inputs for the PII labels:
#   all_datasets.csv            - first 100 rows of every dataset, placed side by side
#   all_datasets_labels_pii.csv - one label per column, in the same column order
folders = [name for name in os.listdir(".") if os.path.isdir(os.path.join(".", name))]
sample_frames = []
label_frames = []
for folder in folders:
    path_folder = folder
    data_files = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels' not in f]
    if not data_files:
        # Not a dataset folder (e.g. a notebook checkpoint directory): skip instead of failing on [0].
        continue
    csv_file = data_files[0]
    csv_file_label = [f for f in os.listdir(path_folder) if f.endswith('.csv') and 'labels_pii' in f][0]

    # Guess the delimiter from the header line; a tie falls back to ";".
    csv_path = os.path.join(path_folder, csv_file)
    with open(csv_path, 'r') as file:
        first_line = file.readline()
    sep = "," if first_line.count(',') > first_line.count(';') else ";"
    df_add = pd.read_csv(csv_path, sep=sep)

    add_dfs_labels = pd.read_csv(os.path.join(path_folder, csv_file_label))
    # The last row of the label file is dropped here (the generator writes the dataset-level "overall" entry last).
    column_labels = add_dfs_labels.iloc[:-1]
    if len(column_labels) != df_add.shape[1]:
        # Manually edited label files can drift out of sync; misaligned labels would be silently wrong.
        raise ValueError(
            f"{folder}: {len(column_labels)} column labels for {df_add.shape[1]} columns in {csv_file}"
        )

    sample_frames.append(df_add.iloc[:100, :])
    label_frames.append(column_labels)

# Concatenate once at the end instead of growing the frames inside the loop.
all_dfs = pd.concat(sample_frames, axis=1) if sample_frames else pd.DataFrame()
all_dfs_labels = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
all_dfs.to_csv("all_datasets.csv", index=False)
all_dfs_labels.to_csv("all_datasets_labels_pii.csv", index=False)

  df_add = pd.read_csv(path_folder + "/" + csv_file, sep=sep)
