In [1]:
from pathlib import Path
import os

import pandas as pd

In [2]:
# Name of the raw data file for each supported dataset key.
csv_files = dict(
    german='german.data',
    compas='compas.csv',
    adult='adult.data',
)

# Dataset whose file path is resolved below.
dataset = 'german'

# Absolute path: <current working directory>/datasets/<dataset>/<file name>.
data_path = Path('datasets', dataset, csv_files[dataset]).absolute()

In [3]:
# NOTE: `csv_files` is defined once, in the preceding cell. The identical copy
# that used to live here was removed so the two definitions cannot drift apart.

# Column names for german.data, in file order (passed to read_csv as `names`).
cols_german = [
    "checking_status", "duration", "credit_history", "purpose", "credit_amount",
    "savings_status", "employment", "installment_commitment", "personal_status",
    "other_parties", "residence_since", "property_magnitude", "age", "other_payment_plans",
    "housing", "existing_credits", "job", "num_dependents", "own_telephone",
    "foreign_worker", "target"
]

# Column names for adult.data, in file order (passed to read_csv as `names`).
cols_adult = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

# Explicit column names per dataset; None means no names are supplied and the
# file's own header row is used.
cols = {
    'german': cols_german,
    'adult': cols_adult,
    'compas': None,
}

# Field separator handed to read_csv for each dataset.
seps = {
    'german': " ",
    'adult': ",",
    'compas': ",",
}

In [4]:
# Human-readable labels for the categorical codes of the German Credit data,
# keyed by column name and then by raw code. Codes follow the UCI Statlog
# (German Credit Data) attribute documentation.
mappings_german = {
    "checking_status": {
        "A11": "less than 0 DM", "A12": "between 0 DM and 200 DM", "A13": "over 200 DM", "A14": "no checking account"
    },
    "credit_history": {
        "A30": "no previous credits", "A31": "all previous credits paid back duly",
        "A32": "existing credits paid back duly so far", "A33": "some delays in the past",
        "A34": "critical account or other credits existing"
    },
    "purpose": {
        "A40": "a new car", "A41": "a used car", "A42": "furniture or equipment",
        "A43": "a radio or television", "A44": "domestic appliances",
        "A45": "home repairs", "A46": "education", "A47": "a vacation",
        "A48": "retraining", "A49": "a business", "A410": "other purposes"
    },
    "savings_status": {
        "A61": "less than 100 DM", "A62": "between 100 DM and 500 DM",
        "A63": "between 500 and 1000 DM", "A64": "over 1000 DM", "A65": "no savings account"
    },
    "employment": {
        "A71": "unemployed", "A72": "employed for less than 1 year",
        "A73": "employed for 1–4 years", "A74": "employed for 4–7 years",
        "A75": "employed for more than 7 years"
    },
    # personal_status combines two facts, so each code maps to a
    # (gender, marital status) tuple instead of a single label.
    "personal_status": {
        "A91": ("male", "divorced or separated"),
        "A92": ("female", "divorced, separated, or married"),
        "A93": ("male", "single"),
        "A94": ("male", "married or widowed"),
        "A95": ("female", "single")
    },
    "other_parties": {"A101": "none", "A102": "a co-applicant", "A103": "a guarantor"},
    "property_magnitude": {
        "A121": "real estate", "A122": "life insurance",
        "A123": "a car or other asset", "A124": "no tangible property"
    },
    # Documented order is A141 = bank, A142 = stores, A143 = none. The labels
    # were previously shifted by one, which turned "none" into "through stores".
    "other_payment_plans": {"A141": "through a bank", "A142": "through stores", "A143": "none"},
    # A153 is documented as "for free" (neither rented nor owned).
    "housing": {"A151": "rented", "A152": "owned", "A153": "rent-free"},
    "job": {
        "A171": "unskilled and non-resident",
        "A172": "unskilled but resident",
        "A173": "skilled employee or official",
        "A174": "management or self-employed"
    },
    "own_telephone": {"A191": "no", "A192": "yes"},
    "foreign_worker": {"A201": "yes", "A202": "no"},
}


In [52]:
def describe_german(row):
    """Render one German Credit record as an English paragraph.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single record indexed by column name. Categorical fields are
        expected to hold readable labels already; ``personal_status`` is used
        only when it is a ``(gender, marital status)`` tuple.

    Returns
    -------
    str
        The description, built sentence by sentence.
    """
    # Gender and marital status are only known when personal_status was mapped
    # to a tuple; a raw code or missing value leaves both out of the text.
    gender, marital = "", None
    if isinstance(row["personal_status"], tuple):
        gender, marital = row["personal_status"]
        gender = " " + gender

    # Collect the "who ..." clauses first so that a missing clause never
    # leaves a dangling "who." or "and." behind.
    clauses = []
    if marital:
        clauses.append(f"is {marital}")
    employment = row["employment"]
    if isinstance(employment, str):
        if "unemployed" in employment:
            clauses.append("is currently unemployed")
        else:
            clauses.append(f"has been {employment}")

    desc = f"A {int(row['age'])}-year-old{gender} applicant"
    if clauses:
        desc += " who " + " and ".join(clauses)
    desc += "."

    desc += (
        f" They applied for a loan of {int(row['credit_amount'])} Deutsche Marks "
        f"to finance {row['purpose']} lasting {int(row['duration'])} months."
    )

    # The "no ... account" labels are not balances, so they get their own
    # wording instead of "balance is no checking account".
    checking = row["checking_status"]
    if checking == "no checking account":
        checking_clause = "They have no checking account"
    else:
        checking_clause = f"Their checking account balance is {checking}"
    savings = row["savings_status"]
    if savings == "no savings account":
        savings_clause = "they have no savings account"
    else:
        savings_clause = f"they have savings of {savings}"
    desc += f" {checking_clause} and {savings_clause}."

    desc += f" Their credit history shows {row['credit_history']}."

    # Housing has a third category besides owned/rented; it must not be
    # reported as renting.
    housing = row["housing"]
    if housing == "owned":
        housing_clause = "own their home"
    elif housing == "rented":
        housing_clause = "rent their home"
    else:
        housing_clause = "neither own nor rent their home"
    desc += f" They {housing_clause} and possess {row['property_magnitude']}."

    if row["existing_credits"] > 1:
        desc += f" They currently hold {int(row['existing_credits'])} existing credits."
    else:
        desc += " They currently hold only one existing credit."
    if row["num_dependents"] > 0:
        desc += f" They support {int(row['num_dependents'])} dependent(s)."
    if row["own_telephone"] == "yes":
        desc += " They have a telephone registered in their name."
    else:
        desc += " They do not have a personal telephone."
    if row["foreign_worker"] == "yes":
        desc += " The applicant is a foreign worker."
    else:
        desc += " The applicant is not a foreign worker."
    return desc


def describe_adult(row):
    """Render one Adult (census income) record as an English paragraph.

    Text fields are stripped of surrounding whitespace, and a field that is
    not a string, is empty, or is the placeholder ``"?"`` is treated as
    missing and left out of the text instead of being printed verbatim.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single record indexed by column name. ``age`` and
        ``hours_per_week`` must be convertible with ``int``.

    Returns
    -------
    str
        The description, built sentence by sentence.
    """

    def _field(key, *, dehyphenate=False, lower=True):
        """Return the cleaned text of ``row[key]``, or None when it is missing."""
        value = row[key]
        if not isinstance(value, str):
            return None
        value = value.strip()
        if not value or value == "?":
            return None
        if dehyphenate:
            value = value.replace("-", " ")
        return value.lower() if lower else value

    # Fall back to a neutral noun so the opening sentence stays grammatical.
    sex = _field("sex") or "person"
    desc = f"A {int(row['age'])}-year-old {sex}"

    # Join only the traits that are present, so a missing marital status
    # cannot leave a sentence starting with "and".
    traits = []
    marital = _field("marital_status", dehyphenate=True)
    if marital:
        traits.append(f"is {marital}")
    education = _field("education")
    if education:
        traits.append(f"has an education level of {education}")
    if traits:
        desc += " who " + " and ".join(traits)
    desc += "."

    hours = int(row["hours_per_week"])
    occupation = _field("occupation", dehyphenate=True)
    workclass = _field("workclass", dehyphenate=True)
    if occupation or workclass:
        work = " They work"
        if occupation:
            work += f" as a {occupation}"
        if workclass:
            work += f" in the {workclass} sector"
        desc += f"{work}, typically working {hours} hours per week."
    else:
        # Neither field is known: still report the working hours.
        desc += f" They typically work {hours} hours per week."

    race = _field("race")
    # The country keeps its original capitalisation and hyphens.
    country = _field("native_country", lower=False)
    if race and country:
        desc += f" Their race is {race}, and they are from {country}."
    elif race:
        desc += f" Their race is {race}."
    elif country:
        desc += f" They are from {country}."
    return desc


def describe_compas(row):
    """Render one COMPAS record as an English paragraph.

    ``age``, ``sex`` and ``race`` are required. The charge degree, the prior
    count and the juvenile counts are each mentioned only when the column is
    present and its value is not missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single record indexed by column name.

    Returns
    -------
    str
        The description, built sentence by sentence.
    """
    desc = f"A {int(row['age'])}-year-old {row['sex'].lower()} of {row['race'].lower()} race"

    # Type of current charge
    if 'c_charge_degree' in row and pd.notna(row['c_charge_degree']):
        charge_type = str(row['c_charge_degree'])
        if charge_type == 'F':
            desc += " currently facing a felony charge"
        elif charge_type == 'M':
            desc += " currently facing a misdemeanor charge"
        else:
            desc += f" currently facing a charge of type {charge_type}"
    desc += "."

    # Number of prior offenses
    if 'priors_count' in row and pd.notna(row['priors_count']):
        priors = int(row['priors_count'])
        if priors == 0:
            desc += " They have no prior offenses."
        elif priors == 1:
            desc += " They have one prior offense."
        else:
            desc += f" They have {priors} prior offenses."

    # Juvenile history: (singular, plural) wording per count column, so the
    # text does not expose raw column names such as "juv fel count".
    # NOTE(review): the labels are read off the column names - confirm
    # against the COMPAS data dictionary.
    juvenile_labels = {
        'juv_fel_count': ("juvenile felony", "juvenile felonies"),
        'juv_misd_count': ("juvenile misdemeanor", "juvenile misdemeanors"),
        'juv_other_count': ("other juvenile offense", "other juvenile offenses"),
    }
    juv_desc = []
    for col, (singular, plural) in juvenile_labels.items():
        if col in row and pd.notna(row[col]):
            count = int(row[col])
            if count > 0:
                juv_desc.append(f"{count} {singular if count == 1 else plural}")
    if juv_desc:
        desc += " Their juvenile record includes: " + ", ".join(juv_desc) + "."

    return desc


# Dispatch table: dataset key -> function that turns one row into text.
desc_fun = dict(
    german=describe_german,
    adult=describe_adult,
    compas=describe_compas,
)

In [45]:
def process(dataset: str) -> pd.DataFrame:
    """Load one dataset and add a text ``description`` column.

    The file is read from ``<cwd>/datasets/<dataset>/<file name>``. For every
    dataset except ``"compas"`` the column names come from ``cols`` and rows
    with missing values are dropped. For ``"german"`` the categorical codes
    are replaced by the labels in ``mappings_german``; codes without a label
    are kept unchanged.

    Parameters
    ----------
    dataset : str
        Dataset key; must exist in ``csv_files``, ``seps`` and ``desc_fun``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the extra ``description`` column.

    Raises
    ------
    KeyError
        If ``dataset`` is not a known key.
    """
    path = Path("datasets").absolute() / dataset / csv_files[dataset]
    read_options = {"sep": seps[dataset]}

    has_header = dataset == "compas"
    if not has_header:
        read_options["names"] = cols.get(dataset)
    if dataset == "adult":
        # Per the UCI documentation, adult.data separates fields with ", " and
        # marks unknown values with "?". Without these options every string
        # keeps a leading space and the dropna below never matches a row.
        read_options.update(skipinitialspace=True, na_values="?")

    df = pd.read_csv(path, **read_options)
    if not has_header:
        # The index is deliberately not reset, as before.
        df = df.dropna()

    if dataset == "german":
        for col, mapping in mappings_german.items():
            # Codes absent from the mapping fall back to their raw value.
            df[col] = df[col].map(mapping).fillna(df[col])

    df["description"] = df.apply(desc_fun[dataset], axis=1)

    return df

In [53]:
# Load the COMPAS data together with its generated "description" column.
df = process(dataset='compas')