In [None]:
# default_exp utils

In [None]:
# hide
from ipynb_path import *

In [None]:
# export
from counterfactual.import_essentials import *

In [None]:
# export
def dict2json(dictionary: dict, file_name: str):
    """Write ``dictionary`` to ``file_name`` as JSON with a 4-space indent.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_name, mode="w") as handle:
        json.dump(obj=dictionary, fp=handle, indent=4)

def load_json(file_name: str):
    """Open ``file_name`` and return the decoded JSON value it contains."""
    with open(file_name) as source:
        parsed = json.load(source)
    return parsed

def update_json_file(param: dict, file_name: str):
    """Merge ``param`` into the JSON object stored at ``file_name`` and save it.

    If the file exists its content is loaded first; otherwise the merge starts
    from an empty dict. Keys in ``param`` overwrite keys already in the file.
    The merged dict is written back to ``file_name`` and returned.
    """
    merged = load_json(file_name) if os.path.exists(file_name) else {}
    # entries from `param` take precedence over what was on disk
    for key, value in param.items():
        merged[key] = value
    dict2json(merged, file_name)
    return merged

In [None]:
def simple_transform(df: pd.DataFrame, cat_cols: list, outcome_col: str):
    """Reorder columns to [remaining (continuous) columns..., cat_cols..., outcome_col].

    ``outcome_col`` must be a column of ``df`` (checked with ``assert``). Every
    name in ``cat_cols`` must also be present, otherwise ``list.remove`` raises
    ``ValueError``. Returns ``df`` indexed by the reordered column list; the
    ``cat_cols`` argument itself is not modified.
    """
    ordered = df.columns.tolist()

    assert outcome_col in ordered

    # strip the categorical columns first, then the outcome; what is left keeps its original order
    for name in cat_cols:
        ordered.remove(name)
    ordered.remove(outcome_col)

    # re-append in the target layout: categoricals, then the outcome last
    ordered.extend(cat_cols)
    ordered.append(outcome_col)
    return df[ordered]

def simple_init_params(cols: list, cat_cols: list, outcome_col: str, file_name: str,
                       base_config: str = "../counterfactual/configs/adult.json"):
    """Create or update the JSON config at ``file_name`` for a new dataset.

    The config at ``base_config`` (the adult config by default) is used as a
    template; its column lists are then overridden and the result is merged
    into ``file_name`` via ``update_json_file``.

    Parameters
    ----------
    cols : list
        All column names of the dataset. This list is NOT modified (earlier
        versions removed entries from the caller's list in place).
    cat_cols : list
        Names of the categorical columns; each must appear in ``cols``.
    outcome_col : str
        Name of the target column; must appear in ``cols``.
    file_name : str
        Path of the JSON config file to create/update.
    base_config : str, optional
        Path of the template config to start from.

    Returns
    -------
    dict
        The merged config as returned by ``update_json_file``.

    Raises
    ------
    ValueError
        If a categorical column or the outcome column is not in ``cols``.
    """
    param = load_json(base_config)

    # work on a copy so the caller's column list is left untouched
    cont_cols = list(cols)
    for col in cat_cols:
        cont_cols.remove(col)
    cont_cols.remove(outcome_col)

    param['continous_cols'] = cont_cols  # key spelling kept as used by the existing configs
    param['cat_cols'] = cat_cols
    # The config-writing cells in this notebook store the categorical columns under
    # 'discret_cols'; write that key too so the template's own value is not left stale.
    param['discret_cols'] = list(cat_cols)
    return update_json_file(param, file_name)

# Dummy Dataset

In [None]:
# export
def bn_func(x1, x2, x3, x4):
    """Return sigmoid(10.5 * x1 * x2 / 8100 + 10 - eps * x3 + 1e-3 * x4).

    ``eps`` is multiplicative noise drawn from N(1, 0.1) through the global
    ``np.random`` state, one draw per element. The noise shape is taken from the
    broadcast shape of the inputs, so inputs of any (mutually broadcastable)
    length are accepted; for 1-D inputs of length 10000 the draws are the same
    as with the previously hard-coded size.

    Returns an array of values in (0, 1) with the broadcast shape of the inputs.
    """
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # size the noise from the inputs instead of a fixed 10000
    noise_shape = np.broadcast(x1, x2, x3, x4).shape
    noise = np.random.normal(1, 0.1, noise_shape)
    return sigmoid(10.5 * ((x1 * x2) / 8100) + 10 - noise * x3 + 1e-3 * x4)

def x1_to_x3(x1):
    """Deterministic part of x3 as a linear function of x1: (1/3) * x1 - 5."""
    slope, offset = 1 / 3, 5
    return slope * x1 - offset

def x1x2_to_x4(x1, x2):
    """Deterministic part of x4: x1 * log(x2 ** 2) / 10 - 10.

    Squaring x2 before the log keeps negative x2 valid; x2 == 0 yields -inf from the log.
    """
    log_of_square = np.log(x2 ** 2)
    scaled = x1 * log_of_square / 10
    return scaled - 10

def bn_gen():
    """Generate the synthetic dummy dataset.

    modify code from: https://github.com/divyat09/cf-feasibility/blob/master/generativecf/scripts/simple-bn-gen.py

    Sampling scheme (all draws use the global ``np.random`` state):
      * x1 ~ N(50, 15), x2 ~ N(35, 17)
      * x3 = x1_to_x3(x1) + N(0, 1)
      * x4 = x1x2_to_x4(x1, x2) + N(0, 1)
      * y  = 1 if bn_func(x1, x2, x3, x4) > 0.5 else 0

    Returns
    -------
    pd.DataFrame
        10000 rows with float columns ['x1', 'x2', 'x3', 'x4', 'y'].
    """
    n_samples = 10000  # sample count; kept fixed at 10000 as in the original script

    x1 = np.random.normal(50, 15, n_samples)
    x2 = np.random.normal(35, 17, n_samples)
    x3 = x1_to_x3(x1) + np.random.normal(0, 1, n_samples)
    x4 = x1x2_to_x4(x1, x2) + np.random.normal(0, 1, n_samples)
    y = bn_func(x1, x2, x3, x4)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the drop-in replacement.
    labels = np.array(y > .5, dtype=int)

    # column_stack upcasts the integer labels to float, matching the float matrix built before
    data = np.column_stack([x1, x2, x3, x4, labels])

    return pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4', 'y'])

In [None]:
data = bn_gen()
data

Unnamed: 0,x1,x2,x3,x4,y
0,46.444967,57.013846,11.928117,25.851832,1.0
1,37.128196,59.523073,7.821358,19.228805,1.0
2,36.446772,-0.975861,8.394476,-10.424044,1.0
3,50.349917,21.483236,11.082811,21.117522,0.0
4,45.720811,9.802985,9.591727,10.688967,1.0
...,...,...,...,...,...
9995,34.491868,53.871741,6.297894,17.766362,1.0
9996,43.381417,18.289577,10.738537,15.179331,1.0
9997,43.219984,55.060190,9.571270,25.386525,1.0
9998,71.956096,39.327834,20.292500,42.318122,0.0


In [None]:
sum(data['y'])

5669.0

In [None]:
# data = bn_gen()
data.to_csv('../data/dummy_data.csv', index=False)

# Adult Dataset

In [None]:
# export

def load_adult_income_dataset(path=None):
    """Load and preprocess the UCI Adult income dataset.

    Data source: https://archive.ics.uci.edu/ml/datasets/Adult
    Recoding follows https://rpubs.com/H_Zhu/235617
    Copied from https://github.com/interpretml/DiCE/blob/master/dice_ml/utils/helpers.py

    :param path: anything ``np.genfromtxt`` accepts as input (e.g. a local file path);
        when None, the data is read from the UCI URL.
    :return adult_data: preprocessed DataFrame with columns
        age, hours_per_week, workclass, education, marital_status, occupation,
        race, gender, income ('<=50K' recoded to 0, '>50K' to 1).
    """
    source = path
    if source is None:
        source = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    raw_data = np.genfromtxt(source, delimiter=', ', dtype=str)

    #  column names from "https://archive.ics.uci.edu/ml/datasets/Adult"
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                    'marital-status', 'occupation', 'relationship', 'race', 'gender',
                    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

    # everything is read as str; cast the three numeric columns that are used or checked later
    numeric_types = {"age": np.int64, "educational-num": np.int64, "hours-per-week": np.int64}
    adult_data = pd.DataFrame(raw_data, columns=column_names).astype(numeric_types)

    # Category coarsening, applied one column at a time in the original order.
    # For details on these groupings see https://rpubs.com/H_Zhu/235617
    coarsening = [
        ('workclass', {
            'Without-pay': 'Other/Unknown', 'Never-worked': 'Other/Unknown',
            'Federal-gov': 'Government', 'State-gov': 'Government', 'Local-gov': 'Government',
            'Self-emp-not-inc': 'Self-Employed', 'Self-emp-inc': 'Self-Employed',
            '?': 'Other/Unknown',
        }),
        ('occupation', {
            'Adm-clerical': 'White-Collar', 'Craft-repair': 'Blue-Collar',
            'Exec-managerial': 'White-Collar', 'Farming-fishing': 'Blue-Collar',
            'Handlers-cleaners': 'Blue-Collar',
            'Machine-op-inspct': 'Blue-Collar', 'Other-service': 'Service',
            'Priv-house-serv': 'Service',
            'Prof-specialty': 'Professional', 'Protective-serv': 'Service',
            'Tech-support': 'Service',
            'Transport-moving': 'Blue-Collar', 'Unknown': 'Other/Unknown',
            'Armed-Forces': 'Other/Unknown', '?': 'Other/Unknown',
        }),
        ('marital-status', {
            'Married-civ-spouse': 'Married', 'Married-AF-spouse': 'Married',
            'Married-spouse-absent': 'Married', 'Never-married': 'Single',
        }),
        ('race', {
            'Black': 'Other', 'Asian-Pac-Islander': 'Other', 'Amer-Indian-Eskimo': 'Other',
        }),
    ]
    for column, mapping in coarsening:
        adult_data = adult_data.replace({column: mapping})

    # keep only the features used downstream, with the outcome last
    kept_columns = ['age', 'hours-per-week', 'workclass', 'education', 'marital-status',
                    'occupation', 'race', 'gender', 'income']
    adult_data = adult_data[kept_columns]

    adult_data = adult_data.replace({'income': {'<=50K': 0, '>50K': 1}})

    education_groups = {'Assoc-voc': 'Assoc', 'Assoc-acdm': 'Assoc'}
    for grade in ('11th', '10th', '7th-8th', '9th', '12th', '5th-6th', '1st-4th', 'Preschool'):
        education_groups[grade] = 'School'
    adult_data = adult_data.replace({'education': education_groups})

    return adult_data.rename(
        columns={'marital-status': 'marital_status', 'hours-per-week': 'hours_per_week'})

In [None]:
adult = load_adult_income_dataset('../data/adult.data')
adult.to_csv('../data/adult.csv', index=False)

In [None]:
s_adult = shuffle(adult)
s_adult.to_csv('../data/s_adult.csv', index=False)

In [None]:
# Attach a soft target 'y': a draw from uniform(0.05, 0.2) when income == 0, otherwise a draw
# from uniform(0.8, 0.95). `uniform` comes from the star import at the top of the notebook
# (presumably random.uniform — confirm).
# NOTE(review): no seed is set in this cell, so the values differ on every run; the column is
# added after the to_csv calls in the cells above, so those files do not contain it.
adult['y'] = adult['income'].apply(lambda x: uniform(0.05, 0.2) if x == 0 else uniform(0.8, 0.95))

# OULAD

In [None]:
#export
def load_learning_analytic_data(path='../data/oulad'):
    """Build one feature row per student registration from the OULAD CSV files.

    Reads ``assessments.csv``, ``studentAssessment.csv``, ``studentInfo.csv``,
    ``studentVle.csv`` and ``vle.csv`` from ``path`` and, for every
    (code_module, code_presentation, id_student):

    * aggregates submitted assessments into ``weight`` (sum of assessment
      weights) and ``weighted_score`` (weight-averaged score, or the plain mean
      when the total weight is 0);
    * sums VLE clicks per activity type into ``<activity_type>_click`` columns.

    Both aggregates are left-joined onto ``studentInfo``, so students without
    assessments or VLE activity get NaN in the corresponding columns.

    ``courses.csv`` and ``studentRegistration.csv`` are no longer read: they
    were loaded but never used.

    :param path: directory that contains the OULAD CSV files.
    :return: DataFrame with the numeric features first, then the categorical
        features, then ``final_result`` as the last column.
    :raises KeyError: if one of the expected ``*_click`` columns is absent
        (i.e. the activity type never occurs in the data).
    """
    student_keys = ['code_module', 'code_presentation', 'id_student']

    def weighted_score(x):
        # builtin sum is kept on purpose: unlike Series.sum() it does not skip NaN scores
        total_weight = sum(x['weight'])
        if total_weight == 0:
            # all assessments carry zero weight: fall back to the unweighted mean
            score = sum(x['score']) / len(x['score'])
        else:
            score = sum(x['score'] * x['weight']) / total_weight
        return pd.DataFrame({'weight': total_weight, 'weighted_score': score}, index=[0])

    def clicks(x):
        # one-row frame: one "<activity_type>_click" column per activity type seen for this student
        types = x['activity_type']
        sum_clicks = x['sum_click']
        return pd.DataFrame({f"{t}_click": c for t, c in zip(types, sum_clicks)}, index=[0])

    print('loading pandas dataframes...')

    assessment = pd.read_csv(f'{path}/assessments.csv')
    student_assessment = pd.read_csv(f'{path}/studentAssessment.csv')
    student_info = pd.read_csv(f'{path}/studentInfo.csv')
    student_vle = pd.read_csv(f'{path}/studentVle.csv')
    vle = pd.read_csv(f'{path}/vle.csv')

    print('preprocessing assessment...')

    # note: only count for submitted assessment, not weighted for unsubmitted ones
    assessment_merged = student_assessment.merge(assessment)
    assessment_grouped = assessment_merged.groupby(student_keys).apply(weighted_score)
    # each group returns a frame indexed [0]; after reset_index that unnamed index level
    # shows up as the 'level_3' column, which carries no information
    assessment_df = assessment_grouped.reset_index(None).drop(['level_3'], axis=1)

    print('preprocessing vle...')

    # vle: first total the clicks per (activity type, student), then spread the
    # activity types into columns, one row per student
    grouped_vle = student_vle.merge(vle).groupby(['activity_type'] + student_keys)
    sumed_vle = grouped_vle.sum().drop(['id_site', 'date', 'week_from', 'week_to'], axis=1).reset_index()
    grouped_vle = sumed_vle.groupby(student_keys).apply(clicks)
    vle_df = grouped_vle.reset_index(None).drop(['level_3'], axis=1)

    student_df = student_info.merge(assessment_df, on=student_keys, how='left')\
        .merge(vle_df, on=student_keys, how='left')

    return student_df[['num_of_prev_attempts', 'weight', 'weighted_score',
                       'forumng_click', 'homepage_click', 'oucontent_click',
                       'resource_click', 'subpage_click', 'url_click', 'dataplus_click',
                       'glossary_click', 'oucollaborate_click', 'quiz_click',
                       'ouelluminate_click', 'sharedsubpage_click', 'questionnaire_click',
                       'page_click', 'externalquiz_click', 'ouwiki_click', 'dualpane_click',
                       'folder_click', 'repeatactivity_click', 'htmlactivity_click',
                       'code_module', 'gender', 'region',
                       'highest_education', 'imd_band', 'age_band', 'studied_credits',
                       'disability', 'final_result']]

In [None]:
%%time
student_df = load_learning_analytic_data(path='../data/oulad')

loading pandas dataframes...
preprocessing assessment...
preprocessing vle...
Wall time: 1min 15s


In [None]:
student_df = shuffle(student_df)
student_df.to_csv('../data/s_student.csv', index=None)

In [None]:
student_df = pd.read_csv('../data/s_student.csv')

In [None]:
# Recode the outcome to margin-style labels: 0 -> -1, every other value -> 1.
# NOTE(review): not idempotent — running this cell a second time without re-reading the CSV
# turns the -1 labels into 1 (since -1 != 0). Re-run the read_csv cell above first.
# NOTE(review): assumes final_result in s_student.csv is already 0/1-encoded — confirm.
student_df['final_result'] = student_df['final_result'].apply(lambda x: -1 if x == 0 else 1)
student_df

Unnamed: 0,num_of_prev_attempts,weight,weighted_score,forumng_click,homepage_click,oucontent_click,resource_click,subpage_click,url_click,dataplus_click,...,htmlactivity_click,code_module,gender,region,highest_education,imd_band,age_band,studied_credits,disability,final_result
0,0,0.0,0.000000,5.0,16.0,81.0,0.0,8.0,2.0,0.0,...,0.0,AAA,M,East Anglian Region,A Level or Equivalent,70-80%,0-35,120,N,-1
1,0,82.0,80.731707,558.0,191.0,14.0,63.0,79.0,14.0,0.0,...,0.0,BBB,F,Yorkshire Region,HE Qualification,30-40%,35-55,60,N,1
2,0,0.0,0.000000,46.0,7.0,3.0,0.0,3.0,0.0,0.0,...,0.0,FFF,M,South Region,A Level or Equivalent,90-100%,0-35,90,N,-1
3,0,200.0,31.375000,914.0,730.0,434.0,71.0,142.0,8.0,0.0,...,0.0,CCC,F,North Western Region,A Level or Equivalent,10-20,0-35,90,N,-1
4,0,12.5,90.000000,20.0,8.0,17.0,7.0,18.0,1.0,0.0,...,0.0,FFF,F,North Western Region,A Level or Equivalent,60-70%,0-35,120,N,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32588,0,100.0,65.500000,466.0,266.0,0.0,36.0,88.0,14.0,0.0,...,0.0,BBB,F,East Anglian Region,A Level or Equivalent,10-20,0-35,60,N,1
32589,2,100.0,64.500000,41.0,57.0,319.0,3.0,52.0,3.0,1.0,...,1.0,FFF,M,Scotland,HE Qualification,10-20,35-55,120,N,1
32590,0,70.0,58.071429,36.0,115.0,0.0,35.0,56.0,2.0,0.0,...,0.0,CCC,M,West Midlands Region,A Level or Equivalent,40-50%,0-35,60,N,-1
32591,1,0.0,0.000000,8.0,25.0,0.0,4.0,2.0,2.0,0.0,...,0.0,BBB,F,South Region,Lower Than A Level,40-50%,0-35,120,Y,-1


In [None]:
student_df.to_csv('../data/s_margin_student.csv', index=None)

In [None]:
assessment = pd.read_csv('../data/oulad/assessments.csv')
courses = pd.read_csv('../data/oulad/courses.csv')
student_assessment = pd.read_csv('../data/oulad/studentAssessment.csv')
student_info = pd.read_csv('../data/oulad/studentInfo.csv')
student_regist = pd.read_csv('../data/oulad/studentRegistration.csv')
student_vle = pd.read_csv('../data/oulad/studentVle.csv')
vle = pd.read_csv('../data/oulad/vle.csv')


In [None]:
s_assess = student_assessment.merge(assessment)
s_assess[['id_student', 'is_banked', 'score', 'code_module', 'assessment_type', 'weight']]

Unnamed: 0,id_student,is_banked,score,code_module,assessment_type,weight
0,11391,0,78.0,AAA,TMA,10.0
1,28400,0,70.0,AAA,TMA,10.0
2,31604,0,72.0,AAA,TMA,10.0
3,32885,0,69.0,AAA,TMA,10.0
4,38053,0,79.0,AAA,TMA,10.0
...,...,...,...,...,...,...
173907,527538,0,60.0,GGG,CMA,0.0
173908,534672,0,100.0,GGG,CMA,0.0
173909,546286,0,80.0,GGG,CMA,0.0
173910,546724,0,100.0,GGG,CMA,0.0


In [None]:
%%time
def weighted_score(x):
    """Collapse one student's assessment rows into a one-row frame.

    Returns a DataFrame indexed [0] with:
      * 'weight': the sum of x['weight'];
      * 'weighted_score': sum(score * weight) / sum(weight), or the plain mean
        of x['score'] when the total weight is 0.
    """
    weights, scores = x['weight'], x['score']
    total_weight = sum(weights)
    if total_weight != 0:
        score = sum(scores * weights) / total_weight
    else:
        # only zero-weight assessments: use the unweighted mean instead
        score = sum(scores) / len(scores)
    return pd.DataFrame({'weight': total_weight, 'weighted_score': score}, index=[0])

s_assess = s_assess.groupby(['code_module', 'code_presentation', 'id_student']).apply(weighted_score).reset_index(None)

Wall time: 15.9 s


In [None]:
%%time

def clicks(x):
    """Turn one student's per-activity click totals into a single wide row.

    For every (activity_type, sum_click) pair in ``x`` a column named
    "<activity_type>_click" is created; the result is a one-row DataFrame indexed [0].
    """
    row = {}
    for kind, count in zip(x['activity_type'], x['sum_click']):
        row[f"{kind}_click"] = count
    return pd.DataFrame(row, index=[0])

grouped = student_vle.merge(vle).groupby(['activity_type', 'code_module', 'code_presentation', 'id_student'])
grouped_vle = grouped.sum().drop(['id_site', 'date', 'week_from', 'week_to'], axis=1).reset_index()
grouped_vle = grouped_vle.groupby(['code_module', 'code_presentation', 'id_student']).apply(clicks)
grouped_vle = grouped_vle.reset_index(None).drop(['level_3'], axis=1)


Wall time: 54 s


In [None]:
grouped_vle

Unnamed: 0,code_module,code_presentation,id_student,forumng_click,homepage_click,oucontent_click,resource_click,subpage_click,url_click,dataplus_click,...,ouelluminate_click,sharedsubpage_click,questionnaire_click,page_click,externalquiz_click,ouwiki_click,dualpane_click,folder_click,repeatactivity_click,htmlactivity_click
0,AAA,2013J,11391,193.0,138.0,553.0,13.0,32.0,5.0,,...,,,,,,,,,,
1,AAA,2013J,28400,417.0,324.0,537.0,12.0,87.0,48.0,10.0,...,,,,,,,,,,
2,AAA,2013J,30268,126.0,59.0,66.0,4.0,22.0,4.0,,...,,,,,,,,,,
3,AAA,2013J,31604,634.0,432.0,836.0,19.0,144.0,90.0,2.0,...,,,,,,,,,,
4,AAA,2013J,32885,194.0,204.0,494.0,45.0,79.0,14.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29223,GGG,2014J,2640965,,22.0,6.0,4.0,9.0,,,...,,,,,,,,,,
29224,GGG,2014J,2645731,65.0,167.0,348.0,109.0,47.0,,,...,,,,,,,,,,
29225,GGG,2014J,2648187,,63.0,79.0,19.0,20.0,,,...,,,,,,,,,,
29226,GGG,2014J,2679821,118.0,65.0,40.0,9.0,12.0,,,...,,,,,,,,,,


In [None]:
student_info.merge(grouped_vle, on=['code_module', 'code_presentation', 'id_student'], how='right').merge(s_assess, on=['code_module', 'code_presentation', 'id_student'], how='right')

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,page_click,externalquiz_click,ouwiki_click,dualpane_click,folder_click,repeatactivity_click,htmlactivity_click,level_3,weight,weighted_score
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0.0,240.0,...,,,,,,,,0,100.0,82.400000
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0.0,60.0,...,,,,,,,,0,100.0,65.400000
2,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0.0,60.0,...,,,,,,,,0,100.0,76.300000
3,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0.0,60.0,...,,,,,,,,0,100.0,55.000000
4,AAA,2013J,38053,M,Wales,A Level or Equivalent,80-90%,35-55,0.0,60.0,...,,,,,,,,0,100.0,66.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25838,GGG,2014J,2620947,F,Scotland,A Level or Equivalent,80-90%,0-35,0.0,30.0,...,,,,,,,,0,0.0,88.888889
25839,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0.0,30.0,...,,,,,,,,0,0.0,88.111111
25840,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0.0,30.0,...,,,,,,,,0,0.0,76.666667
25841,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0.0,30.0,...,,,,,,,,0,0.0,91.500000


# HELOC

Home Equity Line of Credit (HELOC) dataset from the FICO Explainable Machine Learning Challenge:
https://community.fico.com/s/explainable-machine-learning-challenge

In [None]:
df = pd.read_csv("../data/heloc_dataset.csv")
df

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [None]:
def map_MaxDelq2PublicRecLast12M(x):
    """Map a MaxDelq2PublicRecLast12M code to its text label.

    Codes 0-4 and 7 are mapped; any other value returns None.
    """
    code_labels = (
        (0, "Derogatory Comment"),
        (1, "120+ Days Delinquent"),
        (2, "90 Days Delinquent"),
        (3, "60 Days Delinquent"),
        (4, "30 Days Delinquent"),
        (7, "Never Delinquent"),
    )
    for code, label in code_labels:
        if x == code:
            return label
    return None

def map_MaxDelqEver(x):
    """Map a MaxDelqEver code to its text label.

    Codes 2-6 and 8 are mapped; any other value returns None.
    """
    code_labels = (
        (2, "Derogatory Comment"),
        (3, "120+ Days Delinquent"),
        (4, "90 Days Delinquent"),
        (5, "60 Days Delinquent"),
        (6, "30 Days Delinquent"),
        (8, "Never Delinquent"),
    )
    for code, label in code_labels:
        if x == code:
            return label
    return None
        
df['MaxDelq2PublicRecLast12M'] = df['MaxDelq2PublicRecLast12M'].apply(map_MaxDelq2PublicRecLast12M)
df['MaxDelqEver'] = df['MaxDelqEver'].apply(map_MaxDelqEver)

In [None]:
columns = df.columns.tolist()
cat_cols_list = ['MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'RiskPerformance']

for col in cat_cols_list:
    columns.remove(col)
columns += cat_cols_list
df = df[columns]
df

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,NumTotalTrades,...,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,MaxDelq2PublicRecLast12M,MaxDelqEver,RiskPerformance
0,55,144,4,84,20,3,0,83,2,23,...,0,33,-8,8,1,1,69,60 Days Delinquent,60 Days Delinquent,Bad
1,61,58,15,41,2,4,4,100,-7,7,...,0,0,-8,0,-8,-8,0,Derogatory Comment,Never Delinquent,Bad
2,67,66,5,24,9,0,0,100,-7,9,...,4,53,66,4,2,1,86,Never Delinquent,Never Delinquent,Bad
3,66,169,1,73,28,1,1,93,76,30,...,4,72,83,6,4,3,91,,30 Days Delinquent,Bad
4,81,333,27,132,12,0,0,100,-7,12,...,1,51,89,3,1,0,80,Never Delinquent,Never Delinquent,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,73,131,5,57,21,0,0,95,80,21,...,0,26,-8,5,2,0,100,,30 Days Delinquent,Good
10455,65,147,39,68,11,0,0,92,28,12,...,1,86,53,2,2,1,80,,30 Days Delinquent,Bad
10456,74,129,6,64,18,1,1,100,-7,18,...,4,6,-8,5,-8,0,56,,Never Delinquent,Bad
10457,72,234,12,113,42,2,2,96,35,45,...,0,19,-8,4,1,0,38,,Derogatory Comment,Bad


In [None]:
df.columns.tolist()

['ExternalRiskEstimate',
 'MSinceOldestTradeOpen',
 'MSinceMostRecentTradeOpen',
 'AverageMInFile',
 'NumSatisfactoryTrades',
 'NumTrades60Ever2DerogPubRec',
 'NumTrades90Ever2DerogPubRec',
 'PercentTradesNeverDelq',
 'MSinceMostRecentDelq',
 'NumTotalTrades',
 'NumTradesOpeninLast12M',
 'PercentInstallTrades',
 'MSinceMostRecentInqexcl7days',
 'NumInqLast6M',
 'NumInqLast6Mexcl7days',
 'NetFractionRevolvingBurden',
 'NetFractionInstallBurden',
 'NumRevolvingTradesWBalance',
 'NumInstallTradesWBalance',
 'NumBank2NatlTradesWHighUtilization',
 'PercentTradesWBalance',
 'MaxDelq2PublicRecLast12M',
 'MaxDelqEver',
 'RiskPerformance']

# Default of credit card clients

http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

In [None]:
df = pd.read_csv('../data/extra/credit_card.csv')
df = df.drop(['ID'], axis=1)
df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [None]:
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE',]
outcome_col = 'Y'
file_name = "data/extra/s_credit_cart.csv"

In [None]:
df = simple_transform(df, cat_cols, outcome_col)
df = shuffle(df)
df.to_csv('../' + file_name, index=False)

In [None]:
# Use the adult config as a template; the dataset-specific keys are overridden below.
# NOTE(review): keys that are not overridden here (e.g. the *_dims entries) are carried over
# from the template/existing file — confirm they suit this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# Whatever is left after removing the categorical and outcome columns is treated as
# continuous (cont_cols is the same list object as cols).
cont_cols = cols
param['data_dir'] = file_name
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the cell displays the merged dict.
update_json_file(param, "../counterfactual/configs/extra/s_credit_cart.json")

{'data_dir': 'data/extra/s_credit_cart.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['LIMIT_BAL',
  'AGE',
  'PAY_0',
  'PAY_2',
  'PAY_3',
  'PAY_4',
  'PAY_5',
  'PAY_6',
  'BILL_AMT1',
  'BILL_AMT2',
  'BILL_AMT3',
  'BILL_AMT4',
  'BILL_AMT5',
  'BILL_AMT6',
  'PAY_AMT1',
  'PAY_AMT2',
  'PAY_AMT3',
  'PAY_AMT4',
  'PAY_AMT5',
  'PAY_AMT6'],
 'discret_cols': ['SEX', 'EDUCATION', 'MARRIAGE'],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

# German Credit Dataset (Statlog)

http://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

In [None]:
data = np.genfromtxt('../data/extra/german.data', delimiter=' ',
            dtype=str)
cols = ['status', 'duration', 'history', 'purpose', 'credit_amount', 
        'savings', 'employment', 'installment_rate', 'sex', 'other_debtors',
       'residence', 'property', 'age', 'other_plan', 'housing', 'credits', 'job',
       'num_people_liable', 'telephone', 'foreign_worker', 'Y']
cat_cols = ['status', 'history','purpose', 'savings', 'employment','sex','other_debtors',
           'property','other_plan','housing','job','telephone', 'foreign_worker']

In [None]:
# `data` was read with dtype=str, so every column of this frame holds strings at this point.
df = pd.DataFrame(data, columns=cols)
# df = df.infer_objects()
# Shift the outcome down by one so that it starts at 0
# (presumably the raw labels are 1/2, giving 0/1 — confirm against the dataset description).
df['Y'] = df['Y'].astype(int).apply(lambda x: x - 1)

In [None]:
outcome_col = 'Y'
file_name = "data/extra/s_german_credit.csv"

In [None]:
df = simple_transform(df, cat_cols=cat_cols, outcome_col='Y')
df = shuffle(df)
df.to_csv('../' + file_name, index=False)

In [None]:
# Use the adult config as a template; the dataset-specific keys are overridden below.
# NOTE(review): keys that are not overridden here (e.g. the *_dims entries) are carried over
# from the template/existing file — confirm they suit this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# Whatever is left after removing the categorical and outcome columns is treated as
# continuous (cont_cols is the same list object as cols).
cont_cols = cols
param['data_dir'] = file_name
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the cell displays the merged dict.
update_json_file(param, "../counterfactual/configs/extra/s_german_credit.json")

{'data_dir': 'data/extra/s_german_credit.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['duration',
  'credit_amount',
  'installment_rate',
  'residence',
  'age',
  'credits',
  'num_people_liable'],
 'discret_cols': ['status',
  'history',
  'purpose',
  'savings',
  'employment',
  'sex',
  'other_debtors',
  'property',
  'other_plan',
  'housing',
  'job',
  'telephone',
  'foreign_worker'],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

# Student Performance

http://archive.ics.uci.edu/ml/datasets/Student+Performance
https://dash-xai.herokuapp.com/

In [None]:
df = pd.read_csv('../data/extra/student_performance.csv')
df

Unnamed: 0,G2,G1,failures,higher,age,school,goout,Mjob,Fjob,health,freetime,absences,Walc,famrel,Medu,Fedu,G3
0,D,F,0,yes,18,GP,4,at_home,teacher,3,3,4,1,4,4,4,D
1,D,F,0,yes,17,GP,3,at_home,other,3,3,2,1,5,1,1,D
2,C,C,0,yes,15,GP,2,at_home,other,3,3,6,3,4,1,1,C
3,B,B,0,yes,15,GP,2,health,services,5,2,0,1,3,4,2,B
4,C,D,0,yes,16,GP,2,other,other,5,3,0,2,4,3,3,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,D,D,1,yes,19,MS,2,services,other,5,4,4,2,5,2,3,D
645,B,B,0,yes,18,MS,4,teacher,services,1,3,4,1,4,3,1,A
646,C,D,0,yes,18,MS,1,other,other,5,1,6,1,1,1,1,F
647,D,D,0,yes,17,MS,5,services,services,2,4,6,4,2,3,1,D


In [None]:
cat_cols = ['G2', 'G1', 'higher', 'school', 'goout', 'Mjob', 'Fjob', 'health', 
            'freetime', 'absences', 'Walc', 'famrel', 'Medu', 'Fedu']
outcome_col = 'G3'
file_name = "data/extra/s_student_performance.csv"

In [None]:
# Binarize the outcome: final grade 'A' or 'B' -> 1, any other value -> 0.
# NOTE(review): not idempotent — re-running this cell on the already-binarized column maps
# every row to 0; reload the CSV before running it again.
df['G3'] = df['G3'].apply(lambda x: 1 if (x == 'A' or x == 'B') else 0)

In [None]:
df = simple_transform(df, cat_cols, outcome_col)
df = shuffle(df)
df.to_csv('../' + file_name, index=False)

In [None]:
# Use the adult config as a template; the dataset-specific keys are overridden below.
# NOTE(review): keys that are not overridden here (e.g. the *_dims entries) are carried over
# from the template/existing file — confirm they suit this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# Whatever is left after removing the categorical and outcome columns is treated as
# continuous (cont_cols is the same list object as cols).
cont_cols = cols
param['data_dir'] = file_name
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the cell displays the merged dict.
update_json_file(param, "../counterfactual/configs/extra/s_student_performance.json")

{'data_dir': 'data/extra/s_student_performance.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['failures', 'age'],
 'discret_cols': ['G2',
  'G1',
  'higher',
  'school',
  'goout',
  'Mjob',
  'Fjob',
  'health',
  'freetime',
  'absences',
  'Walc',
  'famrel',
  'Medu',
  'Fedu'],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

# Heart

https://www.kaggle.com/andrewmvd/heart-failure-clinical-data

In [None]:
df = pd.read_csv('../data/extra/heart.csv')
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [None]:
cat_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
outcome_col = 'DEATH_EVENT'
file_name = "data/extra/s_heart.csv"

In [None]:
df = simple_transform(df, cat_cols=cat_cols, outcome_col=outcome_col)
df = shuffle(df)
df.to_csv('../' + file_name, index=False)

In [None]:
# Use the adult config as a template; the dataset-specific keys are overridden below.
# NOTE(review): keys that are not overridden here (e.g. the *_dims entries) are carried over
# from the template/existing file — confirm they suit this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# Whatever is left after removing the categorical and outcome columns is treated as
# continuous (cont_cols is the same list object as cols).
cont_cols = cols
param['data_dir'] = file_name
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the cell displays the merged dict.
update_json_file(param, "../counterfactual/configs/extra/heart.json")

{'data_dir': 'data/extra/s_heart.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['age',
  'creatinine_phosphokinase',
  'ejection_fraction',
  'platelets',
  'serum_creatinine',
  'serum_sodium',
  'time'],
 'discret_cols': ['anaemia',
  'diabetes',
  'high_blood_pressure',
  'sex',
  'smoking'],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

# Titanic

https://www.kaggle.com/ak1352/titanic-cl?select=train_cl.csv

In [None]:
df = pd.read_csv('../data/extra/titanic.csv')
df = df.drop(['Embarked_0'], axis=1)
df

Unnamed: 0,Survived,Sex,Age,Fare,family,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4,SibSp_0,...,Pclass_2,Pclass_3,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,isAlone
0,0,0,22.000000,7.2500,2,0,0,0,1,0,...,0,1,1,0,0,0,0,0,0,0
1,1,1,38.000000,71.2833,2,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,1,26.000000,7.9250,1,0,0,1,0,1,...,0,1,1,0,0,0,0,0,0,1
3,1,1,35.000000,53.1000,2,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,35.000000,8.0500,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,27.000000,13.0000,1,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,1
887,1,1,19.000000,30.0000,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,1
888,0,1,29.699118,23.4500,4,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
889,1,0,26.000000,30.0000,1,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,1


In [None]:
df.columns

Index(['Survived', 'Sex', 'Age', 'Fare', 'family', 'AgeGroup_1', 'AgeGroup_2',
       'AgeGroup_3', 'AgeGroup_4', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5',
       'Parch_6', 'isAlone'],
      dtype='object')

In [None]:
# Categorical (discrete) columns, grouped by the name family they belong to.
# The resulting list order matches the column order shown by df.columns.
cat_cols = (
    ['Sex', 'family']
    + [f'AgeGroup_{k}' for k in range(1, 5)]
    + [f'SibSp_{k}' for k in (0, 1, 2, 3, 4, 5, 8)]
    + [f'Pclass_{k}' for k in range(1, 4)]
    + [f'Parch_{k}' for k in range(7)]
    + ['isAlone']
)
# Column holding the label.
outcome_col = 'Survived'
# Stored without the '../' prefix: the write below prepends it, and this exact
# value is what ends up under 'data_dir' in the config.
file_name = "data/extra/s_titanic.csv"

In [None]:
# Reorder columns to [continuous..., categorical..., outcome], shuffle the rows,
# then write the result next to the other prepared datasets.
# NOTE(review): shuffle is called without a seed, so the saved row order
# presumably differs between runs — confirm whether that matters downstream.
df = shuffle(simple_transform(df, cat_cols=cat_cols, outcome_col=outcome_col))
df.to_csv(f"../{file_name}", index=False)

In [None]:
# Use the adult config as a template: every key except the three assigned below
# is carried over from adult.json unchanged.
# NOTE(review): that includes 'encoder_dims' and the loss settings — confirm the
# inherited values are appropriate for this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
# Whatever is neither categorical nor the outcome is treated as continuous.
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# cont_cols is an alias of cols (same list object, not a copy)
cont_cols = cols
param['data_dir'] = file_name
# The key spelling 'continous_cols' is what describe() reads — keep it as is.
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the merged dict is
# returned and shown as the cell output.
update_json_file(param, "../counterfactual/configs/extra/titanic.json")

{'data_dir': 'data/extra/s_titanic.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['Age', 'Fare'],
 'discret_cols': ['Sex',
  'family',
  'AgeGroup_1',
  'AgeGroup_2',
  'AgeGroup_3',
  'AgeGroup_4',
  'SibSp_0',
  'SibSp_1',
  'SibSp_2',
  'SibSp_3',
  'SibSp_4',
  'SibSp_5',
  'SibSp_8',
  'Pclass_1',
  'Pclass_2',
  'Pclass_3',
  'Parch_0',
  'Parch_1',
  'Parch_2',
  'Parch_3',
  'Parch_4',
  'Parch_5',
  'Parch_6',
  'isAlone'],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

# breast-cancer-wisconsin

In [None]:
# Read the comma-separated WDBC file with every field kept as a string
# (dtype=str), so the non-numeric diagnosis label survives the load.
data = np.genfromtxt('../data/wdbc.data', delimiter=',',
            dtype=str)

In [None]:
# Column names for the raw file: ID, the diagnosis label, then X_1 .. X_30.
cols = ['ID', 'Diagnosis'] + [f'X_{i}' for i in range(1, 31)]

In [None]:
# No categorical columns are declared for this dataset, so every non-outcome
# column ends up in the continuous list of the config.
cat_cols = []
outcome_col = 'Diagnosis'
# Stored without the '../' prefix: the write below prepends it, and this exact
# value is what ends up under 'data_dir' in the config.
file_name = "data/extra/s_breast_cancer.csv"

In [None]:
# Build the frame from the raw string array and discard the ID column.
# The feature values are still strings at this point (data was read with dtype=str).
df = pd.DataFrame(data, columns=cols).drop(columns=['ID'])
# Binary target: 'B' -> 1, any other label -> 0.
df['Diagnosis'] = df['Diagnosis'].map(lambda label: int(label == 'B'))

# Move the outcome column to the end, shuffle the rows and write the CSV.
df = shuffle(simple_transform(df, cat_cols=cat_cols, outcome_col=outcome_col))
df.to_csv(f"../{file_name}", index=False)

In [None]:
# Use the adult config as a template: every key except the three assigned below
# is carried over from adult.json unchanged.
# NOTE(review): that includes 'encoder_dims' and the loss settings — confirm the
# inherited values are appropriate for this dataset.
param = load_json("../counterfactual/configs/adult.json")
cols = df.columns.tolist()
# cat_cols is empty for this dataset, so this loop removes nothing.
for col in cat_cols:
    cols.remove(col)
cols.remove(outcome_col)
# cont_cols is an alias of cols (same list object, not a copy)
cont_cols = cols
param['data_dir'] = file_name
# The key spelling 'continous_cols' is what describe() reads — keep it as is.
param['continous_cols'] = cont_cols
param['discret_cols'] = cat_cols
# Merge into the dataset's config file (created if missing); the merged dict is
# returned and shown as the cell output.
update_json_file(param, "../counterfactual/configs/extra/breast_cancer.json")

{'data_dir': 'data/extra/s_breast_cancer.csv',
 'lr': 0.01,
 'batch_size': 128,
 'lambda_1': 1.0,
 'lambda_2': 0.01,
 'lambda_3': 1.0,
 'threshold': 1.0,
 'continous_cols': ['X_1',
  'X_2',
  'X_3',
  'X_4',
  'X_5',
  'X_6',
  'X_7',
  'X_8',
  'X_9',
  'X_10',
  'X_11',
  'X_12',
  'X_13',
  'X_14',
  'X_15',
  'X_16',
  'X_17',
  'X_18',
  'X_19',
  'X_20',
  'X_21',
  'X_22',
  'X_23',
  'X_24',
  'X_25',
  'X_26',
  'X_27',
  'X_28',
  'X_29',
  'X_30'],
 'discret_cols': [],
 'encoder_dims': [29, 50, 10],
 'decoder_dims': [10, 10],
 'explainer_dims': [10, 50],
 'loss_1': 'mse',
 'loss_2': 'mse',
 'loss_3': 'mse'}

In [None]:
# Re-applies the column ordering with no categorical columns. When the cells
# above were run in order, 'Diagnosis' is already last, so the order is unchanged.
df = simple_transform(df, [], 'Diagnosis')

In [None]:
# Peek at the first raw record; every field is a string because of dtype=str.
data[0]

array(['842302', 'M', '17.99', '10.38', '122.8', '1001', '0.1184',
       '0.2776', '0.3001', '0.1471', '0.2419', '0.07871', '1.095',
       '0.9053', '8.589', '153.4', '0.006399', '0.04904', '0.05373',
       '0.01587', '0.03003', '0.006193', '25.38', '17.33', '184.6',
       '2019', '0.1622', '0.6656', '0.7119', '0.2654', '0.4601', '0.1189'],
      dtype='<U9')

# Describe Data

In [None]:
# (dataset name, parsed config) pairs, loaded in the order they should appear
# in the summary produced by describe().
configs = [
    (name, load_json(path))
    for name, path in (
        ("adult", "../counterfactual/configs/adult.json"),
        ("home", "../counterfactual/configs/home.json"),
        ("student", "../counterfactual/configs/student.json"),
        ("breast_cancer", "../counterfactual/configs/extra/breast_cancer.json"),
        ("credit_card", "../counterfactual/configs/extra/credit_card.json"),
        ("german_credit", "../counterfactual/configs/extra/german_credit.json"),
        ("heart", "../counterfactual/configs/extra/heart.json"),
        ("student_performance", "../counterfactual/configs/extra/student_performance.json"),
        ("titanic", "../counterfactual/configs/extra/titanic.json"),
    )
]

In [None]:
def describe(configs: list, output_path: str = "../results/data_describe.csv"):
    """Summarise each dataset's size and feature counts and save the table as CSV.

    Args:
        configs: iterable of ``(data_name, config)`` pairs. Each ``config`` must
            provide ``'data_dir'`` (a CSV path, read as ``../<data_dir>``),
            ``'continous_cols'`` and ``'discret_cols'``.
        output_path: destination of the summary CSV. The default is the path
            this function has always written to.

    Returns:
        dict with the keys ``"size"``, ``"# of Cont"`` and ``"# of Cat"``; each
        maps ``data_name`` to the row count, the number of continuous columns
        and the number of discrete columns respectively.
    """
    import os  # local import so the function does not depend on a star import

    r = {"size": {}, "# of Cont": {}, "# of Cat": {}}
    for data_name, config in configs:
        data = pd.read_csv(f"../{config['data_dir']}")
        r['size'][data_name] = len(data)
        r['# of Cont'][data_name] = len(config['continous_cols'])
        r['# of Cat'][data_name] = len(config['discret_cols'])

    # Make sure the destination directory exists; otherwise a missing results/
    # folder makes to_csv fail only after every dataset has been read.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame.from_dict(r).to_csv(output_path)
    return r

In [None]:
# Summarise all configured datasets; as a side effect this also (re)writes
# ../results/data_describe.csv.
describe(configs)

{'size': {'adult': 32561,
  'home': 10459,
  'student': 32593,
  'breast_cancer': 569,
  'credit_card': 30000,
  'german_credit': 1000,
  'heart': 299,
  'student_performance': 649,
  'titanic': 891},
 '# of Cont': {'adult': 2,
  'home': 21,
  'student': 23,
  'breast_cancer': 30,
  'credit_card': 20,
  'german_credit': 7,
  'heart': 7,
  'student_performance': 2,
  'titanic': 2},
 '# of Cat': {'adult': 6,
  'home': 2,
  'student': 8,
  'breast_cancer': 0,
  'credit_card': 3,
  'german_credit': 13,
  'heart': 5,
  'student_performance': 14,
  'titanic': 24}}

# PyTorch Utils

In [None]:
# export

class NumpyDataset(TensorDataset):
    """``TensorDataset`` assembled from array inputs that expose ``.shape``.

    Every kept input is converted with ``torch.tensor(arr).float()``. Inputs
    whose last dimension is 0 are dropped, so a caller may pass an empty
    placeholder for a missing continuous or discrete block. ``features`` and
    ``target`` treat the last kept tensor as the target.
    """

    def __init__(self, *arrs, ):
        super().__init__()
        kept = []
        for arr in arrs:
            # small patch: skip continuous or discrete array without content
            if arr.shape[-1] == 0:
                continue
            kept.append(torch.tensor(arr).float())
        self.tensors = kept
        # every kept tensor must have the same number of samples (dim 0)
        for tensor in kept:
            assert kept[0].size(0) == tensor.size(0)

    def data_loader(self, batch_size=128, shuffle=True, num_workers=4):
        """Return a ``DataLoader`` over this dataset with the given settings."""
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    def features(self, test=False):
        """Return the feature tensors as a tuple.

        With ``test=True`` all tensors are returned; otherwise the last tensor
        (the target) is left out.
        """
        if test:
            return tuple(self.tensors)
        return tuple(self.tensors[:-1])

    def target(self, test=False):
        """Return the last tensor, or ``None`` when ``test=True``."""
        if test:
            return None
        return self.tensors[-1]

class PandasDataset(NumpyDataset):
    """``NumpyDataset`` built from a DataFrame whose last column is the target.

    All columns but the last are passed on as one feature array; the last
    column is passed on as the target array.
    """

    def __init__(self, df: pd.DataFrame):
        feature_names, target_name = df.columns[:-1], df.columns[-1]
        super().__init__(
            df[feature_names].to_numpy(),
            df[target_name].to_numpy(),
        )

In [None]:
# Sanity check: a dataset built from raw arrays and one built from the
# equivalent DataFrame should expose the same samples.
# NOTE(review): the inputs are drawn without a seed, so the concrete values
# change on every run.
x = np.random.normal(50, 15, 100)
y = np.random.normal(50, 15, 100)
df_test = pd.DataFrame({'x': x, 'y': y})
arrs = np.column_stack((x, y))
np_dataset = NumpyDataset(x, y)
pd_dataset = PandasDataset(df_test)

# The stacked arrays and the DataFrame must hold identical values.
assert (arrs == df_test.to_numpy()).all()
assert len(np_dataset) == len(pd_dataset)

# NOTE(review): np_dataset receives x as a 1-D array while pd_dataset passes a
# single-column 2-D array, so the feature tensors differ in shape. The tuple
# comparison presumably still passes because each element comparison yields a
# one-element tensor — confirm before reusing this check with more columns.
for i in range(len(np_dataset)):
    assert np_dataset[i] == pd_dataset[i]


In [None]:
# Same comparison as the first assert in the previous cell, shown as a cell output.
(np.column_stack((x, y)) == df_test.to_numpy()).all()

True