In [1]:
# https://www.kaggle.com/c/titanic/data

#!kaggle competitions download -c titanic

In [2]:
import os
import math
import logging

import numpy as np
import pandas as pd

from model_wrangler.model.losses import accuracy

from model_wrangler.model_wrangler import ModelWrangler
from model_wrangler.model_wrangler import LOGGER as MW_LOGGER
from model_wrangler.dataset_managers import BalancedDatasetManager
from model_wrangler.dataset_managers import LOGGER as DM_LOGGER

from model_wrangler.model.corral.debiased_classifier import DebiasedClassifier

  from ._conv import register_converters as _register_converters


In [3]:
DM_LOGGER.setLevel(logging.WARN)
MW_LOGGER.setLevel(logging.WARN)

In [4]:
DATA_DIR = os.path.expanduser(
    os.path.join('~', '.kaggle', 'competitions', 'titanic')
)

_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
_df.Age = _df.Age.fillna(value=_df.Age.dropna().mean())
df_data_train = _df.loc[(_df.PassengerId % 5) != 0]
df_data_holdout = _df.loc[(_df.PassengerId % 5) == 0]

In [5]:
feature_names = [
    'is_female',
    'age',
    'is_class1',
    'is_class2',
#    'is_class3',
#    'fare',
]

def df_to_numeric(df_in):
    """Function takes the dataframe of titanic data and
    turns it into a numeric array of engineered features
    """
    
    feats = np.array([
        (df_in.Sex == 'female').astype(float).tolist(),
        df_in.Age.astype(float).tolist(),
        (df_in.Pclass == 1).astype(float).tolist(),
        (df_in.Pclass == 2).astype(float).tolist(),
#        (df_in.Pclass == 3).astype(float).tolist(),
#        df_in.Fare.astype(float).tolist(),
    ]).T
    
    dv = df_in['Survived'].astype(float).tolist()
    dv = np.array(dv).reshape(-1, 1)

    group_idx = (df_in.Sex == 'female').astype(int).tolist()
    group_idx = np.array(group_idx).reshape(-1, 1)

    return [feats], [dv], [group_idx]

In [9]:
hidden_layer_config = [
    {
        'num_units': 12,
        'bias': True,
        'activation': 'tanh',
        'activity_reg': {'l1': 0.1},
        'dropout_rate': 0.0,
    },
]

embed_config = {
    'num_units': 4,
    'bias': True,
}

tb_scalars = {
    'scalars': ['embed_l1', 'embed_mean']
}

MODEL_PARAMS_BASE = {
    'name': 'titanic_example',
    'path': './titanic_example',
    'graph': {
        'in_sizes': [len(feature_names), 1],
        'hidden_params': hidden_layer_config,
        'embed_params': embed_config,
        'out_sizes': [1], 
        'debias_weight': None
    },
    'tensorboard': tb_scalars
}


MODEL_PARAMS_DEBIAS = {
    'name': 'titanic_debiased_example',
    'path': './titanic_debiased_example',
    'graph': {
        'in_sizes': [len(feature_names), 1],
        'hidden_params': hidden_layer_config,
        'embed_params': embed_config,
        'out_sizes': [1],
        'debias_weight': 1e-6
    },
    'tensorboard': tb_scalars
}

TRAIN_PARAMS = {
    'num_epochs': 50,
    'batch_size': 32
}

In [10]:
def get_data_managers(X_tr, Y_tr, X_ho, Y_ho, pos_classes=[[1.0]]):
    data_train = BalancedDatasetManager(X_tr, Y_tr)
    data_holdout = BalancedDatasetManager(X_ho, Y_ho)

    data_train.set_positive_class([pos_classes])
    data_holdout.set_positive_class([pos_classes])
    return data_train, data_holdout

def train_model(model_class, model_params, train_params, X_tr, Y_tr, X_ho, Y_ho):

    dm_train, dm_holdout = get_data_managers(X_tr, Y_tr, X_ho, Y_ho)

    model = ModelWrangler(model_class, model_params)
    model.add_train_params(train_params)
    model.add_data(dm_train, dm_holdout)

    pre_accy = 100 * model.score(X_ho, Y_ho, score_func=accuracy)
    pre_loss = model.score(X_ho, Y_ho)

    model.train()

    post_accy = 100 * model.score(X_ho, Y_ho, score_func=accuracy)
    post_loss =  model.score(X_ho, Y_ho)
    
    print("Pre-training")
    print("\tacc'y: {:.1f}%".format(pre_accy))
    print("\tloss: {:.4f}".format(pre_loss))

    print("Post-training")
    print("\tacc'y: {:.1f}%".format(post_accy))
    print("\tloss: {:.4f}".format(post_loss))

    return model

In [11]:
# Turn dataframe into numpy numerics
train_X, train_Y, train_groups = df_to_numeric(df_data_train)
ho_X, ho_Y, ho_groups = df_to_numeric(df_data_holdout)

# Set and train models
print('Baseline model')
orig_model = train_model(
    DebiasedClassifier, MODEL_PARAMS_BASE, TRAIN_PARAMS,
    train_X + train_groups, train_Y,
    ho_X + ho_groups, ho_Y
)

print('\n"De-Biased" model')
debias_model = train_model(
    DebiasedClassifier, MODEL_PARAMS_DEBIAS, TRAIN_PARAMS,
    train_X + train_groups, train_Y,
    ho_X + ho_groups, ho_Y
)

Baseline model
Pre-training
	acc'y: 37.6%
	loss: 1.0332
Post-training
	acc'y: 37.6%
	loss: 1.5882

"De-Biased" model
Pre-training
	acc'y: 38.8%
	loss: 0.7728
Post-training
	acc'y: 60.7%
	loss: 0.6905


In [13]:
def get_feature_importances(model, X, Y):
    feat_imp = model.feature_importance(X, Y, input_idxs=[0]).ravel()
    feat_imp /= feat_imp.mean()
    return feat_imp

df_feat_imp = pd.DataFrame({
    'feature': feature_names,
    'biased importance': get_feature_importances(orig_model, ho_X + ho_groups, ho_Y),
    'debiased importance': get_feature_importances(debias_model, ho_X + ho_groups, ho_Y)
}).set_index('feature')

df_feat_imp

Unnamed: 0_level_0,biased importance,debiased importance
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
is_female,1.770287,1.585143
age,0.103247,0.28112
is_class1,0.502262,1.143448
is_class2,1.624204,0.990289


In [35]:
def sigmoid(x):
  return 1 / (1 + math.exp(-x))

def get_error_summary(model, inputs, outputs, groups):

    preds = model.get_from_model(
        'preact_0/BiasAdd',
        model.make_data_dict(inputs, outputs, is_training=False)
    )
    
    bias = (outputs[0] - sigmoid(preds[0])).ravel()
    error = bias ** 2

    df_err = pd.DataFrame({
        'err': error,
        'bias': bias,
        'groups': groups[0].ravel()
    })
    
    return df_err.groupby('groups').mean()

In [36]:
orig_errors_by_group = get_error_summary(
    orig_model,
    ho_X + ho_groups,
    ho_Y,
    ho_groups
)

debiased_errors_by_group = get_error_summary(
    debias_model,
    ho_X + ho_groups,
    ho_Y,
    ho_groups
)

print('Original bias-spread across groups: {:.2f}'.format(orig_errors_by_group['bias'].std()))
print('De-biased bias-spread across groups: {:.2f}'.format(debiased_errors_by_group['bias'].std()))

Original bias-spread across groups: 0.32
De-biased bias-spread across groups: 0.32


In [37]:
orig_errors_by_group = get_error_summary(
    orig_model,
    train_X + train_groups,
    train_Y,
    train_groups
)

debiased_errors_by_group = get_error_summary(
    debias_model,
    train_X + train_groups,
    train_Y,
    train_groups
)

print('Original bias-spread across groups: {:.2f}'.format(orig_errors_by_group['bias'].std()))
print('De-biased bias-spread across groups: {:.2f}'.format(debiased_errors_by_group['bias'].std()))

Original bias-spread across groups: 0.41
De-biased bias-spread across groups: 0.41


In [18]:
orig_errors_by_group

Unnamed: 0_level_0,bias,err
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.28423,0.229048
1,0.293772,0.269202


In [19]:
debiased_errors_by_group

Unnamed: 0_level_0,bias,err
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.281269,0.227373
1,0.296732,0.27095
