In [None]:
# Parameters
x_encoding = [
    {"ordinal": True, "column": "prev_contribution"},
    {"ordinal": False, "column": "prev_punishment"},
    {
        "etype": "interaction",
        "a": {"ordinal": True, "column": "prev_contribution"},
        "b": {"ordinal": False, "column": "prev_punishment"},
    },
]
y_encoding = {"ordinal": False, "column": "contribution"}
model_config = {"max_iter": 10000, "C": 1.0}
n_contributions = 21
n_punishments = 31
n_cross_val = 10
fraction_training = 0.1
data = "../../data/pilot1_player_round_slim.csv"
output_path = "../../data/dev"
labels = {}


In [2]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from aimanager.artificial_humans.cross_validation import split_xy, get_cross_validations, get_fraction_of_groups
from aimanager.generic.encoder import int_to_ordinal, ordinal_to_int, joined_encoder, int_encode
from aimanager.artificial_humans.metrics import create_metrics, create_confusion_matrix
from aimanager.artificial_humans.synthesize_data import syn_con_pun
from aimanager.utils.array_to_df import add_labels, using_multiindex
from aimanager.utils.utils import make_dir

# Write all result files into a 'data' sub-folder of the configured output path.
# NOTE(review): this rebinds `output_path` itself, so re-running this cell in
# the same kernel appends one more 'data' component each time; restart the
# kernel (or re-run the parameters cell first) before re-executing.
output_path = os.path.join(output_path, 'data')

In [3]:
def predict(model, x_enc, ordinal_y, n_levels):
    """Run ``model`` on ``x_enc`` and return its predictions in three forms.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``
            (and ``classes_`` when ``ordinal_y`` is false).
        x_enc: Encoded feature matrix, passed to the model unchanged.
        ordinal_y: True if the model was fitted on the ordinal encoding of
            the target, False if it was fitted on integer class labels.
        n_levels: Number of target levels; only used when ``ordinal_y`` is
            false.

    Returns:
        Tuple ``(y_pred, y_pred_ordinal, y_pred_proba)``.

        * ``ordinal_y`` true: ``y_pred_ordinal`` is the raw model prediction,
          ``y_pred`` is that prediction converted with ``ordinal_to_int`` and
          ``y_pred_proba`` is the model's per-output probabilities stacked
          along axis 1 (assumes ``predict_proba`` returns one array per
          output, as a multi-output classifier does).
        * ``ordinal_y`` false: ``y_pred`` is the raw model prediction,
          ``y_pred_ordinal`` is it converted with ``int_to_ordinal`` and
          ``y_pred_proba`` has ``n_levels`` columns, with the columns of
          classes absent from ``model.classes_`` left at zero.
    """
    raw_pred = model.predict(x_enc)
    raw_proba = model.predict_proba(x_enc)

    if not ordinal_y:
        as_ordinal = int_to_ordinal(raw_pred, n_levels=n_levels)
        # The model only reports probabilities for the classes it saw during
        # fitting; scatter them into a dense matrix over all levels.
        full_proba = np.zeros((len(raw_proba), n_levels))
        full_proba[:, model.classes_] = raw_proba
        return raw_pred, as_ordinal, full_proba

    stacked_proba = np.stack(raw_proba, axis=1)
    as_int = ordinal_to_int(raw_pred)
    return as_int, raw_pred, stacked_proba

In [4]:
# Whether the target is modelled through its ordinal encoding.
ordinal_y = y_encoding['ordinal']

# Load the input table and turn both action columns into ordered categoricals
# over their full range of levels (0..n-1), including levels that do not occur
# in the data.
df = pd.read_csv(data)
df = df.assign(
    contribution=pd.Categorical(
        df['contribution'], categories=np.arange(n_contributions), ordered=True
    ),
    punishment=pd.Categorical(
        df['punishment'], categories=np.arange(n_punishments), ordered=True
    ),
)

# Result accumulators; the cross-validation loop extends each of them once or
# more per split.
metrics, confusion_matrix, syn_pred = [], [], []

# Fit and evaluate one model per cross-validation split. For every split the
# loop appends test/train metrics, confusion-matrix entries and the predicted
# probabilities on a synthetic input grid to the accumulators defined above.
x_df, y_sr = split_xy(df)
for i, split in enumerate(get_cross_validations(x_df, y_sr, n_cross_val)):
    x_train_df, y_train_sr, x_test_df, y_test_sr = split
    # Subsample the training part of the split (fraction_training); the test
    # part is left untouched.
    x_train_df, y_train_sr = get_fraction_of_groups(x_train_df, y_train_sr, fraction_training)
    # Targets in both encodings.
    # NOTE(review): y_test_ord, y_test and y (below) are not read anywhere in
    # the visible cells — confirm whether they are still needed.
    y_test_ord = int_encode(y_test_sr, ordinal=True)
    y_train_ord = int_encode(y_train_sr, ordinal=True)
    y_test = int_encode(y_test_sr, ordinal=False)[:,0]
    y_train = int_encode(y_train_sr, ordinal=False)[:,0]

    y = y_train_ord if ordinal_y else y_train

    x_train_enc = joined_encoder(x_train_df, x_encoding)
    x_test_enc = joined_encoder(x_test_df, x_encoding)

    if ordinal_y:
        # Ordinal target: one scaled logistic regression per target column.
        pipe = Pipeline([('scaler', StandardScaler()), ('log_reg', LogisticRegression(**model_config))])
        model = MultiOutputClassifier(pipe)
        model.fit(x_train_enc, y_train_ord)
    else:
        # Integer target: a single scaled multi-class logistic regression.
        model = Pipeline([('scaler', StandardScaler()), ('log_reg', LogisticRegression(**model_config))])

        y_train = pd.Categorical(
            y_train, categories=np.arange(n_contributions), ordered=True
        )

        model.fit(x_train_enc, y_train)

    # test set performance
    y_pred, y_pred_ordinal, y_pred_proba = predict(model, x_test_enc, ordinal_y, n_levels=n_contributions)
    metrics += create_metrics(y_test_sr, y_pred, set='test', cv_split=i)
    confusion_matrix += create_confusion_matrix(y_test_sr, y_pred, set='test', cv_split=i)

    # training set performance
    y_pred, y_pred_ordinal, y_pred_proba = predict(model, x_train_enc, ordinal_y, n_levels=n_contributions)
    metrics += create_metrics(y_train_sr, y_pred, set='train', cv_split=i)
    confusion_matrix += create_confusion_matrix(y_train_sr, y_pred, set='train', cv_split=i)

    # eval synthesized data
    x_syn_df = syn_con_pun(n_contributions, n_punishments)
    x_syn = joined_encoder(x_syn_df, x_encoding)
    y_pred, y_pred_ordinal, y_pred_proba = predict(model, x_syn, ordinal_y, n_levels=n_contributions)
    if ordinal_y:
        # Keep only the probability at index 1 of each binary output and
        # prepend a column of ones, giving one column per contribution level.
        y_pred_proba = np.concatenate([np.ones_like(y_pred_proba[:,[0],1]), y_pred_proba[:,:,1]], axis=1)
    # Long format: one row per (sample_idx, contribution) with its probability.
    proba_df = using_multiindex(y_pred_proba, ['sample_idx', 'contribution']).rename(columns={'value': 'proba'})
    x_syn_df['contribution_pred'] = y_pred
    # No `on=` given, so the merge joins on the columns both frames share.
    proba_df =  x_syn_df.merge(proba_df)
    # Flag the row whose contribution level equals the predicted one.
    proba_df['predicted'] = proba_df['contribution_pred'] == proba_df['contribution']
    proba_df = proba_df.drop(columns = ['contribution_pred'])
    # NOTE(review): the synthetic predictions are labelled set='train' —
    # confirm this is the intended label.
    proba_df = add_labels(proba_df, {'set': 'train', 'cv_split': i})
    syn_pred += proba_df.to_dict('records')

In [5]:
# Prepare the output folder (make_dir is expected to create it if missing).
make_dir(output_path)

# Each accumulator becomes a table, gets the run-level `labels` attached and is
# written to its own parquet file inside `output_path`.
metrics_df = add_labels(pd.DataFrame(metrics), labels)
metrics_df.to_parquet(os.path.join(output_path, 'metrics.parquet'))

confusion_matrix_df = add_labels(pd.DataFrame(confusion_matrix), labels)
confusion_matrix_df.to_parquet(os.path.join(output_path, 'confusion_matrix.parquet'))

# NOTE(review): the file name spelling ('predicitions') is kept as-is because
# consumers of this file may depend on it.
syn_pred_df = add_labels(pd.DataFrame(syn_pred), labels)
syn_pred_df.to_parquet(os.path.join(output_path, 'synthetic_predicitions.parquet'))