In [1]:
# Parameters
# Feature columns handed to `joined_encoder`, each with the encoding to apply.
x_encoding = [
    dict(encoding="ordinal", column="prev_contribution"),
    dict(encoding="ordinal", column="prev_punishment"),
]
# Target encoding; the training cell handles "ordinal", "onehot" and "numeric".
y_encoding = "ordinal"
# Number of categorical levels (np.arange(n)) for contributions / punishments.
n_contributions = 21
n_punishments = 31
# Forwarded to `get_cross_validations`.
n_cross_val = 2
# Forwarded to `get_fraction_of_groups` for the training part of each split.
fraction_training = 1.0
# Input CSV and base directory for the saved parquet files.
data = "../data/pilot1_player_round_slim.csv"
output_path = "../data/dev"
# Extra labels passed to `add_labels` for every table that is saved.
labels = dict()
# Keyword arguments for MultiLayer, the Adam optimizer and the training loop.
model_args = dict(n_layers=2, hidden_size=40)
optimizer_args = dict(lr=1e-4, weight_decay=1e-5)
train_args = dict(epochs=30, batch_size=40, clamp_grad=1, eval_period=10)
# Device string given to `th.device`.
device = "cpu"


In [2]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import torch as th
from aimanager.model.cross_validation import split_xy, get_cross_validations, get_fraction_of_groups
from aimanager.model.encoder import ordinal_to_int, onehot_to_int, joined_encoder, int_encode
from aimanager.model.metrics import create_metrics, create_confusion_matrix
from aimanager.model.synthesize_data import syn_con_pun
from aimanager.utils.array_to_df import add_labels, using_multiindex
from aimanager.utils.utils import make_dir
from aimanager.model.neural.mlp import MultiLayer

# Results are saved below a 'data' sub-directory of the configured output path.
# NOTE(review): this rebinds `output_path`, so re-running this cell without first
# re-running the parameters cell appends another 'data' component each time.
output_path = os.path.join(output_path, 'data')

In [3]:
def predict(model, x_enc, n_contributions=None):
    """Run `model` on encoded inputs and decode the logits into integer predictions.

    Args:
        model: Callable returning logits for `x_enc`. It must carry a
            `y_encoding` attribute equal to 'ordinal', 'onehot' or 'numeric'.
        x_enc: Encoded inputs, passed to `model` unchanged.
        n_contributions: Number of contribution levels; only used to rescale the
            'numeric' output. When None, `model.n_contributions` is used if the
            model has that attribute, otherwise 21 (the previously hard-coded
            value).

    Returns:
        Tuple `(y_pred, y_pred_proba)`:
          * 'ordinal': `y_pred` is `ordinal_to_int` of the sigmoid outputs;
            `y_pred_proba` is those outputs with a leading column of ones.
          * 'onehot': `y_pred_proba` is the softmax over the last axis and
            `y_pred` is `onehot_to_int` of it.
          * 'numeric': `y_pred` is an int64 array limited to
            `0 .. n_contributions - 1`; `y_pred_proba` is None.

    Raises:
        ValueError: If `model.y_encoding` is not one of the supported encodings.
    """
    y_pred_logit = model(x_enc)
    if model.y_encoding == 'ordinal':
        y_pred_proba = th.sigmoid(y_pred_logit).detach().cpu().numpy()
        y_pred = ordinal_to_int(y_pred_proba)
        # Prepend a column of ones, giving one more probability column than the
        # model has outputs.
        y_pred_proba = np.concatenate(
            [np.ones_like(y_pred_proba[:, [0]]), y_pred_proba], axis=1)
    elif model.y_encoding == 'onehot':
        y_pred_proba = th.nn.functional.softmax(y_pred_logit, dim=-1).detach().cpu().numpy()
        y_pred = onehot_to_int(y_pred_proba)
    elif model.y_encoding == 'numeric':
        if n_contributions is None:
            n_contributions = getattr(model, 'n_contributions', 21)
        y_pred = th.sigmoid(y_pred_logit).detach().cpu().numpy()
        y_pred = np.around(y_pred * n_contributions, decimals=0).astype(np.int64)
        # sigmoid * n can round up to n, but the contribution levels are
        # np.arange(n), i.e. 0 .. n-1; keep predictions inside that range.
        y_pred = np.clip(y_pred, 0, n_contributions - 1)
        y_pred_proba = None
    else:
        raise ValueError(f'Unkown y encoding {model.y_encoding}')
    return y_pred, y_pred_proba

In [4]:
# Read the CSV at `data` and turn both target columns into ordered categoricals
# whose levels are the integers produced by np.arange(n_...).
df = pd.read_csv(data).assign(
    contribution=lambda frame: pd.Categorical(
        frame['contribution'], categories=np.arange(n_contributions), ordered=True),
    punishment=lambda frame: pd.Categorical(
        frame['punishment'], categories=np.arange(n_punishments), ordered=True),
)

In [5]:
class Evaluator:
    """Collects metrics, confusion matrices and synthetic predictions during training.

    Typical use: `set_data` once per cross-validation split, `set_labels` before
    each evaluation, then `add_loss` / `eval_set` / `eval_sync`, and finally
    `save` to write everything collected so far to parquet files.
    """

    def __init__(self):
        # Accumulated records; each list is turned into one DataFrame by `save`.
        self.metrics = []
        self.confusion_matrix = []
        self.synthetic_predicitions = []
        # Initialised here so methods fail clearly (or work) before the setters
        # have been called, instead of raising AttributeError.
        self.data = None
        self.labels = {}

    def set_data(self, data):
        """Store the data dict; the eval methods read its 'x_df', 'x_enc' and 'y_sr' entries."""
        self.data = data

    def set_labels(self, **labels):
        """Replace the labels that are attached to subsequently recorded results."""
        self.labels = labels

    def eval_set(self, model, set_name):
        """Predict on the set `set_name` and record metrics and confusion matrix."""
        model.eval()
        y_pred, _ = predict(model, self.data['x_enc'][set_name])
        y_true = self.data['y_sr'][set_name]
        self.metrics += create_metrics(y_true, y_pred, set=set_name, **self.labels)
        self.confusion_matrix += create_confusion_matrix(
            y_true, y_pred, set=set_name, **self.labels)

    def eval_sync(self, model):
        """Predict on the synthetic inputs ('syn') and record one row per prediction.

        The rows are labelled with the `cv_split` given to `set_labels`.

        Raises:
            ValueError: If no `cv_split` label has been set.
        """
        # The split index comes from the labels rather than from a notebook
        # global, so the method does not depend on the caller's loop variable.
        if 'cv_split' not in self.labels:
            raise ValueError('set_labels(cv_split=...) must be called before eval_sync')
        model.eval()
        pred_df = self.data['x_df']['syn'].copy()
        y_pred, y_pred_proba = predict(model, self.data['x_enc']['syn'])
        if y_pred_proba is not None:
            pred_df['contribution_pred'] = y_pred
            # presumably one row per (sample_idx, contribution) pair - confirm
            # against `using_multiindex`
            proba_df = using_multiindex(
                y_pred_proba, ['sample_idx', 'contribution']).rename(columns={'value': 'proba'})
            pred_df = pred_df.merge(proba_df)
            # Flag the rows whose contribution equals the predicted one.
            pred_df['predicted'] = pred_df['contribution_pred'] == pred_df['contribution']
            pred_df = pred_df.drop(columns=['contribution_pred'])
        else:
            # No probabilities available: keep only the point prediction.
            pred_df['contribution'] = y_pred
            pred_df['predicted'] = True
        # NOTE(review): the set label is 'train' although the inputs are the
        # synthetic grid; left unchanged because saved results may be filtered on it.
        pred_df = add_labels(pred_df, {'set': 'train', 'cv_split': self.labels['cv_split']})
        self.synthetic_predicitions += pred_df.to_dict('records')

    def add_loss(self, loss):
        """Record `loss` as a metric named 'loss' with the current labels."""
        self.metrics.append(dict(name='loss', value=loss, **self.labels))

    def save(self, output_path, labels):
        """Write all collected records as parquet files into `output_path`."""
        make_dir(output_path)
        self._save_metric(self.metrics, 'metrics.parquet', output_path, labels)
        self._save_metric(self.confusion_matrix, 'confusion_matrix.parquet', output_path, labels)
        self._save_metric(self.synthetic_predicitions, 'synthetic_predicitions.parquet', output_path, labels)

    @staticmethod
    def _save_metric(rec, filename, output_path, labels):
        """Build a DataFrame from `rec`, add `labels`, and write it to `output_path/filename`."""
        df = pd.DataFrame(rec)
        df = add_labels(df, labels)
        df.to_parquet(os.path.join(output_path, filename))

In [6]:
def _output_size_for(encoding, n_levels):
    """Return the number of model outputs required for target `encoding`."""
    if encoding == 'ordinal':
        return n_levels - 1
    if encoding == 'onehot':
        return n_levels
    if encoding == 'numeric':
        return 1
    raise ValueError(f'Unkown y encoding {encoding}')


def _loss_for(encoding, n_levels):
    """Return the training loss callable `(logits, target) -> scalar` for `encoding`."""
    if encoding == 'ordinal':
        return th.nn.BCEWithLogitsLoss()
    if encoding == 'onehot':
        return th.nn.CrossEntropyLoss()
    if encoding == 'numeric':
        mse = th.nn.MSELoss()

        def _scaled_mse(yhat, y):
            # Squash the single output into (0, n_levels) before comparing it
            # with the target.
            return mse(th.sigmoid(yhat)[:, 0] * n_levels, y)

        return _scaled_mse
    raise ValueError(f'Unkown y encoding {encoding}')


th_device = th.device(device)
ev = Evaluator()

# These only depend on the parameters, so they are resolved once, before any
# training starts; an unsupported encoding fails here.
output_size = _output_size_for(y_encoding, n_contributions)
loss_fn = _loss_for(y_encoding, n_contributions)
batch_size = train_args['batch_size']
clamp_grad = train_args['clamp_grad']

x_df, y_sr = split_xy(df)
# `i` is deliberately kept as the loop variable name: code outside this cell
# may read it as a global.
for i, split in enumerate(get_cross_validations(x_df, y_sr, n_cross_val)):
    x_train_df, y_train_sr, x_test_df, y_test_sr = split
    x_train_df, y_train_sr = get_fraction_of_groups(x_train_df, y_train_sr, fraction_training)
    x_syn_df = syn_con_pun(n_contributions, n_punishments)

    # Named `split_data` so the `data` parameter (the CSV path) is not
    # overwritten and the loading cell can be re-run afterwards.
    split_data = {
        'x_df': {'train': x_train_df, 'syn': x_syn_df, 'test': x_test_df},
        'y_sr': {'train': y_train_sr, 'test': y_test_sr}}
    split_data['x_enc'] = {
        k: th.tensor(joined_encoder(x, x_encoding), dtype=th.float, device=th_device)
        for k, x in split_data['x_df'].items()
    }
    split_data['y_enc'] = {
        k: th.tensor(
            int_encode(y, encoding=y_encoding, add_axis=False),
            dtype=th.float, device=th_device)
        for k, y in split_data['y_sr'].items()
    }
    ev.set_data(split_data)

    # A fresh model and optimizer for every cross-validation split.
    model = MultiLayer(
        input_size=split_data['x_enc']['train'].shape[1],
        output_size=output_size, **model_args).to(th_device)
    # Decoding information is carried on the model itself.
    model.y_encoding = y_encoding
    model.n_contributions = n_contributions
    optimizer = th.optim.Adam(model.parameters(), **optimizer_args)

    # Running loss since the last evaluation.
    sum_loss = 0
    n_steps = 0

    for e in range(train_args['epochs']):
        ev.set_labels(cv_split=i, epoch=e)
        model.train()
        # Mini-batches are taken in storage order (no shuffling).
        for start_idx in range(0, len(split_data['x_enc']['train']), batch_size):
            tx = split_data['x_enc']['train'][start_idx:start_idx+batch_size]
            ty = split_data['y_enc']['train'][start_idx:start_idx+batch_size]

            optimizer.zero_grad()

            py = model(tx)
            loss = loss_fn(py, ty)
            loss.backward()

            if clamp_grad:
                # Element-wise clamp of every gradient to [-clamp_grad, clamp_grad];
                # parameters without a gradient are skipped.
                th.nn.utils.clip_grad_value_(model.parameters(), clamp_grad)
            optimizer.step()
            sum_loss += loss.item()
            n_steps += 1

        if e % train_args['eval_period'] == 0:
            avg_loss = sum_loss/n_steps
            print(f'CV {i} | Epoch {e} | Loss {avg_loss}')
            ev.add_loss(avg_loss)
            # Evaluation needs no gradients.
            with th.no_grad():
                ev.eval_set(model, 'train')
                ev.eval_set(model, 'test')
            sum_loss = 0
            n_steps = 0

    with th.no_grad():
        ev.eval_sync(model)

ev.save(output_path, labels)

  Variable._execution_engine.run_backward(


CV 0 | Epoch 0 | Loss 0.6886756139643052
CV 0 | Epoch 10 | Loss 0.5954662143307574
CV 0 | Epoch 20 | Loss 0.46376721942249466
CV 1 | Epoch 0 | Loss 0.6944719307562884
CV 1 | Epoch 10 | Loss 0.5917453217155794
CV 1 | Epoch 20 | Loss 0.45243263588670424
