In [95]:
# Parameters

# --- feature / target encoding -------------------------------------------
# One encoder spec per input column; both previous-round columns use the
# "ordinal" encoding.
x_encoding = [
    {"encoding": "ordinal", "column": column}
    for column in ("prev_contribution", "prev_punishment")
]
y_encoding = "numeric"

# --- problem size ----------------------------------------------------------
n_contributions = 21  # number of categories of the 'contribution' column
n_punishments = 31    # number of categories of the 'punishment' column

# --- cross validation --------------------------------------------------------
n_cross_val = 10
fraction_training = 1.0

# --- input / output ----------------------------------------------------------
data = "../data/pilot1_player_round_slim.csv"
output_path = "../data/dev"
labels = {}  # extra labels attached to every output table

# --- model / optimiser / training loop ---------------------------------------
model_args = {
    "n_layers": 1,
    "hidden_size": None,
}
optimizer_args = {
    "lr": 0.0001,
    "weight_decay": 1e-05,
}
train_args = {
    "epochs": 100,
    "batch_size": 40,
    "clamp_grad": None,
    "eval_period": 10,
}


In [96]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from aimanager.model.cross_validation import split_xy, get_cross_validations, get_fraction_of_groups
from aimanager.model.encoder import int_to_ordinal, ordinal_to_int, onehot_to_int, int_to_onehot, joined_encoder, int_encode
from aimanager.model.metrics import create_metrics, create_confusion_matrix
from aimanager.model.synthesize_data import syn_con_pun
from aimanager.utils.array_to_df import add_labels, using_multiindex
from aimanager.utils.utils import make_dir

# Write all results into a 'data' sub-directory of the configured output path.
# NOTE(review): output_path is rebound from its own value, so re-running this
# cell without re-running the parameters cell appends another 'data' segment.
output_path = os.path.join(output_path, 'data')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
import torch as th
from typing import Literal, Optional


class FeedForwardLayer(th.nn.Module):
    """A linear map optionally followed by dropout and an activation.

    The forward pass applies, in this order: the linear layer, dropout (if
    configured) and the activation (if configured).

    Args:
        input_size: Number of input features of the linear layer.
        hidden_size: Number of output features of the linear layer.
        dropout: Dropout probability; ``None`` or ``0`` disables dropout.
        activation: ``'relu'``, ``'logit'``, ``'softmax'`` or ``None`` for no
            activation. ``'softmax'`` normalises over the last dimension.

    Raises:
        ValueError: If ``activation`` is a name that is not supported.
    """

    def __init__(
            self, *,
            input_size: int,
            hidden_size: int,
            dropout: Optional[float],
            activation: Optional[Literal['relu', 'logit', 'softmax']]):
        super(FeedForwardLayer, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lin = th.nn.Linear(self.input_size, self.hidden_size)

        if not activation:
            self.activation = None
        elif activation == 'relu':
            self.activation = th.nn.ReLU()
        elif activation == 'logit':
            # torch.nn has no ``Logit`` module, so this branch used to raise an
            # AttributeError. NOTE(review): mapped to the logistic sigmoid,
            # which is presumably what 'logit' was meant to select - confirm.
            self.activation = th.nn.Sigmoid()
        elif activation == 'softmax':
            # An explicit dim avoids the deprecated implicit-dimension choice.
            self.activation = th.nn.Softmax(dim=-1)
        else:
            raise ValueError(
                f"Unknown activation {activation!r}; "
                "expected 'relu', 'logit', 'softmax' or None.")

        self.dropout = th.nn.Dropout(dropout) if dropout else None

    def forward(self, x):
        """Apply linear map, then dropout and activation when configured."""
        x = self.lin(x)
        if self.dropout is not None:
            x = self.dropout(x)
        if self.activation is not None:
            x = self.activation(x)
        return x



class MultiLayer(th.nn.Module):
    """Feed-forward network made of ``n_layers`` stacked ``FeedForwardLayer`` blocks.

    Every layer except the last maps to ``hidden_size`` features and uses a
    ReLU activation. The last layer maps to ``output_size`` features and has
    no activation, so the network returns unnormalised scores. ``dropout`` is
    passed unchanged to every layer, including the last one.

    Args:
        n_layers: Number of layers; must be at least 1.
        hidden_size: Width of the intermediate layers. May be ``None`` only
            when ``n_layers == 1`` (there is no intermediate layer then).
        input_size: Number of input features.
        output_size: Number of output features.
        dropout: Dropout probability forwarded to each layer, or ``None``.

    Raises:
        ValueError: If ``n_layers < 1`` or if ``hidden_size`` is ``None``
            while more than one layer is requested.
    """

    def __init__(self, *,
            n_layers: int,
            hidden_size: Optional[int]=None,
            input_size: int,
            output_size: int,
            dropout: Optional[float]=None):
        super(MultiLayer, self).__init__()

        # Zero layers would silently build an identity network that ignores
        # output_size, so reject it explicitly.
        if n_layers < 1:
            raise ValueError(f"n_layers must be >= 1, got {n_layers}.")
        if hidden_size is None and n_layers > 1:
            raise ValueError("hidden_size is required when n_layers > 1.")

        last = n_layers - 1
        self.layers = th.nn.Sequential(
            *(FeedForwardLayer(
                input_size=hidden_size if i > 0 else input_size,
                hidden_size=output_size if i == last else hidden_size,
                dropout=dropout,
                activation=None if i == last else 'relu'
            )
            for i in range(n_layers))
        )

    def forward(self, x):
        """Run ``x`` through all layers in order."""
        return self.layers(x)

# weight_decay == L2 regularisation
# https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch

# def train(
#         x, y, *, model_args, optimizer_args, y_encoding, 
#         epochs, batch_size, clamp_grad=None, eval_period=0, eval_func=None):

#     return model


In [98]:
from typing import Literal

def predict(model, x_enc, y_encoding: Literal["ordinal", "onehot", "numeric"]):
    """Compute class predictions and probabilities for encoded inputs.

    The model output is treated as logits: for ``'ordinal'`` an element-wise
    sigmoid is applied and the result is decoded with ``ordinal_to_int``; for
    ``'onehot'`` and ``'numeric'`` a softmax over the last dimension is
    applied and the result is decoded with ``onehot_to_int``.

    The train/eval mode of ``model`` is not changed here; the caller is
    responsible for it.

    Args:
        model: Callable mapping ``x_enc`` to a tensor of logits.
        x_enc: Encoded model input.
        y_encoding: Target encoding the model was trained with.

    Returns:
        Tuple ``(y_pred, y_pred_proba)`` of the decoded predictions and the
        probabilities as a NumPy array.

    Raises:
        ValueError: If ``y_encoding`` is not one of the supported encodings.
    """
    # Reject unknown encodings up front; previously they fell through both
    # branches and failed with an UnboundLocalError at the return statement.
    if y_encoding not in ('ordinal', 'onehot', 'numeric'):
        raise ValueError(
            f"Unsupported y_encoding {y_encoding!r}; "
            "expected 'ordinal', 'onehot' or 'numeric'.")

    # Inference only: no autograd graph is needed.
    with th.no_grad():
        y_pred_logit = model(x_enc)
        if y_encoding == 'ordinal':
            proba = th.sigmoid(y_pred_logit)
        else:
            proba = th.nn.functional.softmax(y_pred_logit, dim=-1)

    y_pred_proba = proba.detach().cpu().numpy()
    if y_encoding == 'ordinal':
        y_pred = ordinal_to_int(y_pred_proba)
    else:
        y_pred = onehot_to_int(y_pred_proba)
    return y_pred, y_pred_proba

In [99]:
# Read the dataset named by the `data` parameter.
df = pd.read_csv(data)

# Turn both action columns into ordered categoricals whose categories are
# 0 .. n-1, so that every level is a known category.
for _column, _n_levels in (
    ('contribution', n_contributions),
    ('punishment', n_punishments),
):
    df[_column] = pd.Categorical(
        df[_column], categories=np.arange(_n_levels), ordered=True
    )

# Result records, accumulated over all cross-validation splits.
metrics, confusion_matrix, syn_pred = [], [], []

# Separate the input features from the target.
x_df, y_sr = split_xy(df)
# Training hyper-parameters; identical for every split, so unpack them once.
epochs = train_args['epochs']
batch_size = train_args['batch_size']
clamp_grad = train_args['clamp_grad']
eval_period = train_args['eval_period']

# Loss and number of model outputs depend only on the target encoding. Failing
# here on an unsupported encoding avoids running with an undefined (or stale,
# from an earlier run of this cell) loss function.
if y_encoding == 'ordinal':
    loss_fn = th.nn.BCEWithLogitsLoss()
    # The ordinal encoding uses one output less than there are levels.
    output_size = n_contributions - 1
elif y_encoding in ['onehot', 'numeric']:
    loss_fn = th.nn.CrossEntropyLoss()
    output_size = n_contributions
else:
    raise ValueError(f"Unsupported y_encoding {y_encoding!r}.")

for i, split in enumerate(get_cross_validations(x_df, y_sr, n_cross_val)):
    x_train_df, y_train_sr, x_test_df, y_test_sr = split
    # fraction_training is handed to get_fraction_of_groups; presumably this
    # sub-samples the training split - confirm against the helper.
    x_train_df, y_train_sr = get_fraction_of_groups(x_train_df, y_train_sr, fraction_training)

    # Integer targets, used as ground truth for metrics and confusion matrices.
    y_test = int_encode(y_test_sr, encoding='numeric')[:,0]
    y_train = int_encode(y_train_sr, encoding='numeric')[:,0]

    # Model inputs and targets in the configured encodings.
    x_train_enc = joined_encoder(x_train_df, x_encoding)
    x_test_enc = joined_encoder(x_test_df, x_encoding)
    y_test_enc = int_encode(y_test_sr, encoding=y_encoding)
    y_train_enc = int_encode(y_train_sr, encoding=y_encoding)

    x_train_enc = th.tensor(x_train_enc, dtype=th.float)
    x_test_enc = th.tensor(x_test_enc, dtype=th.float)

    if y_encoding == 'numeric':
        # CrossEntropyLoss expects class indices as a 1-d long tensor.
        y_test_enc = th.tensor(y_test_enc[:,0], dtype=th.long)
        y_train_enc = th.tensor(y_train_enc[:,0], dtype=th.long)
    else:
        y_test_enc = th.tensor(y_test_enc, dtype=th.float)
        y_train_enc = th.tensor(y_train_enc, dtype=th.float)

    x = x_train_enc
    y = y_train_enc

    # A fresh model and optimizer for every split.
    model = MultiLayer(input_size=x.shape[1], output_size=output_size, **model_args)
    optimizer = th.optim.Adam(model.parameters(), **optimizer_args)

    # Running loss since the last evaluation.
    sum_loss = 0
    n_steps = 0

    for e in range(epochs):
        # Training mode for the optimisation steps (matters once the model
        # uses dropout); evaluation below switches to eval mode.
        model.train()
        for start_idx in range(0, len(x), batch_size):
            tx = x[start_idx:start_idx+batch_size]
            ty = y[start_idx:start_idx+batch_size]

            optimizer.zero_grad()

            py = model(tx)
            loss = loss_fn(py, ty)
            loss.backward()

            if clamp_grad:
                # Element-wise clamp of all gradients to [-clamp_grad, clamp_grad];
                # parameters without a gradient are skipped.
                th.nn.utils.clip_grad_value_(model.parameters(), clamp_grad)
            optimizer.step()
            sum_loss += loss.item()
            n_steps += 1

        # A falsy eval_period (0 / None) disables the periodic evaluation
        # instead of failing on the modulo.
        if eval_period and e % eval_period == 0:
            model.eval()
            metrics.append({
                'name': 'loss',
                'value': sum_loss / n_steps if n_steps else float('nan'),
                'cv_split': i,
                'epoch': e
            })

            # NOTE(review): create_metrics is called without the epoch, unlike
            # create_confusion_matrix, so metric rows of different epochs
            # cannot be told apart - confirm whether it accepts epoch=e.
            with th.no_grad():
                # test set performance
                y_pred, y_pred_proba = predict(model, x_test_enc, y_encoding=y_encoding)
                metrics += create_metrics(y_test, y_pred, set='test', cv_split=i)
                confusion_matrix += create_confusion_matrix(y_test, y_pred, set='test', cv_split=i, epoch=e)

                # training set performance
                y_pred, y_pred_proba = predict(model, x_train_enc, y_encoding=y_encoding)
                metrics += create_metrics(y_train, y_pred, set='train', cv_split=i)
                confusion_matrix += create_confusion_matrix(y_train, y_pred, set='train', cv_split=i, epoch=e)

            sum_loss = 0
            n_steps = 0

    model.eval()

    # eval synthesized data
    x_syn_df = syn_con_pun(n_contributions, n_punishments)
    x_syn = joined_encoder(x_syn_df, x_encoding)

    x_syn = th.tensor(x_syn, dtype=th.float)

    with th.no_grad():
        y_pred, y_pred_proba = predict(model, x_syn, y_encoding=y_encoding)
    if y_encoding == 'ordinal':
        # The ordinal model has n_contributions - 1 outputs; prepend a column
        # of ones so there is one probability column per contribution level.
        y_pred_proba = np.concatenate([np.ones_like(y_pred_proba[:,[0]]), y_pred_proba[:,:]], axis=1)
    proba_df = using_multiindex(y_pred_proba, ['sample_idx', 'contribution']).rename(columns={'value': 'proba'})
    x_syn_df['contribution_pred'] = y_pred
    # Joins on the columns the two frames have in common (pandas default).
    proba_df = x_syn_df.merge(proba_df)
    # Flag the row whose contribution level is the predicted one.
    proba_df['predicted'] = proba_df['contribution_pred'] == proba_df['contribution']
    proba_df = proba_df.drop(columns=['contribution_pred'])
    proba_df = add_labels(proba_df, {'set': 'train', 'cv_split': i})
    syn_pred += proba_df.to_dict('records')


In [None]:
# Ensure the target directory exists before writing.
make_dir(output_path)

# Each result list becomes a labelled table and is stored as parquet.
metrics_df = add_labels(pd.DataFrame(metrics), labels)
metrics_df.to_parquet(os.path.join(output_path, 'metrics.parquet'))

confusion_matrix_df = add_labels(pd.DataFrame(confusion_matrix), labels)
confusion_matrix_df.to_parquet(os.path.join(output_path, 'confusion_matrix.parquet'))

# NOTE(review): the file name spelling ('predicitions') is kept as is, since
# readers of this file may depend on it.
syn_pred_df = add_labels(pd.DataFrame(syn_pred), labels)
syn_pred_df.to_parquet(os.path.join(output_path, 'synthetic_predicitions.parquet'))