In [1]:
# Parameters
# --- feature / target encoding -------------------------------------------------
# One entry per input column; the list is handed to `joined_encoder` unchanged.
x_encoding = [
    dict(encoding="ordinal", column="prev_contribution"),
    dict(encoding="ordinal", column="prev_punishment"),
]
# Target encoding; the notebook branches on 'ordinal', 'onehot' and 'numeric'.
y_encoding = "onehot"

# --- problem size --------------------------------------------------------------
# Number of category levels (0 .. n-1) for the two categorical columns.
n_contributions = 21
n_punishments = 31

# --- cross validation ----------------------------------------------------------
n_cross_val = 2          # passed to `get_cross_validations`
fraction_training = 1.0  # passed to `get_fraction_of_groups`

# --- input / output ------------------------------------------------------------
data = "../data/pilot1_player_round_slim.csv"  # CSV read with `pd.read_csv`
output_path = "../data/dev"                    # base directory for the result files
labels = dict()                                # extra labels attached to every saved table

# --- model / optimisation ------------------------------------------------------
model_args = dict(n_layers=2, hidden_size=40)          # forwarded to `MultiLayer`
optimizer_args = dict(lr=0.0001, weight_decay=1e-05)   # forwarded to `th.optim.Adam`
train_args = dict(epochs=100, batch_size=40, clamp_grad=1, eval_period=10)
device = "cuda"                                        # passed to `th.device`


In [2]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from aimanager.model.cross_validation import split_xy, get_cross_validations, get_fraction_of_groups
from aimanager.model.encoder import ordinal_to_int, onehot_to_int, joined_encoder, int_encode
from aimanager.model.metrics import create_metrics, create_confusion_matrix
from aimanager.model.synthesize_data import syn_con_pun
from aimanager.utils.array_to_df import add_labels, using_multiindex
from aimanager.utils.utils import make_dir

# Results are written to a 'data' sub-directory of the configured output path.
# NOTE(review): `output_path` is rebound from its own previous value, so executing
# this cell a second time in the same kernel appends 'data' once more.
output_path = os.path.join(output_path, 'data')

In [3]:
import torch as th
from typing import Literal, Optional


class FeedForwardLayer(th.nn.Module):
    """A linear layer optionally followed by dropout and an activation.

    The forward pass applies, in this order: ``Linear(input_size, hidden_size)``,
    dropout (when configured) and the activation (when configured).

    Args:
        input_size: Number of input features of the linear layer.
        hidden_size: Number of output features of the linear layer.
        dropout: Dropout probability; ``None`` or ``0`` disables dropout.
        activation: ``'relu'``, ``'logit'`` or ``'softmax'``. Any other value
            (including ``None``) leaves the linear output unchanged.
    """

    def __init__(
            self, *,
            input_size: int,
            hidden_size: int,
            dropout: Optional[float],
            activation: Optional[Literal['relu', 'logit', 'softmax']]):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lin = th.nn.Linear(self.input_size, self.hidden_size)

        if activation == 'relu':
            self.activation = th.nn.ReLU()
        elif activation == 'logit':
            # torch.nn has no `Logit` module, so `th.nn.Logit()` raised an
            # AttributeError whenever this branch was taken.
            # NOTE(review): the logistic sigmoid is assumed to be the intended
            # function here — confirm.
            self.activation = th.nn.Sigmoid()
        elif activation == 'softmax':
            # Normalise over the feature axis explicitly; relying on the implicit
            # dimension is deprecated and selects dim 0 for 3-D input.
            self.activation = th.nn.Softmax(dim=-1)
        else:
            self.activation = None

        self.dropout = th.nn.Dropout(dropout) if dropout else None

    def forward(self, x):
        """Apply the linear map, then dropout and activation if configured."""
        x = self.lin(x)
        # Compare against None instead of relying on the truthiness of a module.
        if self.dropout is not None:
            x = self.dropout(x)
        if self.activation is not None:
            x = self.activation(x)
        return x



class MultiLayer(th.nn.Module):
    """A stack of ``n_layers`` FeedForwardLayer modules run one after another.

    The first layer consumes ``input_size`` features and the last layer emits
    ``output_size`` features; every layer in between maps ``hidden_size`` to
    ``hidden_size``. All layers except the last use a ReLU activation, the last
    one has none. The same ``dropout`` setting is handed to every layer,
    including the last.

    Args:
        n_layers: Number of layers in the stack.
        hidden_size: Width of the intermediate layers; required when
            ``n_layers > 1``.
        input_size: Number of input features.
        output_size: Number of output features.
        dropout: Dropout probability forwarded to each layer, or ``None``.
    """

    def __init__(self, *,
            n_layers: int,
            hidden_size: Optional[int]=None,
            input_size: int,
            output_size: int,
            dropout: Optional[float]=None):
        super().__init__()

        # More than one layer needs a width for the intermediate representation.
        assert (hidden_size is not None) or (n_layers <= 1)

        final_idx = n_layers - 1
        stack = []
        for idx in range(n_layers):
            is_final = idx == final_idx
            stack.append(
                FeedForwardLayer(
                    input_size=input_size if idx == 0 else hidden_size,
                    hidden_size=output_size if is_final else hidden_size,
                    dropout=dropout,
                    activation=None if is_final else 'relu',
                )
            )
        self.layers = th.nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through all layers in order."""
        return self.layers(x)

# weight_decay == L2 regularisation
# https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
from typing import Literal

def predict(model, x_enc):
    """Run ``model`` on ``x_enc`` and decode the output into predictions.

    The decoding depends on ``model.y_encoding``:

    * ``'ordinal'``: sigmoid over the model output, decoded with ``ordinal_to_int``.
    * ``'onehot'`` / ``'numeric'``: softmax over the last axis, decoded with
      ``onehot_to_int``.

    Args:
        model: Callable module carrying a ``y_encoding`` attribute.
        x_enc: Encoded input tensor passed to ``model`` as is.

    Returns:
        Tuple ``(y_pred, y_pred_proba)``; ``y_pred_proba`` is a NumPy array on the
        CPU and ``y_pred`` is whatever the decoding helper returns for it.

    Raises:
        ValueError: If ``model.y_encoding`` is not one of the supported values.
    """
    encoding = model.y_encoding
    # Previously an unknown encoding fell through both branches and surfaced as an
    # UnboundLocalError at the return statement.
    if encoding not in ('ordinal', 'onehot', 'numeric'):
        raise ValueError(f"Unsupported y_encoding: {encoding!r}")

    # Pure inference: no autograd graph is needed for the forward pass.
    with th.no_grad():
        y_pred_logit = model(x_enc)
        if encoding == 'ordinal':
            y_pred_proba = th.sigmoid(y_pred_logit)
        else:
            y_pred_proba = th.nn.functional.softmax(y_pred_logit, dim=-1)

    y_pred_proba = y_pred_proba.detach().cpu().numpy()
    if encoding == 'ordinal':
        y_pred = ordinal_to_int(y_pred_proba)
    else:
        y_pred = onehot_to_int(y_pred_proba)
    return y_pred, y_pred_proba

In [4]:
# Read the CSV and re-type the two target columns as ordered categoricals whose
# categories are fixed to 0 .. n-1, independent of the values present in the file.
df = pd.read_csv(data).assign(
    contribution=lambda frame: pd.Categorical(
        frame['contribution'],
        categories=np.arange(n_contributions),
        ordered=True,
    ),
    punishment=lambda frame: pd.Categorical(
        frame['punishment'],
        categories=np.arange(n_punishments),
        ordered=True,
    ),
)

In [5]:
class Evaluator:
    """Accumulates evaluation records during training and writes them to parquet.

    Three lists of records are collected: ``metrics``, ``confusion_matrix`` and
    ``synthetic_predicitions``. ``set_data`` provides the dataset dictionary and
    ``set_labels`` the labels attached to subsequently created records.
    """

    def __init__(self):
        self.metrics = []
        self.confusion_matrix = []
        self.synthetic_predicitions = []
        # Defaults so that e.g. `add_loss` works before the setters were called
        # instead of failing with an AttributeError.
        self.data = None
        self.labels = {}

    def set_data(self, data):
        """Store the dataset dictionary read by the ``eval_*`` methods.

        The methods access ``data['x_enc'][name]``, ``data['y_sr'][name]`` and
        ``data['x_df']['syn']``.
        """
        self.data = data

    def set_labels(self, **labels):
        """Replace the labels added to every record created from now on."""
        self.labels = labels

    def eval_set(self, model, set_name):
        """Evaluate ``model`` on the set ``set_name`` and record the results.

        Puts the model into eval mode, predicts on ``data['x_enc'][set_name]`` and
        appends the output of ``create_metrics`` and ``create_confusion_matrix``
        (both labelled with ``set=set_name`` and the current labels).
        """
        model.eval()
        y_true = self.data['y_sr'][set_name]
        y_pred, _ = predict(model, self.data['x_enc'][set_name])
        self.metrics += create_metrics(y_true, y_pred, set=set_name, **self.labels)
        self.confusion_matrix += create_confusion_matrix(
            y_true, y_pred, set=set_name, **self.labels)

    def eval_sync(self, model):
        """Predict on the synthetic inputs and record the class probabilities.

        The resulting records carry ``set='train'`` and, when present in the
        current labels, the ``cv_split`` value.
        """
        model.eval()
        x_syn_df = self.data['x_df']['syn'].copy()
        y_pred, y_pred_proba = predict(model, self.data['x_enc']['syn'])
        if model.y_encoding == 'ordinal':
            # Prepend a column of ones to the ordinal outputs.
            # NOTE(review): presumably this stands for P(y >= 0) = 1 — confirm.
            y_pred_proba = np.concatenate(
                [np.ones_like(y_pred_proba[:, [0]]), y_pred_proba[:, :]], axis=1)
        # NOTE(review): `using_multiindex` presumably reshapes the 2-D array into a
        # long table indexed by the two given names — confirm against the helper.
        proba_df = using_multiindex(
            y_pred_proba, ['sample_idx', 'contribution']).rename(columns={'value': 'proba'})
        x_syn_df['contribution_pred'] = y_pred
        proba_df = x_syn_df.merge(proba_df)
        proba_df['predicted'] = proba_df['contribution_pred'] == proba_df['contribution']
        proba_df = proba_df.drop(columns=['contribution_pred'])

        # Take the split index from the labels of this evaluator rather than from
        # a loop variable of the surrounding notebook namespace.
        syn_labels = {'set': 'train'}
        if 'cv_split' in self.labels:
            syn_labels['cv_split'] = self.labels['cv_split']
        proba_df = add_labels(proba_df, syn_labels)
        self.synthetic_predicitions += proba_df.to_dict('records')

    def add_loss(self, loss):
        """Append a ``loss`` metric record carrying the current labels."""
        self.metrics.append(dict(name='loss', value=loss, **self.labels))

    def save(self, output_path, labels):
        """Write the three record lists as parquet files into ``output_path``.

        ``labels`` is applied to each table via ``add_labels`` before writing.
        """
        make_dir(output_path)
        self._save_metric(self.metrics, 'metrics.parquet', output_path, labels)
        self._save_metric(self.confusion_matrix, 'confusion_matrix.parquet', output_path, labels)
        self._save_metric(self.synthetic_predicitions, 'synthetic_predicitions.parquet', output_path, labels)

    @staticmethod
    def _save_metric(rec, filename, output_path, labels):
        """Turn ``rec`` into a DataFrame, add ``labels`` and write it to parquet."""
        df = pd.DataFrame(rec)
        df = add_labels(df, labels)
        df.to_parquet(os.path.join(output_path, filename))

In [6]:
th_device = th.device(device)

# NOTE(review): these three lists are never filled in this cell (the Evaluator
# collects all records); they are kept only in case other cells refer to them.
metrics = []
confusion_matrix = []
syn_pred = []
ev = Evaluator()

# Pick loss and output width once, and fail early on an unknown target encoding
# instead of running into an undefined (or stale) `loss_fn` inside the loop.
if y_encoding == 'ordinal':
    loss_fn = th.nn.BCEWithLogitsLoss()
    output_size = n_contributions - 1
elif y_encoding in ('onehot', 'numeric'):
    loss_fn = th.nn.CrossEntropyLoss()
    output_size = n_contributions
else:
    raise ValueError(f"Unsupported y_encoding: {y_encoding!r}")

y_dtype = th.long if y_encoding == 'numeric' else th.float
batch_size = train_args['batch_size']
clamp_grad = train_args['clamp_grad']
eval_period = train_args['eval_period']

x_df, y_sr = split_xy(df)
for i, split in enumerate(get_cross_validations(x_df, y_sr, n_cross_val)):
    x_train_df, y_train_sr, x_test_df, y_test_sr = split
    x_train_df, y_train_sr = get_fraction_of_groups(x_train_df, y_train_sr, fraction_training)
    x_syn_df = syn_con_pun(n_contributions, n_punishments)

    # Named `cv_data` so that the `data` parameter (the CSV path) is not
    # overwritten; otherwise the loading cell cannot be re-run afterwards.
    cv_data = {
        'x_df': {'train': x_train_df, 'syn': x_syn_df, 'test': x_test_df},
        'y_sr': {'train': y_train_sr, 'test': y_test_sr}}
    cv_data['x_enc'] = {
        k: th.tensor(joined_encoder(x, x_encoding), dtype=th.float, device=th_device)
        for k, x in cv_data['x_df'].items()
    }
    cv_data['y_enc'] = {
        k: th.tensor(int_encode(y, encoding=y_encoding), dtype=y_dtype, device=th_device)
        for k, y in cv_data['y_sr'].items()
    }
    ev.set_data(cv_data)

    train_x = cv_data['x_enc']['train']
    train_y = cv_data['y_enc']['train']

    # A fresh model and optimizer for every cross-validation split.
    model = MultiLayer(
        input_size=train_x.shape[1], output_size=output_size, **model_args).to(th_device)
    model.y_encoding = y_encoding  # read by `predict` to decode the outputs
    optimizer = th.optim.Adam(model.parameters(), **optimizer_args)

    # Running loss since the last evaluation.
    sum_loss = 0
    n_steps = 0

    for e in range(train_args['epochs']):
        ev.set_labels(cv_split=i, epoch=e)
        model.train()
        # Sequential mini-batches over the training set (no shuffling).
        for start_idx in range(0, len(train_x), batch_size):
            tx = train_x[start_idx:start_idx + batch_size]
            ty = train_y[start_idx:start_idx + batch_size]

            optimizer.zero_grad()

            py = model(tx)
            loss = loss_fn(py, ty)
            loss.backward()

            if clamp_grad:
                # Element-wise clamp of all gradients to [-clamp_grad, clamp_grad];
                # the utility skips parameters that have no gradient.
                th.nn.utils.clip_grad_value_(model.parameters(), clamp_grad)
            optimizer.step()
            sum_loss += loss.item()
            n_steps += 1

        if e % eval_period == 0:
            avg_loss = sum_loss / n_steps
            print(f'CV {i} | Epoch {e} | Loss {avg_loss}')
            ev.add_loss(avg_loss)
            ev.eval_set(model, 'train')
            ev.eval_set(model, 'test')
            sum_loss = 0
            n_steps = 0

    ev.eval_sync(model)

ev.save(output_path, labels)

CV 0 | Epoch 0 | Loss 3.0282807069666244


CV 0 | Epoch 10 | Loss 2.737477055016686


CV 0 | Epoch 20 | Loss 2.299350770606714


CV 0 | Epoch 30 | Loss 2.2090599833604165


CV 0 | Epoch 40 | Loss 2.1588081803830232


CV 0 | Epoch 50 | Loss 2.1242162626455814


CV 0 | Epoch 60 | Loss 2.098835714950281


CV 0 | Epoch 70 | Loss 2.078922360083636


CV 0 | Epoch 80 | Loss 2.0625303825911354


CV 0 | Epoch 90 | Loss 2.048712911588304


CV 1 | Epoch 0 | Loss 2.952941936605117


CV 1 | Epoch 10 | Loss 2.625563754053677


CV 1 | Epoch 20 | Loss 2.3010482348063412


CV 1 | Epoch 30 | Loss 2.2091709024327644


CV 1 | Epoch 40 | Loss 2.153635649558376


CV 1 | Epoch 50 | Loss 2.1166726331062176


CV 1 | Epoch 60 | Loss 2.0901624436325887


CV 1 | Epoch 70 | Loss 2.069773058593273


CV 1 | Epoch 80 | Loss 2.0533034887383965


CV 1 | Epoch 90 | Loss 2.039355210214853
