In [403]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from aimanager.model.cross_validation import split_xy, get_cross_validations
from aimanager.model.encoder import int_to_ordinal, ordinal_to_int, joined_encoder
from aimanager.model.metrics import create_metrics, create_confusion_matrix
from aimanager.utils.array_to_df import add_labels
from aimanager.utils.utils import make_dir

import yaml


In [404]:
# Default experiment configuration, kept as inline YAML text and parsed below.
# How the visible cells consume each top-level key:
#   x_encoding   -> passed unchanged to joined_encoder() for train and test features
#   y_encoding   -> `ordinal` selects the target representation; `n_levels` is
#                   forwarded to int_to_ordinal()
#   model_config -> unpacked as keyword arguments into LogisticRegression
#   n_cross_val  -> passed to get_cross_validations()
#   data         -> CSV path read with pd.read_csv()
# NOTE(review): the top-level `ordinal_y` key is not read by any cell visible
# here (the loop reads `y_encoding.ordinal`); confirm whether it is still needed.
default = """
x_encoding:
    a: 
        ordinal: true
        n_levels: 21
        column: contribution
    b: 
        ordinal: false
        column: punishment
    c:
        etype: interaction
        a: 
            ordinal: true
            n_levels: 21
            column: contribution
        b: 
            ordinal: false
            column: punishment
y_encoding:
    ordinal: true
    n_levels: 21
    column: contribution
model_config:
    max_iter: 10000
    C: 1.0
    # penalty: elasticnet
    # l1_ratio: 0.0
    # solver: saga
ordinal_y: true
n_cross_val: 10
data: ../data/pilot1_player_round_slim.csv
"""
# Parse the YAML text into the nested mapping used by the rest of the notebook.
config = yaml.safe_load(default)
# Handed to add_labels() for both output tables; empty for this run.
# Presumably extra identifying columns for the result rows - TODO confirm.
labels = {}
# Directory that receives metrics.parquet and confusion_matrix.parquet.
output_folder = '../data/dev/logreg'

In [405]:
def predict(model, x_enc, ordinal_y):
    """Run a fitted model on encoded features and return predictions in both forms.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        x_enc: Encoded feature matrix to predict on.
        ordinal_y: True when the model emits ordinal-encoded targets, False
            when it emits integer targets.

    Returns:
        Tuple ``(y_pred, y_pred_ordinal, y_pred_proba)``:
        the integer-form prediction, the ordinal-form prediction, and the
        predicted probabilities. In ordinal mode the raw ``predict_proba``
        result is combined with ``np.stack(..., axis=1)`` (this assumes it is
        a sequence of equally shaped per-output arrays); otherwise it is
        returned exactly as the model produced it.
    """
    raw_pred = model.predict(x_enc)
    raw_proba = model.predict_proba(x_enc)

    if not ordinal_y:
        # The model already speaks integers; derive the ordinal view from them.
        return raw_pred, int_to_ordinal(raw_pred), raw_proba

    # The model speaks ordinal; stack the per-output probabilities and
    # convert the ordinal prediction back to integers.
    stacked_proba = np.stack(raw_proba, axis=1)
    return ordinal_to_int(raw_pred), raw_pred, stacked_proba

In [406]:
# Cross-validated fit/evaluate loop.
#
# For every fold a fresh scaler + logistic-regression pipeline is fitted on the
# training split and evaluated on both the held-out and the training split.
# Results are appended to two flat record lists that a later cell turns into
# DataFrames.

# Target representation switch, read from `y_encoding.ordinal`.
ordinal_y = config['y_encoding']['ordinal']
# Hoisted: the number of target levels is the same for every fold.
n_levels = config['y_encoding']['n_levels']

df = pd.read_csv(config['data'])

metrics = []
confusion_matrix = []

x_df, y_sr = split_xy(df)
for i, split in enumerate(get_cross_validations(x_df, y_sr, config['n_cross_val'])):
    x_train_df, y_train_sr, x_test_df, y_test_sr = split
    y_test = y_test_sr.astype(int).values
    y_train = y_train_sr.astype(int).values
    # Both forms are always needed: create_metrics() receives the integer and
    # the ordinal target regardless of which one the model is trained on.
    y_test_ord = int_to_ordinal(y_test, n_levels=n_levels)
    y_train_ord = int_to_ordinal(y_train, n_levels=n_levels)

    # Training target in the representation selected by the config.
    y = y_train_ord if ordinal_y else y_train

    pipe = Pipeline([('scaler', StandardScaler()), ('log_reg', LogisticRegression(**config['model_config']))])

    # Ordinal targets are a 2-D matrix, so one binary classifier is fitted per
    # column via MultiOutputClassifier. Integer targets are 1-D and are handled
    # by the pipeline directly; this matches predict(), whose non-ordinal
    # branch expects integer predictions and an unstacked predict_proba.
    model = MultiOutputClassifier(pipe) if ordinal_y else pipe

    x_train_enc = joined_encoder(x_train_df, config['x_encoding'])
    x_test_enc = joined_encoder(x_test_df, config['x_encoding'])
    # Fit on the selected target (previously `y` was computed but the ordinal
    # target was always used, which broke the non-ordinal configuration).
    model.fit(x_train_enc, y)

    # test set performance
    y_pred, y_pred_ordinal, y_pred_proba = predict(model, x_test_enc, ordinal_y)
    metrics += create_metrics(y_test, y_test_ord, y_pred, y_pred_ordinal, y_pred_proba, ordinal_y=ordinal_y, set='test', cv_split=i)
    confusion_matrix += create_confusion_matrix(y_test, y_pred, set='test', cv_split=i)

    # training set performance
    y_pred, y_pred_ordinal, y_pred_proba = predict(model, x_train_enc, ordinal_y)
    metrics += create_metrics(y_train, y_train_ord, y_pred, y_pred_ordinal, y_pred_proba, ordinal_y=ordinal_y, set='train', cv_split=i)
    confusion_matrix += create_confusion_matrix(y_train, y_pred, set='train', cv_split=i)


In [407]:
# Persist the per-fold results collected by the cross-validation loop.
make_dir(output_folder)

# Metrics: records -> DataFrame -> labelled -> parquet.
metrics_df = add_labels(pd.DataFrame(metrics), labels)
metrics_path = os.path.join(output_folder, 'metrics.parquet')
metrics_df.to_parquet(metrics_path)

# Confusion matrix entries go through the same steps into their own file.
confusion_matrix_df = add_labels(pd.DataFrame(confusion_matrix), labels)
confusion_matrix_path = os.path.join(output_folder, 'confusion_matrix.parquet')
confusion_matrix_df.to_parquet(confusion_matrix_path)