## Data preparation and sanitization

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

from core.utils.preprocessing import df_to_xy


# Read and sanitize the data
#
# Two frames are built side by side from the per-treatment exports:
#   df      - model input: bookkeeping/offer columns removed, rows with NaN dropped
#   df_full - the same rows as `df` before its final dropna, but with every
#             original column kept (including "offer" / "offer_final")
#
# Columns removed from the modelling frame `df`.
drop_cols = ["worker_id", "resp_worker_id", "prop_worker_id", "updated", "status", "job_id", "timestamp", "rowid", "offer_dss", "offer", "offer_final", "completion_code", 'Unnamed: 0']
TREATMENTS = ["t00", "t10", "t11"]

frames = []       # per-treatment frames without drop_cols and without NaN rows
frames_full = []  # the same rows, all original columns
for treatment in TREATMENTS:
    df_full_tmp = pd.read_csv(f"../data/{treatment}/export/result__{treatment}_prop.csv")
    # Exports without an "offer_final" column fall back to the plain "offer".
    if "offer_final" not in df_full_tmp.columns:
        df_full_tmp["offer_final"] = df_full_tmp["offer"]
    df_tmp = df_full_tmp[[col for col in df_full_tmp.columns if col not in drop_cols]].dropna()
    frames.append(df_tmp)
    # Keep the full-column view of exactly the rows that survived dropna.
    frames_full.append(df_full_tmp.loc[df_tmp.index])

# The first treatment defines the column layout; columns missing from a later
# treatment are filled with NaN, extra columns are discarded.
cols = frames[0].columns
cols_full = frames_full[0].columns
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([frame.reindex(columns=cols) for frame in frames], ignore_index=True)
df_full = pd.concat([frame.reindex(columns=cols_full) for frame in frames_full], ignore_index=True)

# Rows that only gained NaN through the column alignment above are removed
# from the modelling frame (df_full keeps them).
df = df.dropna()


x, y = df_to_xy(df, fuse_risk=False, centered=True, min_target=0, max_target=100, normalize=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


## Linear regression (continuous dataset)

#### Benchmark

In [2]:
from core.utils.benchmark import process_benchmarks, process_benchmark_cv, process_model
from core.models.deep import KerasModel, loss_tf, gain_tf
from core.models import EMModel, RandomModel
from core.models import ConservativeModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from core.models import RandomModel
from sklearn.neural_network import MLPClassifier

Using TensorFlow backend.


**Keras Model**

In [3]:
from core.models.deep import KerasModel, loss_tf, gain_tf

**Featureless model (fixed value)**

In [4]:
from core.models import EMModel, RandomModel

**Random Forest**

In [5]:
from sklearn.neural_network import MLPClassifier

**Acceptance model**

**Baseline models**

In [6]:
from sklearn.svm import SVC, SVR
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from core.utils.benchmark import benchmark_functions
from core.models.metrics import cross_compute

# Models to benchmark, keyed by the row label used in the results table.
benchmark_models = dict(
    random=RandomModel(),
    conservative=ConservativeModel(),
    em=EMModel(),
    svc=SVC(gamma='auto'),
    svr=SVR(gamma='auto'),
    linear_regressor=LinearRegression(copy_X=False),
    rf=RandomForestClassifier(n_estimators=32),
    # linear_regressor_mse=keras_linear_regression(loss="mse"),  # disabled
)
results = process_benchmarks(benchmark_models, x, y.ravel(), augment_data=[None])

# Aggregate each model's benchmark output into its mean and standard deviation.
results_mean = {}
results_std = {}
for key, item in results.items():
    results_mean[key] = item.mean()
    results_std[key] = item.std()

# Extra "human" row: every benchmark function evaluated through cross_compute.
# NOTE(review): both arguments are the min_offer column (once from df, once
# from df_full) - confirm this is the intended comparison for the human baseline.
results_mean["human"] = dict(
    (f.__name__, cross_compute(df.min_offer, df_full['min_offer'], f))
    for f in benchmark_functions
)

# One row per model, ordered by ascending avg_loss_ratio.
results_df = pd.DataFrame(results_mean).T.sort_values("avg_loss_ratio")
results_df

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
em,12.616891,0.213763,12.56983,1.0,734.0,386.690532,0.061952
svc,12.616891,0.213763,12.56983,1.0,734.0,386.690532,0.061952
rf,17.40678,0.316238,12.900283,0.258168,1013.0,643.661601,0.178726
human,29.429742,0.536312,13.762239,1.0,8564.054983,1456.46367,0.404743
random,33.335422,0.601662,23.783108,0.188078,1939.417172,1565.6364,0.395383
svr,31.790482,0.609591,16.72519,0.998516,1850.533838,1419.456736,0.508416
linear_regressor,34.799768,0.637513,15.884061,0.2172,2025.7253,1740.664261,0.54284
conservative,54.691409,0.914008,56.671536,1.0,3183.0,3336.472823,0.034424


In [41]:

# First neural network with Keras: build, compile and fit the model
from numpy import loadtxt
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from core.models.deep import loss_tf, gain_tf
# Inputs and targets are the x / y arrays produced by the data-preparation cell.
#
# Fully connected network: 256 -> 256 -> 100 ReLU hidden units, then a single
# sigmoid output unit. The input width is taken from x.
# NOTE(review): the recorded training log shows the same loss and gain for
# every epoch, i.e. the network does not appear to learn. The sigmoid output is
# bounded to (0, 1) while df_to_xy was called with max_target=100 and
# normalize=False - confirm the target scale matches what loss_tf expects.
model = Sequential([
    Dense(256, input_dim=x.shape[1], activation='relu'),
    Dense(256, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss=loss_tf, metrics=[gain_tf])

# Train on x / y, holding out the last 30% of the samples for validation.
# (A one-hot variant using to_categorical(y) as target was tried and disabled.)
training_options = dict(epochs=500, batch_size=32, verbose=2, validation_split=0.3)
model.fit(x, y, **training_options)

Train on 203 samples, validate on 88 samples
Epoch 1/500
 - 1s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 2/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 3/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 4/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 5/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 6/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 7/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 8/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 9/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 10/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 11/500
 - 0s - loss: 0.9827 

Epoch 89/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 90/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 91/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 92/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 93/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 94/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 95/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 96/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 97/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 98/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 99/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752

Epoch 177/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 178/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 179/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 180/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 181/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 182/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 183/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 184/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 185/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 186/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 187/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_l

Epoch 265/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 266/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 267/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 268/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 269/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 270/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 271/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 272/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 273/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 274/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 275/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_l

Epoch 353/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 354/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 355/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 356/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 357/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 358/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 359/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 360/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 361/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 362/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 363/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_l

Epoch 441/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 442/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 443/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 444/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 445/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 446/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 447/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 448/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 449/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 450/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_loss: 0.9752 - val_gain_tf: 2.2500
Epoch 451/500
 - 0s - loss: 0.9827 - gain_tf: 1.4631 - val_l

<keras.callbacks.History at 0x7f81f94f65c0>

In [None]:
# Frequency of each distinct min_offer value in df_full.
df_full["min_offer"].value_counts()

In [None]:
# Frequency of each distinct offer_final value in df_full.
# NOTE(review): requires df_full to still carry an "offer_final" column, which
# is listed in drop_cols - confirm the data-preparation cell keeps it in df_full.
df_full["offer_final"].value_counts()