## Data preparation and sanitization

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

from core.utils.preprocessing import df_to_xy


# Read the raw CSV. `df_full` stays untouched because later cells read columns
# from it (e.g. "offer_final") that are removed from `df` below.
df_full = pd.read_csv("../data/t00/data_t00.csv")

# Columns excluded from `df` before x/y are built.
drop_cols = ["worker_id", "resp_worker_id", "prop_worker_id", "updated", "status", "job_id", "timestamp", "rowid", "offer_dss", "offer", "offer_final"]

# errors="ignore" keeps the original tolerance for columns absent from the CSV.
# Both drop() and dropna() return new frames and preserve the original index
# labels, so rows of `df` can still be matched back to rows of `df_full`.
df = df_full.drop(columns=drop_cols, errors="ignore").dropna()


# Project helper; presumably returns the feature matrix `x` and target `y`
# (TODO confirm against core.utils.preprocessing.df_to_xy).
x, y = df_to_xy(df, fuse_risk=False, centered=True, min_target=0, max_target=100, normalize=True)

## Linear regression (continuous dataset)

#### Benchmark

In [2]:
from core.utils.benchmark import process_benchmarks, process_benchmark_cv, process_model
from core.models.deep import KerasModel, loss_tf, gain_tf
from core.models import EMModel, RandomModel
from core.models import ConservativeModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from core.models import RandomModel
from sklearn.neural_network import MLPClassifier

Using TensorFlow backend.


**Keras Model**

In [3]:
from core.models.deep import KerasModel, loss_tf, gain_tf

**Featureless model (fixed value)**

In [4]:
from core.models import EMModel, RandomModel

**Random Forest**

In [5]:
from sklearn.neural_network import MLPClassifier

**Acceptance model**

**Baseline models**

In [6]:
from sklearn.svm import SVC, SVR
from sklearn.svm import SVC, SVR
from core.utils.benchmark import benchmark_functions
from core.models.metrics import cross_compute

# Models to benchmark; each key becomes a row label in the results table.
benchmark_models = {
    "random": RandomModel(),
    "conservative": ConservativeModel(),
    "em": EMModel(),
    "svc": SVC(gamma='auto'),
    "svr": SVR(gamma='auto'),
    # Default copy_X=True: with copy_X=False scikit-learn may overwrite the
    # feature matrix, which is shared by every model benchmarked here.
    "linear_regressor": LinearRegression(),
}
results = process_benchmarks(benchmark_models, x, y.ravel(), augment_data=[None])

# Per-model mean and standard deviation of every benchmark metric.
results_mean = {key: item.mean() for key, item in results.items()}
results_std = {key: item.std() for key, item in results.items()}

# "human" row: compares each row's min_offer with the offer_final recorded in
# the raw data. `df` went through dropna() while `df_full` did not, so the
# offers are selected by `df.index` to keep both series row-aligned.
human_offers = df_full.loc[df.index, "offer_final"]
results_mean["human"] = {f.__name__: cross_compute(df.min_offer, human_offers, f) for f in benchmark_functions}

# One row per model, ordered by ascending avg_loss_ratio.
results_df = pd.DataFrame(results_mean).T.sort_values("avg_loss_ratio")
results_df

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
em,11.771429,0.187675,11.537176,1.0,242.0,386.095238,0.049048
svc,11.771429,0.187675,11.537176,1.0,242.0,386.095238,0.049048
human,21.499199,0.374794,12.25695,1.0,2214.417476,976.823923,0.251673
svr,33.041517,0.62252,18.141918,0.619598,681.046617,1487.418042,0.514286
random,37.486318,0.67012,24.814805,0.188119,772.622063,1862.318023,0.495714
linear_regressor,42.210793,0.738103,34.143073,0.152101,869.776648,2237.834788,0.457143
conservative,54.466667,0.916466,56.149749,1.0,1122.0,3288.595238,0.029524


In [7]:
# Frequency of each distinct min_offer value in the raw (unfiltered) data.
min_offer_counts = df_full["min_offer"].value_counts()
min_offer_counts

50     44
40     19
5       9
45      8
35      5
100     3
30      3
20      3
25      2
10      2
0       2
60      1
55      1
15      1
Name: min_offer, dtype: int64

In [8]:
# Frequency of each distinct offer_final value in the raw (unfiltered) data.
offer_final_counts = df_full["offer_final"].value_counts()
offer_final_counts

50     67
40      6
25      5
10      5
55      3
45      3
60      2
35      2
30      2
20      2
15      2
0       2
100     1
5       1
Name: offer_final, dtype: int64