## Data preparation and sanitization

In [9]:
import os
import sys
# Make the parent directory importable (presumably where the project-local
# `core` package imported below lives), adding it only once.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

from core.utils.preprocessing import df_to_xy


# Load the raw export and derive the modelling frame from it.
# `df_full` keeps every column and every row (later cells read `offer_final`
# and `min_offer` from it); `df` is the sanitized frame passed to `df_to_xy`.
df_full = pd.read_csv("../data/t00/data_t00.csv")

# Columns excluded from the modelling frame (each listed exactly once).
drop_cols = [
    "worker_id", "resp_worker_id", "prop_worker_id", "updated", "status", "job_id",
    "timestamp", "rowid", "offer_dss", "offer", "offer_final",
]

# errors="ignore" keeps the original tolerance for columns that are absent
# from the CSV; rows with any missing value are then discarded.
df = df_full.drop(columns=drop_cols, errors="ignore").dropna()


# `df_to_xy` is a project helper; presumably it returns the feature matrix and
# the target array -- TODO confirm what the keyword options do in core.utils.preprocessing.
x, y = df_to_xy(df, fuse_risk=False, centered=True, min_target=0, max_target=100, normalize=True)

## Linear regression (continuous dataset)

#### Benchmark

In [10]:
from core.utils.benchmark import process_benchmarks, process_benchmark_cv, process_model
from core.models.deep import KerasModel, loss_tf, gain_tf
from core.models import EMModel, RandomModel
from core.models import ConservativeModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from core.models import RandomModel
from sklearn.neural_network import MLPClassifier

**Keras Model**

In [11]:
from core.models.deep import KerasModel, loss_tf, gain_tf

**Featureless model (fixed value)**

In [12]:
from core.models import EMModel, RandomModel

**Random Forest**

In [13]:
from sklearn.neural_network import MLPClassifier

**Acceptance model**

**Baseline models**

In [14]:
from sklearn.svm import SVC, SVR
from sklearn.svm import SVC, SVR
from core.utils.benchmark import benchmark_functions
from core.models.metrics import cross_compute

# Models compared in this benchmark, keyed by the label shown in the results table.
benchmark_models = {
    "random": RandomModel(),
    "conservative": ConservativeModel(),
    "em": EMModel(),
    "svc": SVC(gamma='auto'),
    "svr": SVR(gamma='auto'),
    "linear_regressor": LinearRegression(copy_X=False),
    # Disabled entry kept for reference:
    # "linear_regressor_mse": keras_linear_regression(loss="mse"),
}
results = process_benchmarks(benchmark_models, x, y.ravel(), augment_data=[None])

# Aggregate each model's benchmark output into its mean and standard deviation.
results_std = {name: scores.std() for name, scores in results.items()}
results_mean = {name: scores.mean() for name, scores in results.items()}

# Human baseline: every benchmark function evaluated on the recorded
# `min_offer` values against the recorded `offer_final` values.
# NOTE(review): `min_offer` comes from the sanitized `df` while `offer_final`
# comes from the unsanitized `df_full` -- confirm that this mix is intended.
results_mean["human"] = {
    f.__name__: cross_compute(df.min_offer, df_full['offer_final'], f)
    for f in benchmark_functions
}

# One row per model, ordered by ascending average loss ratio.
results_df = pd.DataFrame(results_mean).T.sort_values("avg_loss_ratio")
results_df

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
em,11.6,0.188273,11.370175,1.0,232.0,369.0,0.05
svc,11.6,0.188273,11.370175,1.0,232.0,369.0,0.05
human,22.0155,0.388828,12.083061,1.0,2201.55,1001.6725,0.2683
svr,31.575095,0.599819,16.622944,0.999363,631.501905,1419.197312,0.5
random,35.673847,0.647927,22.875446,0.19897,713.476947,1699.627508,0.49
linear_regressor,42.490438,0.747976,31.616105,0.137534,849.80877,2374.178312,0.5
conservative,54.25,0.916466,55.957895,1.0,1085.0,3252.25,0.03


In [15]:
# Frequency of each `min_offer` value in the unsanitized data.
df_full.min_offer.value_counts()

50     41
40     20
45      8
5       8
35      5
30      4
100     3
20      3
25      2
10      2
0       2
60      1
55      1
Name: min_offer, dtype: int64

In [16]:
# Frequency of each `offer_final` value in the unsanitized data.
df_full.offer_final.value_counts()

50     62
40      8
25      5
10      5
55      3
45      3
60      2
35      2
30      2
20      2
15      2
5       2
100     1
0       1
Name: offer_final, dtype: int64