## Data preparation and sanitization

In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (presumably the
# repository root that holds the `core` package imported further down).
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
RND_SEED = 6
np.random.seed(RND_SEED)
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb
from core.utils.preprocessing import df_to_xy


# Identifier of the experiment/treatment whose export is analysed below.
TREATMENT = "t00"

# Read and sanitize the data
df = pd.read_csv(f"../data/{TREATMENT}/export/result__{TREATMENT}_prop.csv")
# Keep an untouched copy of the raw export before any column or row is removed.
df_full = df.copy()
# Columns excluded from the modelling frame (each name listed once).
drop_cols = [
    "worker_id", "resp_worker_id", "prop_worker_id", "updated", "status",
    "job_id", "timestamp", "rowid", "offer_dss", "offer", "offer_final",
    "completion_code",
]
# errors="ignore" skips names that are absent from the export, matching the
# original "keep every column not in drop_cols" filter.
df = df.drop(columns=drop_cols, errors="ignore")
# Discard every row that still contains a missing value.
df = df.dropna()


# Split the sanitized frame into features and target using the project helper.
x, y = df_to_xy(df, fuse_risk=False, centered=False)

## Linear regression (continuous dataset)

**Accuracy / Loss - For model comparison**

In [2]:
from core.models.metrics import MAX_GAIN


#### Benchmark

In [3]:
from core.utils.benchmark import process_benchmarks, process_benchmark_cv, process_model


**Keras Model**

In [4]:
from core.models.deep import KerasModel, loss_tf, gain_tf

Using TensorFlow backend.


In [5]:
import os, sys


**Featureless model**

In [6]:
from core.models import EMModel, RandomModel

**Conservative model**

In [7]:
from core.models import ConservativeModel

**Linear/Logistic regression**

In [8]:
from sklearn.linear_model import LinearRegression, LogisticRegression

**SVM**

In [9]:
from sklearn.svm import SVC

**Random Forest**

In [10]:
# Import from the public package: the private `sklearn.ensemble.forest` module
# path was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

**Clustering**

In [11]:
from core.models import ClusterModel, ClusterExtModel

In [12]:
from sklearn.neural_network import MLPClassifier

**Acceptance model**

In [13]:
from core.models import AcceptanceModel

**Baseline models**

**Clustering based models**

**TOP MODELS**

In [14]:
# Models to benchmark, keyed by the label that appears in the results table.
# NOTE(review): copy_X=False lets scikit-learn overwrite the training matrix in
# place; confirm process_benchmarks hands each model its own copy of `x`.
benchmark_models = {
    "random": RandomModel(MAX_GAIN),
    "conservative": ConservativeModel(MAX_GAIN),
    "em": EMModel(MAX_GAIN),
    "svc": SVC(gamma='auto'),
    "cluster-meanshift": ClusterExtModel(base_model="meanshift"),
    "linear_regressor": LinearRegression(copy_X=False),
    "nn": KerasModel(no_hidden_layer=False, epochs=500),
    'acceptance_model': AcceptanceModel(SVC(gamma='auto')),
    "random_forest": RandomForestClassifier(n_estimators=100, min_samples_leaf=5),
}

# Run every model on the same features/targets; `y` is flattened to 1-D.
# augment_data=[None] presumably means "no data augmentation" -- TODO confirm.
results = process_benchmarks(benchmark_models, x, y.ravel(), augment_data=[None])

# Aggregate the per-model result objects into their mean and standard deviation.
results_mean = {key: item.mean() for key, item in results.items()}
results_std = {key: item.std() for key, item in results.items()}
# One row per model, sorted ascending by avg_loss_ratio (no in-place mutation).
results_df = pd.DataFrame(results_mean).T.sort_values("avg_loss_ratio")
results_df

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
em,11.771429,0.187675,11.537176,1.0,242.0,386.095238,0.049048
svc,11.771429,0.187675,11.537176,1.0,242.0,386.095238,0.049048
cluster-meanshift,11.771429,0.187675,11.537176,1.0,242.0,386.095238,0.049048
random_forest,12.161905,0.199538,11.091751,0.614088,250.0,405.857143,0.068571
nn,38.228571,0.636497,39.010902,0.404785,784.999999,2008.119047,0.019524
random,36.291306,0.655434,21.805543,0.190539,748.067432,1764.848562,0.504762
acceptance_model,38.166667,0.706664,16.913781,0.231395,787.0,1921.880952,0.640476
linear_regressor,42.320793,0.739311,34.335815,0.152042,871.976648,2250.104788,0.457143
conservative,54.466667,0.916466,56.149749,1.0,1122.0,3288.595238,0.029524


In [15]:
# Columns that are left out of the LaTeX table.
excluded_cols = ["loss_sum", "mse"]
kept_cols = [col for col in results_df.columns if col not in excluded_cols]
results_df = results_df[kept_cols]

# Emit the table as LaTeX source with three decimals per value.
latex_table = results_df.to_latex(float_format="%.3f")
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
{} &  avg\_loss &  avg\_loss\_ratio &  avg\_win\_loss &  invariance &  rejection\_ratio \\
\midrule
em                &    11.771 &           0.188 &        11.537 &       1.000 &            0.049 \\
svc               &    11.771 &           0.188 &        11.537 &       1.000 &            0.049 \\
cluster-meanshift &    11.771 &           0.188 &        11.537 &       1.000 &            0.049 \\
random\_forest     &    12.162 &           0.200 &        11.092 &       0.614 &            0.069 \\
nn                &    38.229 &           0.636 &        39.011 &       0.405 &            0.020 \\
random            &    36.291 &           0.655 &        21.806 &       0.191 &            0.505 \\
acceptance\_model  &    38.167 &           0.707 &        16.914 &       0.231 &            0.640 \\
linear\_regressor  &    42.321 &           0.739 &        34.336 &       0.152 &            0.457 \\
conservative      &    54.467 &           0.916 &        56.150 