## Features reduction

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb
import imblearn

from core.utils.preprocessing import df_to_xydf, df_to_xy

# Read and sanitize the data
df = pd.read_csv("../data/t00/data_t00.csv")
df_full = df.copy()
drop_cols = ["worker_id", "resp_worker_id", "prop_worker_id", "updated", "status", "job_id", "status", "timestamp", "rowid", "offer_dss", "offer", "offer_final"]
df = df[[col for col in df.columns if col not in drop_cols]]
df = df.dropna()

cols = [col for col in df.columns if col != "min_offer"] + ["min_offer"]




## Determine significant features

**Feature importance permutation**

In [2]:
from core.utils.preprocessing import df_to_xy, df_to_xydf
from core.models import AcceptanceModel
from core.utils.benchmark import process_model, process_benchmark_cv
from mlxtend.evaluate import feature_importance_permutation

## Train model with top features

In [3]:
from core.models import AcceptanceModel, EMModel
from core.utils.selection import ffs

#model = AcceptanceModel()
model = EMModel()
tmp = ffs(model, df, cv=2)
print(tmp)

(['cc_beta'], 0.18827338597075433)


In [4]:
X, y = df_to_xy(df, select_columns=['cpc_q7', 'ras_q21', 'ras_q5'])
X, y = df_to_xy(df, select_columns=['cpc_expected_value', 'ras_q3'])

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from core.models.metrics import avg_gain_ratio, avg_loss_ratio

X, y = df_to_xy(df)

def my_binary_scoring(estimator, x, y):
    pred = estimator.predict(x)
    return avg_loss_ratio(y, pred)

limit = 40
y_binary = (y <= limit).astype(int).ravel()
y_binary[y_binary==1] = limit
y_binary[y_binary==0] = 50

model = SVC()
#model.score(X, y)
score = cross_val_score(model, X, y_binary, scoring=my_binary_scoring)
print(score)
y_binary.mean()

[0.07843137 0.07575758 0.1010101 ]




45.4

In [6]:
df.min_offer.value_counts()

50     41
40     20
45      8
5       8
35      5
30      4
100     3
20      3
25      2
10      2
0       2
60      1
55      1
Name: min_offer, dtype: int64

In [33]:
from sklearn.svm import LinearSVC, LinearSVR, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM, MLPClassifier
from core.utils.selection import ffs

model = SVC(gamma="auto", kernel="poly")
model = LogisticRegression()
model = MLPClassifier((8, 4, 2, 4, 8, 16), solver="sgd")
enforced_target_values = {(0, 40):40, (45, 100):50}
enforced_target_values = None
tmp = ffs(model, df, cv=3, enforced_target_values=enforced_target_values, early_stop=0)
print(tmp)
#model = AcceptanceModel()

(['cc_beta'], 0.18823433631342437)


In [8]:
# model_svc = SVC(gamma="auto", kernel="poly")
# # linear’, ‘poly’, ‘rbf’, ‘sigmoid’
# X, y = df_to_xy(df, select_columns=['cpc_expected_value', 'ras_q3'])
# model_svc.fit(X, y)
# np.unique(model_svc.predict(X))

In [9]:

limit = 25
X, y = df_to_xy(df.sample(frac=1.0))
y_binary = (y <= limit).astype(int).ravel()
y_binary[y_binary==1] = limit
y_binary[y_binary==0] = 50

avg_loss_ratio(y, y_binary)
split = 75
model = SVC()
model.fit(X[:split], y_binary[:split])
p = model.predict(X[split:])
avg_loss_ratio(y[split:], p)



0.27872951609793717

In [10]:

def target_to_enforced_target_values(y, enforced_target_values):
    y = y.ravel()
    res = np.array(y)
    for (lower, higher), value in enforced_target_values.items():
        res[(lower<y) & (y<=higher)] = value
    return res
enforced_target_values = {(-1, 20):40, (20, 100):50}
print(target_to_enforced_target_values(y, enforced_target_values))

[50 50 50 50 50 40 50 50 50 50 50 50 50 50 40 50 50 50 50 50 40 50 50 50
 50 50 50 50 50 40 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50 50 50 50 40 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 40
 50 40 50 50 40 50 40 50 50 50 40 50 40 40 50 50 40 50 50 50 50 50 40 50
 50 50 50 40]


In [30]:
df_full["min_offer"].mean()

40.9