## Feature reduction

In [1]:
import os
import sys
# Resolve the parent of the current working directory to an absolute path and
# add it to sys.path (only once), so imports are also searched for there.
# NOTE(review): presumably this is the repository root that holds the `core`
# package imported below — confirm against the project layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb
import imblearn

from core.utils.preprocessing import df_to_xydf, df_to_xy

# Load the CSV once; `df_full` keeps every column and row as read from disk.
df_full = pd.read_csv("../data/t00/data_t00.csv")
drop_cols = [
    "worker_id", "resp_worker_id", "prop_worker_id", "updated", "status", "job_id",
    "status", "timestamp", "rowid", "offer_dss", "offer", "offer_final",
]
# Working frame: remove the columns listed above, then every row that still
# contains a missing value.
keep_mask = ~df_full.columns.isin(drop_cols)
df = df_full.loc[:, keep_mask].dropna()

# Column names of the working frame with "min_offer" moved to the end.
cols = df.columns[df.columns != "min_offer"].tolist() + ["min_offer"]




## Determine significant features

**Feature importance permutation**

In [2]:
from core.utils.preprocessing import df_to_xy, df_to_xydf
from core.models import AcceptanceModel
from core.utils.benchmark import process_model, process_benchmark_cv
from mlxtend.evaluate import feature_importance_permutation

## Train model with top features

In [3]:
from core.models import AcceptanceModel, EMModel
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from core.utils.selection import ffs

# Run `ffs` (imported from core.utils.selection) on the working frame with a
# default BaggingClassifier and cv=3, then print what it returns.
# NOTE(review): `ffs` presumably performs forward feature selection; the
# printed result looks like a (selected column names, score) tuple — confirm
# what the score measures.
model = BaggingClassifier()
tmp = ffs(model, df, cv=3)
print(tmp)

(['cpc_q2'], 0.18823433631342437)


In [4]:
# Build X and y from the two columns used by the following cells.
# An earlier call selecting ['cpc_q7', 'ras_q21', 'ras_q5'] was removed: its
# result was overwritten on the very next line and never used.
X, y = df_to_xy(df, select_columns=['cpc_expected_value', 'ras_q3'])

In [5]:
# Fit on a flattened (1-D) target. Passing the column-shaped `y` makes
# scikit-learn emit a DataConversionWarning from column_or_1d; the other cells
# of this notebook already ravel `y` before fitting.
model.fit(X, y.ravel())
# Distinct values the fitted model predicts on the data it was trained on.
np.unique(model.predict(X))

  y = column_or_1d(y, warn=True)


array([  0,   5,  10,  20,  25,  30,  35,  40,  45,  50,  55,  60, 100])

In [6]:
# Frequency of each distinct "min_offer" value in the working frame.
df["min_offer"].value_counts()

50     41
40     20
45      8
5       8
35      5
30      4
100     3
20      3
25      2
10      2
0       2
60      1
55      1
Name: min_offer, dtype: int64

In [7]:
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, PassiveAggressiveRegressor, SGDClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from core.models import AcceptanceModel, ClusterModel, ClusterExtModel
from core.utils.selection import ffs
from core.utils.selection import select_corr_columns

# Compute a column pre-selection from the working frame.
# NOTE(review): `select_corr_columns` presumably picks columns by correlation —
# confirm in core.utils.selection.
selected_columns = select_corr_columns(df)
# NOTE(review): the pre-selection is discarded right away — this None replaces
# the value computed on the line above. Drop one of the two lines once it is
# clear which behaviour is wanted.
selected_columns = None
# Rebind the notebook-level `model` to a ClusterExtModel with default arguments.
model = ClusterExtModel()
# model = EMModel()

# Disabled experiment: run `ffs` with the model and column selection above.
# tmp = ffs(model, df, selected_columns=selected_columns, cv=3)
# tmp

In [8]:
from core.utils.benchmark import process_benchmark_cv
# Estimator under test. Despite the `model_svc` name (kept so that any other
# cell referring to it keeps working) this is a logistic regression.
# Alternatives tried here before: SVC(gamma="auto") (kernels: 'linear',
# 'poly', 'rbf', 'sigmoid'), EMModel() and AcceptanceModel().
model_svc = LogisticRegression(multi_class="auto", solver="lbfgs")

# Feature subset for this run. A preceding df_to_xy call on
# ['cpc_expected_value', 'ras_q3'] (centered=True, normalize=True) was removed
# because its result was overwritten before being used; another variant tried
# was ['cpc_q7', 'ras_q21', 'ras_q3'] with default options.
feature_columns = ['cpc_q7', 'cc_beta', 'ras_q8', 'ras_q21', 'cpc_expected_value_max']
X, y = df_to_xy(df, select_columns=feature_columns, centered=False, normalize=True)
y = y.ravel()  # flatten the target to 1-D before fitting

# Sanity check: which distinct values does the model predict on the data it
# was fitted on?
model_svc.fit(X, y)
print("UNIQUE: ", np.unique(model_svc.predict(X)))

# Benchmark the estimator and report the mean of the avg_loss_ratio column.
# NOTE(review): the result table presumably holds one row per CV fold — confirm
# in core.utils.benchmark.
res = process_benchmark_cv(model_svc, X, y)
print(res.avg_loss_ratio.mean())
res

UNIQUE:  [40 50]
0.22762533227006912


Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
0,13.25,0.245324,12.5,1.0,265,433.75,0.1
1,12.5,0.199364,8.055556,0.346065,250,455.0,0.1
2,10.0,0.115231,10.0,0.366025,200,377.5,0.0
3,14.5,0.251902,13.333333,0.346065,290,500.0,0.1
4,16.75,0.326306,11.875,0.324581,335,593.75,0.2


In [9]:
from sklearn.cluster import KMeans, MeanShift
from core.models import ClusterExtModel
from core.models.metrics import avg_loss_ratio
from core.utils.benchmark import process_benchmark_cv



# Shuffle the rows before building X and y. The fixed random_state makes the
# shuffle itself repeatable across kernel restarts (the unseeded call produced
# a different row order on every run).
# NOTE(review): the model or the benchmark may add randomness of their own.
X, y = df_to_xy(df.sample(frac=1.0, random_state=0))

# `bandwidth` is passed by keyword: it is a keyword-only parameter in current
# scikit-learn releases, where the positional form MeanShift(2.9) raises a
# TypeError.
clf = MeanShift(bandwidth=2.9)
model = ClusterExtModel(base_model=clf)
# Quick in-sample check, kept for reference:
# model.fit(X, y)
# avg_loss_ratio(y, model.predict(X))

# Benchmark the model and report the mean of the avg_loss_ratio column.
res = process_benchmark_cv(model, X, y)
print(res.avg_loss_ratio.mean())
res

0.19027338597075433


Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
0,10.75,0.206801,9.444444,1.0,215.0,383.75,0.1
1,10.5,0.145041,10.5,1.0,210.0,267.5,0.0
2,13.25,0.264495,14.722222,1.0,265.0,446.25,0.1
3,9.5,0.15205,7.894737,1.0,190.0,275.0,0.05
4,14.5,0.182981,14.5,0.403829,290.0,477.5,0.0


In [10]:
# Least-squares cubic polynomial of the flattened target against the first
# column of X; np.polyfit returns the coefficients highest power first.
first_feature = X[:, 0]
f = np.polyfit(first_feature, y.ravel(), deg=3)
f

array([  62.47192597, -106.0618868 ,   51.62753472,   34.22202852])

In [11]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# Three base classifiers combined by majority ("hard") voting.
clf1 = LinearSVC()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = PassiveAggressiveClassifier()
# The estimator labels now name the estimators they refer to; the previous
# 'lr' / 'gnb' labels pointed at a LinearSVC and a PassiveAggressiveClassifier.
eclf1 = VotingClassifier(
    estimators=[('svc', clf1), ('rf', clf2), ('pa', clf3)],
    voting='hard',
)

# Shuffle the rows before building X and y. The fixed random_state makes the
# shuffle itself repeatable across kernel restarts.
X, y = df_to_xy(df.sample(frac=1.0, random_state=0))
y = y.ravel()  # flatten the target to 1-D

# Disabled experiments, kept for reference:
# res = process_benchmark_cv(clf1, X, y)
# print(res.avg_loss_ratio.mean())
# res
# tmp = ffs(model, df, cv=3)
# tmp

In [97]:
from core.models.split import SplitModel
from core.models import ClusterModel, ClusterExtModel
from sklearn.linear_model import LogisticRegression, PassiveAggressiveRegressor, PassiveAggressiveClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import LinearSVR, LinearSVC, SVC, SVR

# Base estimator wrapped by SplitModel. Only this assignment was ever
# effective: the SVR(gamma="auto") and BaggingRegressor(n_estimators=1)
# instances built before it were overwritten without being used and have been
# removed. Other candidates tried: LogisticRegression(solver="lbfgs").
base_model = PassiveAggressiveClassifier()

clf = SplitModel(base_model=base_model)
# Shuffle the rows before building X and y. The fixed random_state makes the
# shuffle itself repeatable across kernel restarts.
# NOTE(review): the estimator and the benchmark may add randomness of their own.
X, y = df_to_xy(df.sample(frac=1.0, random_state=0))
y = y.ravel()  # flatten the target to 1-D

# Benchmark the model and report the mean of the avg_loss_ratio column.
res = process_benchmark_cv(clf, X, y)
print(res.avg_loss_ratio.mean())
res

0.1882733859707544


Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,invariance,loss_sum,mse,rejection_ratio
0,15.5,0.213422,13.947368,1.0,310,560.0,0.05
1,11.5,0.151697,11.5,1.0,230,327.5,0.0
2,11.25,0.267026,10.882353,1.0,225,383.75,0.15
3,9.75,0.177019,10.263158,1.0,195,283.75,0.05
4,10.0,0.132202,10.0,1.0,200,290.0,0.0


In [98]:
# Refit on the complete X/y and list the distinct values predicted in-sample.
clf.fit(X, y)
in_sample_predictions = clf.predict(X)
np.unique(in_sample_predictions)

array([50])