## Data preparation and sanitization

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb
import imblearn

# Read and sanitize the data: load the continuous dataset and drop every row with a missing value.
df = pd.read_excel("../data/UG_HH_NEW_continuous_no200.xls")
#df = pd.read_excel("./UG_HH_NEW_categorical_no200.xls")
df = df.dropna()

# Min-max scale the two effort proxies so that their product stays within [0, 1].
# A constant column has a zero range; dividing by 1 instead maps it to 0 rather than NaN.
df_effort = df[['time_spent_prop', 'count_effort']]
effort_range = (df_effort.max() - df_effort.min()).replace(0, 1)
df_effort = (df_effort - df_effort.min()) / effort_range

# assign() returns a new frame instead of writing into the dropna() result in place.
df = df.assign(effort=df_effort['time_spent_prop'] * df_effort['count_effort'])

# Keep the model inputs, with the target (min_offer) as the last column.
# (The former intermediate selection that also kept 'time_spent_risk' and 'cells'
# was overwritten immediately by this one, so it has been dropped.)
df = df[['effort', 'selfish','Honesty_Humility','Extraversion', 'Agreeableness', 'min_offer']]

# Discard rows whose min_offer exceeds the accepted cap.
MAX_ACCEPTABLE_MIN_OFFER = 150
df = df[df['min_offer'] <= MAX_ACCEPTABLE_MIN_OFFER]


NORMALISE_DATA = True


# Features are every column but the last; the target stays 2-D with shape (n_rows, 1).
x = df.values[:, :-1]
y = df.values[:, -1:]

if NORMALISE_DATA:
    # NOTE(review): min/max are taken over the whole dataset before the split below,
    # so held-out rows influence the scaling -- confirm this is acceptable.
    x_min = x.min(axis=0)
    x_max = x.max(axis=0)
    x_range = x_max - x_min
    # Guard against constant feature columns (0 / 0 would yield NaN).
    x_range = np.where(x_range == 0, 1, x_range)
    x = (x - x_min) / x_range

NB_FEATURES = x.shape[1]

# Hold out one third of the rows; the fixed seed makes the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 1/3, random_state = 0)

## Linear regression (continuous dataset)

**Accuracy / Loss - For model comparison**

In [2]:
from models.metrics import avg_loss, avg_loss_ratio, avg_win_loss, loss_sum, mse, rejection_ratio

Using TensorFlow backend.


#### Benchmark

In [3]:
# Metric callables applied to every fitted model; each is called as func(y_true, y_pred)
# and reported under its __name__.
benchmark_functions = [avg_loss, mse, rejection_ratio, avg_win_loss, avg_loss_ratio, loss_sum]

from sklearn.model_selection import KFold
import numpy as np

def process_model(model, xTrain, yTrain, xTest, yTest, fit_kwargs=None, predict_kwargs=None, metrics=None):
    """
    Fit ``model`` on the training data and score its predictions on the test data.

    :param model: estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X, **kwargs)``
    :param xTrain: training features
    :param yTrain: training targets
    :param xTest: test features
    :param yTest: test targets, passed as first argument to every metric
    :param fit_kwargs: (dict|None) extra keyword arguments forwarded to ``model.fit``
    :param predict_kwargs: (dict|None) extra keyword arguments forwarded to ``model.predict``
    :param metrics: (iterable of callables|None) each called as ``func(yTest, yPredict)``;
        when None, the notebook-level ``benchmark_functions`` list is used
    :returns: dict mapping each metric's ``__name__`` to the value it returned
    """
    fit_kwargs = {} if fit_kwargs is None else fit_kwargs
    predict_kwargs = {} if predict_kwargs is None else predict_kwargs
    # Only fall back to the global list when the caller did not choose metrics.
    scorers = benchmark_functions if metrics is None else metrics
    model.fit(xTrain, yTrain, **fit_kwargs)
    yPredict = model.predict(xTest, **predict_kwargs)
    return {func.__name__: func(yTest, yPredict) for func in scorers}


def process_benchmark_cv(model, X, y, cv=5, metrics=None, fit_kwargs=None, predict_kwargs=None, augment_kwargs=None):
    """
    Benchmark ``model`` with K-fold cross-validation, optionally augmenting each training fold.

    :param model: estimator passed to ``process_model`` (re-fitted on every fold)
    :param X: features; copied into a new array before use
    :param y: targets; copied into a new array before use
    :param cv: (int) number of folds given to ``KFold(n_splits=cv)``
    :param metrics: (iterable of callables|None) forwarded to ``process_model``;
        previously this argument was accepted but ignored
    :param fit_kwargs: (dict|None) forwarded to ``model.fit``
    :param predict_kwargs: (dict|None) forwarded to ``model.predict``
    :param augment_kwargs: (dict|None) when non-empty, keyword arguments for
        ``DACombine().fit_predict`` applied to the training fold only
    :returns: pandas DataFrame with one row per fold and one column per metric
    """
    # We make sure original values aren't modified, even by mistake
    X = np.copy(X)
    y = np.copy(y)

    kf = KFold(n_splits=cv)
    results = []
    for train_index, test_index in kf.split(X):
        xTrain, yTrain = X[train_index], y[train_index]
        if augment_kwargs:
            # DACombine is looked up when this line runs, so it must have been
            # imported into the notebook namespace before the first augmented call.
            xTrain, yTrain = DACombine().fit_predict(xTrain, yTrain, **augment_kwargs)
        # The test fold is never augmented.
        xTest, yTest = X[test_index], y[test_index]
        benchmark_result = process_model(model, xTrain, yTrain, xTest, yTest,
                                         fit_kwargs, predict_kwargs, metrics)
        results.append(benchmark_result)
    return pd.DataFrame(results)

**Data augmentation**

In [10]:
from utils.data_augmentation import DACombine

In [5]:
# #keras
# import math
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import multiply
# from keras.wrappers.scikit_learn import KerasRegressor
# import keras.backend as K
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# import tensorflow as tf
# import sys


# def sigmoid1024_tf(x):
#     return (1024**x) / (1024**x + 1)

# def sigmoid_tf(x):
#     return K.sigmoid(x)

# def gain_tf(y_true, y_pred):
#     math_pi = tf.constant(math.pi)
#     one = tf.constant(1.0)
#     ten = tf.constant(10.0)
#     x = tf.math.subtract(y_true, y_pred)
#     x = tf.math.truediv(x, ten)
#     left_mul = sigmoid_tf(x)
#     right_mul = tf.math.cos(tf.math.divide(x, math_pi))
#     return tf.math.multiply(left_mul, right_mul)


# def loss_tf(y_true, y_pred):
#     math_pi = tf.constant(math.pi)
#     one = tf.constant(1.0)
#     ten = tf.constant(10.0)
#     x0 = tf.math.subtract(y_true, y_pred)
#     x = tf.math.truediv(x0, ten)
#     left_mul = sigmoid_tf(x)
#     right_mul = tf.math.cos(tf.math.divide(x, math_pi))
#     return tf.math.subtract(one*2, tf.math.multiply(left_mul, right_mul))

# def _keras_model(loss=None, metrics=None):
#     """
#     build a simple regression model
#     :param loss: (str|callable, default: loss_tf)
#     """
#     if loss is None:
#         loss = loss_tf
#     if metrics is None:
#         metrics = ["mse"]
#     model = Sequential()
#     model.add(Dense(8, input_dim=NB_FEATURES, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(8, activation="relu"))
#     model.add(Dense(8, activation="relu"))
#     model.add(Dense(1, kernel_initializer='normal'))
#     model.compile(loss=loss, optimizer='adam', metrics=metrics)
#     return model

# def _keras_linear_regression(loss=None, metrics=None):
#     if loss is None:
#         loss = "mse"
#     if metrics is None:
#         metrics = ["mse"]
#     model = Sequential()
#     model.add(Dense(1, input_dim=NB_FEATURES, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(1, kernel_initializer='normal'))
#     model.compile(loss=loss, optimizer='adam', metrics=metrics)
#     return model

# def keras_linear_regression(loss=None, metrics=None, nb_epoch=100, batch_size=32, verbose=False):
#     build_fn = lambda : _keras_linear_regression(loss, metrics)
#     return KerasRegressor(build_fn=build_fn, epochs=nb_epoch, batch_size=batch_size, verbose=verbose)
    
# def keras_model(loss=None, metrics=None, nb_epoch=100, batch_size=32, verbose=False):
#     build_fn = lambda : _keras_model(loss, metrics)
#     return KerasRegressor(build_fn=build_fn, epochs=nb_epoch, batch_size=batch_size, verbose=verbose)

**Linear regression**

In [6]:
from sklearn.linear_model import LinearRegression, BayesianRidge, LogisticRegression, PassiveAggressiveRegressor, \
                                 ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble.bagging import BaggingRegressor, DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor
# Estimators compared in this round; each one is cross-validated under every augmentation setting.
benchmark_models = dict(
    #linear_regression=LinearRegression(),
    svc=SVC(gamma='auto'),
    log_regression=LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto'),
)

# Keyword arguments handed to DACombine().fit_predict for each setting.
# The empty 'base' entry is falsy, so process_benchmark_cv skips augmentation for it.
augment_params = {
    'base': dict(),
    'retarget': dict(retarget=True, distance=10),
#     'x2': {'size':len(xTrain)*2},
#     'x2-up': {'size':len(xTrain)*2, 'upsample': True},
#     'x2+xy': {'size':len(xTrain)*2, 'include_xy':True},
#     'x4': {'size': len(xTrain)*4},
#     'x4+xy-up': {'size': len(xTrain)*4, 'include_xy':True, 'upsample':True},
    'x16': dict(size=len(xTrain)*16),
    'x16-up': dict(size=len(xTrain)*16, upsample=True),
    'x16_dist': dict(size=len(xTrain)*16, distribution=True),
    'x16_combine': dict(size=len(xTrain)*16, retarget=True, distribution=True, upsample=True),
}

# One per-fold score table for every (model, augmentation) pair, keyed "<model>_<augmentation>".
results = {}
for key, model in benchmark_models.items():
    for aug_key, aug_params in augment_params.items():
        results["_".join((key, aug_key))] = process_benchmark_cv(
            model=model,
            X=x,
            y=y.ravel(),
            augment_kwargs=aug_params,
        )

# Collapse the folds: mean and standard deviation of each metric per run.
results_mean = {run_name: fold_scores.mean() for run_name, fold_scores in results.items()}
results_std = {run_name: fold_scores.std() for run_name, fold_scores in results.items()}
pd.DataFrame(results_mean).transpose()

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
log_regression_base,27.01746,0.214933,23.40107,962.0,1550.428571,0.067302
log_regression_retarget,36.496825,0.292325,33.400688,1300.0,2137.063492,0.067143
log_regression_x16,80.688889,0.689162,39.660516,2872.0,8719.285714,0.561746
log_regression_x16-up,85.338095,0.727573,37.340909,3037.0,9449.452381,0.623651
log_regression_x16_combine,36.496825,0.292325,33.400688,1300.0,2137.063492,0.067143
log_regression_x16_dist,39.296825,0.330512,22.103498,1398.0,3256.650794,0.213651
svc_base,27.001587,0.213637,24.000891,962.0,1517.492063,0.061587
svc_retarget,34.871429,0.276041,33.447645,1242.0,1915.333333,0.044762
svc_x16,82.511111,0.700871,44.128409,2937.0,8861.960317,0.556032
svc_x16-up,82.02619,0.693758,42.727326,2923.0,8961.710317,0.560635


**Actual best model:**
- LogisticRegression (penalty='l1')

**Data augmentation improves the following models:**
- Ensemble models

In [7]:
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
# Ensemble regressors compared in this round.
# Both estimators draw random samples while fitting, so they are seeded (same seed as
# the train/test split) to keep their contribution to the table repeatable.
# NOTE(review): DACombine may add randomness of its own -- confirm whether it needs seeding too.
benchmark_models = {
    'bag': BaggingRegressor(random_state=0),
    'forest': RandomForestRegressor(n_estimators=50, random_state=0),
    #keras_linear_regression(nb_epoch=100, batch_size=60),
}

# Keyword arguments handed to DACombine().fit_predict for each setting.
# The empty 'base' entry is falsy, so process_benchmark_cv skips augmentation for it.
# NOTE(review): 'x16_combine' has no 'upsample' flag here, unlike the same key in the
# SVC / LogisticRegression benchmark cell -- confirm this difference is intended.
augment_params = {
    'base': {},
    'retarget': {'retarget': True, 'distance': 10},
    'x16': {'size': len(xTrain)*16},
    'x16-up': {'size': len(xTrain)*16, 'upsample': True},
    'x16_dist': {'size': len(xTrain)*16, 'distribution': True},
    'x16_combine': {'size': len(xTrain)*16, 'retarget': True, 'distribution': True},
}

# One per-fold score table for every (model, augmentation) pair, keyed "<model>_<augmentation>".
results = {}
for key, model in benchmark_models.items():
    for aug_key, aug_params in augment_params.items():
        results[key+"_" + aug_key] = process_benchmark_cv(model=model, X=x, y=y.ravel(), augment_kwargs=aug_params)

# Collapse the folds: mean and standard deviation of each metric per run.
results_mean = {key: item.mean() for key, item in results.items()}
results_std = {key: item.std() for key, item in results.items()}
pd.DataFrame(results_mean).T

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
bag_base,69.82746,0.63825,25.492863,2484.4,6733.06881,0.567778
bag_retarget,54.296032,0.49421,29.185635,1931.8,4449.807778,0.371111
bag_x16,73.938071,0.667098,25.757054,2631.161181,7444.870623,0.60127
bag_x16-up,72.78686,0.65644,26.388152,2592.356125,7214.341737,0.583968
bag_x16_combine,57.571746,0.522713,27.098168,2048.6,4991.937143,0.415873
bag_x16_dist,65.237143,0.599816,27.772738,2323.4,5981.488571,0.510952
forest_base,67.235492,0.620754,24.652825,2393.28,6333.033479,0.550635
forest_retarget,55.908381,0.510751,27.811127,1990.44,4712.607797,0.398889
forest_x16,75.590134,0.682251,26.299587,2690.776663,7642.162737,0.617937
forest_x16-up,68.276288,0.625286,24.172165,2431.321487,6592.166564,0.555873


In [9]:
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.regularizers import L1L2

# reg = L1L2(l1=0.01, l2=0.01)
# model = Sequential()
# model.add(Dense(1, activation='relu', input_dim=x.shape[1]),)# W_regularizer=reg,)
# model.compile(optimizer='adam', loss="mse", metrics=["mse"])
# xTrain_a, yTrain_a = DACombine().fit_predict(xTrain, yTrain, size=1024)
# history = model.fit(xTrain_a, yTrain_a, nb_epoch=500, validation_split=0.25)
