## Feature reduction

In [5]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

# Categorical variant of the dataset; any row containing a missing value is discarded.
df = pd.read_excel("../data/UG_HH_NEW_categorical_no200.xls").dropna()

# Per-column minimum / maximum of the continuous variant of the dataset.
# Later cells hand these to df_to_xy as df_min / df_max.
df_full = pd.read_excel("../data/UG_HH_NEW_continuous_no200.xls")
df_min, df_max = df_full.min(), df_full.max()

# Drop 'protected' features (labels that are absent from df are simply skipped).
drop_cols = ['prop', 'other_prop', 'other_resp']
df = df.drop(columns=drop_cols, errors="ignore")

df.head()

Unnamed: 0,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,...,equal_income,asian,white,lazy_stupid,diligent,completely_selfish,complete_donor,expect_50less,expect_100,min_offer
0,2,4,1,2,4,2,5,4,2,5,...,0,0,0,0,0,0,0,0,0,80
1,3,1,5,1,4,5,5,1,3,2,...,0,0,0,1,0,1,0,1,0,50
2,3,4,3,4,2,5,3,3,2,4,...,0,0,0,1,0,0,0,0,1,100
3,4,4,2,4,3,3,4,4,2,4,...,0,0,0,1,0,0,0,0,1,100
4,4,4,2,4,4,2,5,4,3,5,...,0,0,0,0,0,0,0,0,0,95


## Individual plots

In [3]:
# Disabled exploratory pair plots of selected features against min_offer.
# NOTE(review): the first call uses 'prop', 'other_resp' and 'other_prop', which the
# loading cell removes from df (drop_cols), so it needs those columns restored before
# it can be re-enabled. Presence of the other columns in df is not verified here.
# sb.pairplot(df, x_vars=('prop','other_resp','other_prop'), y_vars='min_offer', height=7, aspect=0.7, kind='reg')
# sb.pairplot(df, x_vars=('cells', 'selfish','count_effort'), y_vars='min_offer', height=7, aspect=0.7)
# sb.pairplot(df, x_vars=('Honesty_Humility', 'Extraversion','Agreeableness'), y_vars='min_offer', height=7, aspect=0.7)

## Determine significant features

In [6]:
from utils.preprocessing import df_to_xy, df_to_xydf
from models import AcceptanceModel
from utils.benchmark import process_model, process_benchmark_cv

# Benchmark every column of df (except min_offer) on its own: a fresh
# AcceptanceModel is run through process_benchmark_cv and the mean of the
# returned results is stored under the feature's name.
res = {}
features = [name for name in df if name != 'min_offer']
for col in features:
    # 'risk' is the only column preprocessed with fuse_risk=True; its entry is
    # stored under 'risk*' so that special handling shows up in the table.
    is_risk = col == 'risk'
    extra_kwargs = {'fuse_risk': True} if is_risk else {}
    x, y = df_to_xy(df, select_columns=[col], df_min=df_min, df_max=df_max, **extra_kwargs)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res['risk*' if is_risk else col] = item_res.mean()

# One row per feature, ordered by ascending avg_loss_ratio.
res_single_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_single_df

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
q26,27.164286,0.218293,25.21599,967.0,1470.575397,0.060782
India,27.049206,0.218445,24.688663,963.0,1491.15873,0.063902
q12,27.049206,0.219333,24.239084,963.0,1518.65873,0.067143
USA,27.049206,0.219333,24.239084,963.0,1518.65873,0.067143
genius,27.049206,0.219333,24.239084,963.0,1518.65873,0.067143
asian,27.049206,0.219333,24.239084,963.0,1518.65873,0.067143
white,27.104762,0.219796,24.29969,965.0,1521.436508,0.066989
q3,27.220635,0.220885,24.426584,969.0,1523.515873,0.066
q29,27.220635,0.220975,24.415869,969.0,1521.801587,0.070571
donation_c,27.55,0.221657,25.62638,981.0,1488.75,0.060845


In [7]:
from itertools import combinations


# Candidate predictors: every column of df except min_offer.
features = [name for name in df if name != 'min_offer']

# Benchmark every unordered pair of candidate features.
res = {}
for first, second in combinations(features, 2):
    # Skip a pair made of the same label twice.
    if first == second:
        continue
    # select_columns is passed as a list, not the tuple combinations() yields.
    cols = [first, second]
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per "a:b" pair, ordered by ascending avg_loss_ratio; show the first ten.
res_duo_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_duo_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
q26:q29,26.792857,0.215351,24.822051,954.0,1445.003968,0.064047
q26:asian,26.821429,0.21544,24.852354,955.0,1451.718254,0.062905
q4:q29,26.715873,0.215463,24.343128,951.0,1479.730159,0.06755
q26:white,26.849206,0.215671,24.882657,956.0,1452.968254,0.062905
q16:India,26.771429,0.215807,24.400271,953.0,1487.269841,0.066989
q4:white,26.854762,0.216678,24.487716,956.0,1483.797619,0.066526
q4:donation_b,26.994444,0.216871,25.030709,961.0,1462.964286,0.062723
donation_b:ethnicity,27.022222,0.217017,25.05928,962.0,1468.103175,0.062569
q3:donation_b,27.022222,0.217017,25.05928,962.0,1468.103175,0.062569
q12:q13,27.1,0.21797,25.16829,965.0,1461.285714,0.07483


In [None]:
from itertools import combinations


# Candidate predictors: every column of df except min_offer.
features = [name for name in df if name != 'min_offer']

# Benchmark every unordered triple of candidate features.
res = {}
for cols in combinations(features, 3):
    # select_columns is passed as a list, not the tuple combinations() yields.
    cols = list(cols)
    # combinations() picks by position, so a label can only repeat if df has
    # duplicate column names. Skip such a triple wherever the repeat occurs;
    # the previous check compared only the first two entries.
    if len(set(cols)) < len(cols):
        continue
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per "a:b:c" triple, ordered by ascending avg_loss_ratio; show the first ten.
res_trio_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_trio_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
q4:q26:asian,26.738095,0.214699,24.76664,952.0,1448.52381,0.063368
q12:q16:white,26.688095,0.21522,24.310931,950.0,1478.519841,0.07347
q4:q12:white,26.715873,0.215335,24.347538,951.0,1482.269841,0.066063
q26:genius:asian,26.821429,0.21544,24.852354,955.0,1451.718254,0.062905
q26:USA:asian,26.821429,0.21544,24.852354,955.0,1451.718254,0.062905
q4:q29:USA,26.71746,0.215454,24.348485,951.0,1479.301587,0.067532
q26:age:USA,26.85,0.215649,24.875731,956.0,1454.718254,0.063394
q26:asian:white,26.849206,0.215671,24.882657,956.0,1452.968254,0.062905
q26:USA:white,26.849206,0.215671,24.882657,956.0,1452.968254,0.062905
q26:ethnicity:asian,26.849206,0.215718,24.880925,956.0,1451.857143,0.062596


In [None]:
from itertools import combinations


# Candidate predictors: every column of df except min_offer.
features = [name for name in df if name != 'min_offer']

# Benchmark every unordered quadruple of candidate features.
# NOTE(review): the number of 4-combinations grows roughly with len(features)**4
# and each one runs a full process_benchmark_cv, so this cell can take very long.
res = {}
for cols in combinations(features, 4):
    # select_columns is passed as a list, not the tuple combinations() yields.
    cols = list(cols)
    # combinations() picks by position, so a label can only repeat if df has
    # duplicate column names. Skip such a quadruple wherever the repeat occurs;
    # the previous check compared only the first two entries.
    if len(set(cols)) < len(cols):
        continue
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per "a:b:c:d" quadruple, ordered by ascending avg_loss_ratio; show the first ten.
res_quad_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_quad_df.head(10)

## Train model with top features

In [7]:
# Features used to train the final model.
# NOTE(review): unlike the benchmark cells, this call passes min_target / max_target
# and no df_min / df_max - confirm that is intended. Also confirm both columns are
# present in df as loaded at the top of the notebook.
top_columns = ['selfish', 'time_spent_prop']
x, y = df_to_xy(
    df,
    select_columns=top_columns,
    min_target=20,
    max_target=180,
)


In [8]:
# Positional 60/40 split: the leading 60% of the rows (in their existing order,
# no shuffling) are used for training, the remainder for testing.
split = int(0.6 * x.shape[0])
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]


In [9]:
# Fit a fresh AcceptanceModel on the training portion only; xTest / yTest are
# kept aside for the evaluation cell that follows.
model = AcceptanceModel()
model.fit(xTrain, yTrain)

In [10]:
from models.metrics import gain_mean, avg_loss_ratio
# Predict on the held-out rows and report each metric as "<label> <value>".
yPred = model.predict(xTest)
for label, metric in (("Mean gain: ", gain_mean), ("AVG loss ratio: ", avg_loss_ratio)):
    print(label, metric(yTest, yPred))

Mean gain:  97.65625
AVG loss ratio:  0.16550664472329668


In [11]:
# Show which distinct values the model actually predicts on the test rows.
unique_predictions = np.unique(yPred)
print("Unique predicted values: ", unique_predictions)

Unique predicted values:  [ 95. 100. 105. 110.]
