## Feature reduction

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

# Training split of the survey data; this is the frame the later cells work on.
df = pd.read_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_train.xls")

# Column-wise extrema of the complete dataset; later cells hand these to df_to_xy
# through its df_min / df_max arguments.
df_full = pd.read_excel("../data/UG_HH_NEW_continuous_no200.xls")
df_min, df_max = df_full.min(), df_full.max()

# Drop 'protected' features (labels that are absent from the frame are skipped silently)
drop_cols = ['prop', 'other_prop', 'other_resp']
df = df.drop(columns=drop_cols, errors="ignore")

df.head()

Unnamed: 0,time_spent_risk,cells,selfish,time_spent_prop,count_effort,Honesty_Humility,Extraversion,Agreeableness,min_offer
0,43000,26,60,31000,20,4.2,3.0,2.8,50
1,12000,7,20,38000,20,1.9,4.4,2.9,50
2,88000,50,20,69000,12,3.3,3.8,3.4,80
3,24000,50,60,26000,14,3.6,3.8,2.6,50
4,137000,24,60,43000,4,3.6,2.5,2.7,100


## Individual plots

In [2]:
# Exploratory scatter plots of individual features against min_offer; all three are currently disabled.
# NOTE(review): the first call plots 'prop', 'other_resp' and 'other_prop', which are listed in
# drop_cols and removed from df in the first cell - re-enabling it as written presumably needs
# the unfiltered frame.
# sb.pairplot(df, x_vars=('prop','other_resp','other_prop'), y_vars='min_offer', height=7, aspect=0.7, kind='reg')
# sb.pairplot(df, x_vars=('cells', 'selfish','count_effort'), y_vars='min_offer', height=7, aspect=0.7)
# sb.pairplot(df, x_vars=('Honesty_Humility', 'Extraversion','Agreeableness'), y_vars='min_offer', height=7, aspect=0.7)

## Determine significant features

In [3]:
from utils.preprocessing import df_to_xy, df_to_xydf
from models import AcceptanceModel
from utils.benchmark import process_model, process_benchmark_cv

# Benchmark every candidate feature on its own.
# Candidates are the columns of df plus the pseudo-feature 'risk'; the target
# column 'min_offer' is never used as an input.
features = [name for name in list(df) + ['risk'] if name != 'min_offer']

res = {}
for col in features:
    is_risk = (col == 'risk')
    # 'risk' is not requested as a plain column: it is the only candidate for which
    # df_to_xy is called with fuse_risk=True.
    extra_kwargs = {'fuse_risk': True} if is_risk else {}
    x, y = df_to_xy(df, select_columns=[col], df_min=df_min, df_max=df_max, **extra_kwargs)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    # The fused variant is reported under the label 'risk*'.
    res['risk*' if is_risk else col] = item_res.mean()

# One row per feature, ordered by ascending avg_loss_ratio.
res_single_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_single_df

Using TensorFlow backend.


Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
selfish,26.979167,0.217393,24.542857,428.0,1482.979167,0.076563
Honesty_Humility,27.166667,0.217868,24.709524,431.0,1532.666667,0.064844
time_spent_prop,27.791667,0.223925,25.466667,441.0,1543.916667,0.065625
risk*,28.5625,0.231158,26.219048,453.0,1564.229167,0.063146
cells,29.75,0.241587,26.615385,472.0,1730.458333,0.064844
time_spent_risk,30.208333,0.248951,26.953846,479.0,1644.583333,0.09375
Agreeableness,31.008333,0.257521,25.816667,492.0,1862.0,0.088281
Extraversion,32.0,0.265528,26.779487,507.0,1915.708333,0.085938
count_effort,32.1125,0.268126,26.40641,510.0,2001.104167,0.103125


In [4]:
from itertools import combinations


# Benchmark every unordered pair of candidate features.
# The target column 'min_offer' is excluded so it is never selected as an input.
features = [name for name in df if name != 'min_offer']

res = {}
# combinations() emits each pair of distinct positions exactly once, so no duplicate
# check is needed (an equality guard on the two labels could only fire if df had
# repeated column labels).
for cols in combinations(features, 2):
    #index should be a list (combinations() yields tuples)
    cols = list(cols)
    x, y = df_to_xy(df, centered=True, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    # Row label has the form "featA:featB"; the stored value is item_res.mean().
    res[":".join(cols)] = item_res.mean()

# One row per pair, ordered by ascending avg_loss_ratio (sorted copy, no in-place mutation).
res_duo_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_duo_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
selfish:time_spent_prop,27.7875,0.223882,25.461905,441.0,1549.770833,0.081115
selfish:Honesty_Humility,28.1,0.226192,25.766667,446.0,1573.958333,0.091135
cells:time_spent_prop,28.345833,0.227576,26.057143,450.0,1611.270833,0.070042
cells:Honesty_Humility,28.733333,0.230849,26.404762,456.0,1633.0,0.060156
selfish:Extraversion,29.3,0.239017,25.890476,464.0,1658.416667,0.089844
cells:selfish,30.3875,0.247214,27.085714,481.0,1769.354167,0.070177
Honesty_Humility:Agreeableness,30.725,0.250284,27.315614,487.0,1761.25,0.083594
time_spent_prop:Honesty_Humility,31.1625,0.25548,27.431868,495.0,1876.895833,0.07174
cells:Extraversion,31.5875,0.258758,27.423397,501.0,1886.0625,0.073438
Honesty_Humility:Extraversion,31.858333,0.261197,27.549679,505.0,1880.833333,0.080469


In [5]:
from itertools import combinations


# Benchmark every unordered triple of candidate features.
# The target column 'min_offer' is excluded so it is never selected as an input.
features = [name for name in df if name != 'min_offer']

res = {}
# combinations() emits each triple of distinct positions exactly once, so no duplicate
# check is needed (the former guard compared only the first two labels and could only
# fire if df had repeated column labels).
for cols in combinations(features, 3):
    #index should be a list (combinations() yields tuples)
    cols = list(cols)
    x, y = df_to_xy(df, centered=True, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    # Row label has the form "featA:featB:featC"; the stored value is item_res.mean().
    res[":".join(cols)] = item_res.mean()

# One row per triple, ordered by ascending avg_loss_ratio (sorted copy, no in-place mutation).
res_trio_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_trio_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
selfish:time_spent_prop:Honesty_Humility,28.966667,0.232898,26.761905,460.0,1662.291667,0.092562
cells:selfish:time_spent_prop,30.245833,0.245897,27.292308,480.0,1767.145833,0.078906
selfish:time_spent_prop:Extraversion,30.483333,0.248383,27.11598,483.0,1741.625,0.08125
selfish:Honesty_Humility:Extraversion,30.8,0.250877,27.34217,488.0,1765.125,0.089844
time_spent_risk:cells:time_spent_prop,30.883333,0.252273,27.661905,490.0,1768.833333,0.115729
selfish:Extraversion:Agreeableness,30.595833,0.252409,26.324542,485.0,1766.1875,0.080469
selfish:Honesty_Humility:Agreeableness,31.091667,0.25315,27.76859,493.0,1786.25,0.075781
cells:time_spent_prop:Honesty_Humility,31.179167,0.253181,28.052381,495.0,1840.229167,0.05625
cells:time_spent_prop:Agreeableness,31.2125,0.254646,27.905082,495.0,1786.229167,0.085021
time_spent_risk:selfish:time_spent_prop,30.8875,0.25525,27.671429,490.0,1686.770833,0.137604


In [6]:
from itertools import combinations


# Benchmark every unordered 4-tuple of candidate features.
# The target column 'min_offer' is excluded so it is never selected as an input.
features = [name for name in df if name != 'min_offer']

res = {}
# combinations() emits each 4-tuple of distinct positions exactly once, so no duplicate
# check is needed (the former guard compared only the first two labels and could only
# fire if df had repeated column labels).
for cols in combinations(features, 4):
    #index should be a list (combinations() yields tuples)
    cols = list(cols)
    x, y = df_to_xy(df, centered=True, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    # Row label has the form "featA:featB:featC:featD"; the stored value is item_res.mean().
    res[":".join(cols)] = item_res.mean()

# One row per 4-tuple, ordered by ascending avg_loss_ratio (sorted copy, no in-place mutation).
res_quad_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_quad_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
cells:selfish:Extraversion:Agreeableness,30.791667,0.250824,27.426328,488.0,1761.916667,0.075781
time_spent_risk:time_spent_prop:count_effort:Agreeableness,30.925,0.252103,28.553571,490.0,1664.833333,0.075646
selfish:Honesty_Humility:Extraversion:Agreeableness,31.2125,0.253978,27.866804,495.0,1797.895833,0.077344
cells:selfish:time_spent_prop:Extraversion,31.654167,0.256973,28.350641,502.0,1860.729167,0.067969
time_spent_risk:selfish:time_spent_prop:Honesty_Humility,31.508333,0.259348,28.240476,500.0,1759.041667,0.11651
cells:selfish:time_spent_prop:count_effort,31.658333,0.259524,27.893773,503.0,1924.375,0.114302
cells:selfish:Honesty_Humility:Extraversion,32.116667,0.26093,28.776969,509.0,1884.875,0.074219
time_spent_risk:time_spent_prop:count_effort:Extraversion,32.133333,0.264463,28.707692,509.0,1795.166667,0.109375
cells:selfish:time_spent_prop:Honesty_Humility,32.620833,0.267074,28.785714,518.0,1993.854167,0.078906
time_spent_risk:selfish:time_spent_prop:count_effort,32.258333,0.267938,28.262821,512.0,1849.041667,0.137604


## Train model with top features

In [7]:
# Take the best-ranked feature pair from the pairwise benchmark: res_duo_df is ordered by
# ascending avg_loss_ratio and its index labels have the form "featA:featB", so splitting the
# first label gives the two column names. In the recorded run this was
# ['selfish', 'time_spent_prop'].
top_columns = res_duo_df.index[0].split(':')
# NOTE(review): unlike the df_to_xy calls used during the feature search, no df_min / df_max
# is passed here, and min_target / max_target are supplied instead - confirm that the
# resulting scaling is the one the final model should be trained with.
x, y = df_to_xy(df, centered=True, select_columns=top_columns, min_target=5, max_target=190)


In [8]:
# Ordered 60/40 hold-out: the first 60% of the rows train the model, the remainder
# is kept for evaluation (rows are not shuffled).
split = int(x.shape[0] * 0.6)
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]


In [9]:
# Fit a fresh AcceptanceModel on the training slice (xTrain / yTrain from the split above).
model = AcceptanceModel()
model.fit(xTrain, yTrain)

In [10]:
from models.metrics import gain_mean, avg_loss_ratio
# Score the fitted model on the held-out slice and report both metrics, one per line.
yPred = model.predict(xTest)
for label, metric in (("Mean gain: ", gain_mean), ("AVG loss ratio: ", avg_loss_ratio)):
    print(label, metric(yTest, yPred))

Mean gain:  97.65625
AVG loss ratio:  0.16558104948520147


In [11]:
# List the distinct values among the held-out predictions (yPred); the recorded run
# produced only three different values.
print("Unique predicted values: ", np.unique(yPred))

Unique predicted values:  [ 95. 100. 105.]


## Generate data for the survey

In [12]:
# Load the test split and remove the same 'protected' columns that were dropped from
# the training frame (labels that are absent are skipped silently).
df_test = pd.read_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_test.xls")

drop_cols = ['prop', 'other_prop', 'other_resp']
df_test = df_test.drop(columns=drop_cols, errors="ignore")

# Build the model inputs for the selected columns and predict; predictions are cast to int.
# NOTE(review): no df_min / df_max is passed to df_to_xydf here - confirm the test features
# end up on the same scale as the data the model was fitted on.
df_features, df_y = df_to_xydf(df_test, centered=True, select_columns=top_columns)
predictions = model.predict(df_features.values).astype(int)

# Survey table: the selected feature columns next to the predicted and the observed min_offer.
# TODO(review): the feature columns are exported exactly as loaded from df_test (no rescaling
# is applied here) - confirm that this is intended.
df_final = df_test[top_columns].assign(
    pred_min_offer=predictions.ravel(),
    min_offer=df_y['min_offer'],
)
df_final.head()

Unnamed: 0,selfish,time_spent_prop,pred_min_offer,min_offer
0,25,58000,100,100
1,30,66000,105,120
2,30,38000,95,100
3,15,154000,130,90
4,15,83000,110,100


In [13]:
# Persist the prediction table next to the test split; index=False leaves the row index out of the sheet.
# NOTE(review): the target uses the legacy .xls extension - recent pandas releases no longer
# ship an .xls writer, so this may need to become .xlsx; confirm against the installed version.
df_final.to_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_test_PRED.xls", index=False)