## Data preparation and sanitization

In [134]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

# Training split used throughout the notebook.
df = pd.read_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_train.xls")

# Column-wise minima and maxima of the complete dataset; they are handed to
# df_to_xy in the benchmark cells further down.
df_full = pd.read_excel("../data/UG_HH_NEW_continuous_no200.xls")
df_min, df_max = df_full.min(), df_full.max()

# Remove the 'protected' features; names that are absent are skipped silently.
drop_cols = ['prop', 'other_prop', 'other_resp']
df = df.drop(columns=drop_cols, errors="ignore")

df.head()

Unnamed: 0,time_spent_risk,cells,selfish,time_spent_prop,count_effort,Honesty_Humility,Extraversion,Agreeableness,min_offer
0,43000,26,60,31000,20,4.2,3.0,2.8,50
1,12000,7,20,38000,20,1.9,4.4,2.9,50
2,88000,50,20,69000,12,3.3,3.8,3.4,80
3,24000,50,60,26000,14,3.6,3.8,2.6,50
4,137000,24,60,43000,4,3.6,2.5,2.7,100


## Individual plots

In [2]:
# Exploratory pair plots of feature groups against min_offer, currently disabled.
# NOTE(review): the first call plots 'prop', 'other_resp' and 'other_prop', which the
# data-preparation cell removes from df -- it needs the unfiltered frame to run.
# sb.pairplot(df, x_vars=('prop','other_resp','other_prop'), y_vars='min_offer', height=7, aspect=0.7, kind='reg')
# sb.pairplot(df, x_vars=('cells', 'selfish','count_effort'), y_vars='min_offer', height=7, aspect=0.7)
# sb.pairplot(df, x_vars=('Honesty_Humility', 'Extraversion','Agreeableness'), y_vars='min_offer', height=7, aspect=0.7)

## Determine significant features

In [163]:
from utils.preprocessing import df_to_xy, df_to_xydf
from models import AcceptanceModel
from utils.benchmark import process_model, process_benchmark_cv

# Cross-validated benchmark of every single feature, plus the synthetic 'risk'
# entry that is requested from df_to_xy through fuse_risk=True.
# NOTE(review): the recorded output of this cell is a TypeError saying df_to_xy
# does not accept df_min -- confirm the helper's current signature.
res = {}
features = list(df)
features.append('risk')
if 'min_offer' in features:
    features.remove('min_offer')

for col in features:
    is_risk = (col == 'risk')
    # fuse_risk is only passed for the synthetic entry
    extra_kwargs = {'fuse_risk': True} if is_risk else {}
    x, y = df_to_xy(df, select_columns=[col], df_min=df_min, df_max=df_max, **extra_kwargs)
    if is_risk:
        # the fused feature is reported under a starred label
        col = 'risk*'
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[col] = item_res.mean()

# One row per feature, best (lowest) avg_loss_ratio first.
res_single_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_single_df

TypeError: df_to_xy() got an unexpected keyword argument 'df_min'

In [5]:
from itertools import combinations


# Candidate features: every column of df except the target.
features = [name for name in df if name != 'min_offer']

# Cross-validated benchmark of every unordered pair of features.
# NOTE(review): the recorded run of the single-feature cell raised a TypeError for
# the df_min keyword -- confirm df_to_xy's current signature before re-running.
res = {}
for cols in combinations(features, 2):
    # select_columns is given a list, while combinations() yields tuples
    cols = list(cols)
    # Only reachable when df carries duplicate column names
    if len(set(cols)) < len(cols):
        continue
    # Score exactly the columns of the current combination
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per combination, best (lowest) avg_loss_ratio first.
res_duo_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_duo_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
selfish:time_spent_prop,27.854167,0.2243,25.528571,442.0,1556.729167,0.072656
selfish:Honesty_Humility,27.916667,0.224444,25.566667,443.0,1567.041667,0.072656
cells:Honesty_Humility,28.416667,0.22787,26.071429,451.0,1624.541667,0.060156
cells:selfish,29.7375,0.241621,26.37619,471.0,1740.604167,0.086958
selfish:count_effort,30.045833,0.246191,26.018315,477.0,1800.1875,0.097656
cells:time_spent_prop,30.35,0.24882,26.49011,482.0,1820.083333,0.085021
time_spent_prop:count_effort,31.5375,0.261269,26.954762,501.0,1922.895833,0.097521
time_spent_prop:Honesty_Humility,31.979167,0.264628,27.502381,508.0,1961.104167,0.072656
count_effort:Honesty_Humility,31.979167,0.266489,26.161039,508.0,2008.604167,0.102344
selfish:Agreeableness,33.291667,0.279206,27.359524,528.0,2028.75,0.097656


In [6]:
from itertools import combinations


# Candidate features: every column of df except the target.
features = [name for name in df if name != 'min_offer']

# Cross-validated benchmark of every unordered triple of features.
# NOTE(review): the recorded run of the single-feature cell raised a TypeError for
# the df_min keyword -- confirm df_to_xy's current signature before re-running.
res = {}
for cols in combinations(features, 3):
    # select_columns is given a list, while combinations() yields tuples
    cols = list(cols)
    # Only reachable when df carries duplicate column names
    if len(set(cols)) < len(cols):
        continue
    # Score exactly the columns of the current combination
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per combination, best (lowest) avg_loss_ratio first.
res_trio_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_trio_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
selfish:count_effort:Honesty_Humility,29.108333,0.236332,25.757143,462.0,1693.5,0.106115
selfish:Honesty_Humility:Extraversion,29.816667,0.242534,26.349908,473.0,1711.666667,0.089844
cells:time_spent_prop:Honesty_Humility,30.55,0.248049,27.242857,484.0,1807.333333,0.076802
time_spent_risk:selfish:time_spent_prop,30.570833,0.251932,27.333333,485.0,1677.5625,0.128229
cells:selfish:Honesty_Humility,30.8,0.252194,26.833333,489.0,1865.833333,0.083594
cells:selfish:count_effort,30.858333,0.253299,26.890476,490.0,1844.875,0.109104
cells:selfish:time_spent_prop,31.654167,0.261797,27.102564,503.0,1956.8125,0.10975
selfish:time_spent_prop:count_effort,31.791667,0.26291,27.278571,505.0,1969.541667,0.102344
cells:Honesty_Humility:Extraversion,32.258333,0.264191,27.960256,511.0,1946.041667,0.090625
selfish:time_spent_prop:Honesty_Humility,32.104167,0.265234,27.787879,510.0,1986.104167,0.094531


In [7]:
from itertools import combinations


# Candidate features: every column of df except the target.
features = [name for name in df if name != 'min_offer']

# Cross-validated benchmark of every unordered quadruple of features.
# NOTE(review): the recorded run of the single-feature cell raised a TypeError for
# the df_min keyword -- confirm df_to_xy's current signature before re-running.
res = {}
for cols in combinations(features, 4):
    # select_columns is given a list, while combinations() yields tuples
    cols = list(cols)
    # Only reachable when df carries duplicate column names
    if len(set(cols)) < len(cols):
        continue
    # Score exactly the columns of the current combination
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per combination, best (lowest) avg_loss_ratio first.
res_quad_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_quad_df.head(10)

Unnamed: 0,avg_loss,avg_loss_ratio,avg_win_loss,loss_sum,mse,rejection_ratio
cells:selfish:time_spent_prop:Honesty_Humility,28.779167,0.230969,26.528571,457.0,1657.354167,0.091781
selfish:Honesty_Humility:Extraversion:Agreeableness,31.525,0.256686,28.162775,499.0,1814.708333,0.084375
cells:selfish:count_effort:Honesty_Humility,31.733333,0.259799,27.982418,504.0,1921.041667,0.088792
time_spent_risk:selfish:time_spent_prop:Honesty_Humility,31.266667,0.260514,27.318681,496.0,1759.458333,0.13774
cells:selfish:time_spent_prop:count_effort,33.341667,0.277752,27.987912,530.0,2122.125,0.123812
time_spent_risk:selfish:time_spent_prop:count_effort,33.358333,0.279284,28.275824,528.0,1962.541667,0.132406
selfish:time_spent_prop:count_effort:Honesty_Humility,33.916667,0.281777,28.597673,539.0,2160.166667,0.105469
time_spent_risk:selfish:count_effort:Honesty_Humility,33.533333,0.282303,27.466346,531.0,2040.416667,0.145417
time_spent_risk:cells:selfish:Honesty_Humility,33.691667,0.282374,28.242857,535.0,2094.208333,0.138385
selfish:time_spent_prop:Extraversion:Agreeableness,33.804167,0.283614,28.016667,536.0,2057.6875,0.110156


## Bayesian ridge model (continuous)

In [11]:
# Disabled Bayesian ridge experiment, kept for reference only.
# NOTE(review): `max_features` is not defined in any visible cell; it has to be
# set (presumably to a list of df column names) before this can be re-enabled.
# from sklearn.linear_model import BayesianRidge

# # Use features from above
# new_df = df[max_features]
# print(new_df.head())

# # Split dataset
# x = new_df.iloc[:, :-1].values
# y = df.iloc[:, -1:].values
# xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 1/3, random_state = 0)

# print(xTrain[0])
# print(yTrain[0])

# regressor = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
#       lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
#       normalize=False, tol=0.001, verbose=False).fit(xTrain, np.ravel(yTrain))
# yPredict = regressor.predict(xTest)
# print("R2:", r2_score(yTest, yPredict))

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPredict)})
# print(out_data)

# out_data.plot()

## Polynomial linear regression (continuous)

In [12]:
# Disabled polynomial-regression experiment, kept for reference only.
# NOTE(review): LinearRegression is not imported in any visible cell, and the test
# features are expanded with fit_transform where transform would be the usual call.
# from sklearn.preprocessing import PolynomialFeatures

# polynomial_features= PolynomialFeatures(degree=2)
# x_poly = polynomial_features.fit_transform(xTrain)

# regressor = LinearRegression(copy_X=True).fit(x_poly, yTrain)
# yPredict = regressor.predict(polynomial_features.fit_transform(xTest))
# print("R2:", r2_score(yTest, yPredict))

# out_data = pd.DataFrame(data={'y_test': np.ravel(yTest), 'y_pred': np.ravel(yPredict)})
# print(out_data)

# out_data.plot()

## Train model with top features

In [157]:
# Feature pair that ranks first by avg_loss_ratio in the recorded two-feature table.
top_columns = ['selfish', 'time_spent_prop']
x, y = df_to_xy(
    df,
    select_columns=top_columns,
    min_target=20,
    max_target=180,
)


In [158]:
# Ordered 60/40 split without shuffling: the leading rows train the model,
# the remaining rows are held out for evaluation.
split = int(x.shape[0] * 0.6)
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]


In [159]:
# Fit a fresh AcceptanceModel on the training portion of the split.
model = AcceptanceModel()
model.fit(xTrain, yTrain)

In [160]:
from models.metrics import gain_mean, avg_loss_ratio
# Predict on the held-out rows and report both metrics, one per line.
yPred = model.predict(xTest)
for label, metric in (("Mean gain: ", gain_mean), ("AVG loss ratio: ", avg_loss_ratio)):
    print(label, metric(yTest, yPred))

Mean gain:  97.34375
AVG loss ratio:  0.16830358736398932


In [161]:
# List the distinct values the model predicted on the held-out rows.
print("Unique predicted values: ", np.unique(yPred))

Unique predicted values:  [ 95. 100. 105.]


## Generate data for the survey

In [162]:
# Load the test split and strip the protected columns ('prop' is already listed).
df_test = pd.read_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_test.xls")

drop_cols = ['prop', 'other_prop', 'other_resp']
df_test = df_test.drop(columns=drop_cols, errors="ignore")

# NOTE(review): training called df_to_xy with min_target=20 / max_target=180, while
# no target bounds are passed here -- confirm that this difference is intended.
df_features, df_y = df_to_xydf(df_test, select_columns=top_columns)
predictions = model.predict(df_features.values)

# Raw feature values taken from df_test, next to the predicted and the observed offer.
df_final = df_test[top_columns].assign(
    pred_min_offer=predictions.ravel(),
    min_offer=df_y['min_offer'],
)
df_final.head()

Unnamed: 0,selfish,time_spent_prop,pred_min_offer,min_offer
0,25,58000,105.0,100
1,30,66000,105.0,120
2,30,38000,100.0,100
3,15,154000,130.0,90
4,15,83000,115.0,100


In [133]:
# Persist the prediction table next to the test split; the index is not written.
df_final.to_excel("../data/HH_SURVEY1/UG_HH_NEW_continuous_no200_test_PRED.xls", index=False)