## Feature reduction

In [5]:
import os
import sys
# Make the parent directory importable (presumably where the `core` package
# imported further down lives — verify against the repository layout).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sb

# Categorical variant of the dataset; any row containing a missing value is discarded.
df = pd.read_excel("../data/t00/UG_HH_NEW_categorical_no200.xls").dropna()

# Column-wise minima / maxima of the continuous variant. Both are handed to
# df_to_xy as df_min / df_max in the feature-search cells below
# (presumably for scaling — confirm against df_to_xy).
df_full = pd.read_excel("../data/t00/UG_HH_NEW_continuous_no200.xls")
df_min, df_max = df_full.min(), df_full.max()

# Drop 'protected' features (names absent from the frame are simply ignored)
drop_cols = ['prop', 'other_prop', 'other_resp']
df = df.loc[:, ~df.columns.isin(drop_cols)]

df.head()

Unnamed: 0,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,...,equal_income,asian,white,lazy_stupid,diligent,completely_selfish,complete_donor,expect_50less,expect_100,min_offer
0,2,4,1,2,4,2,5,4,2,5,...,0,0,0,0,0,0,0,0,0,80
1,3,1,5,1,4,5,5,1,3,2,...,0,0,0,1,0,1,0,1,0,50
2,3,4,3,4,2,5,3,3,2,4,...,0,0,0,1,0,0,0,0,1,100
3,4,4,2,4,3,3,4,4,2,4,...,0,0,0,1,0,0,0,0,1,100
4,4,4,2,4,4,2,5,4,3,5,...,0,0,0,0,0,0,0,0,0,95


## Individual plots

In [6]:
# Disabled exploratory plots of min_offer against hand-picked feature groups.
# NOTE(review): the first call uses 'prop', 'other_resp' and 'other_prop', which are
# listed in drop_cols in the loading cell, so it would need the frame from before
# that drop — confirm before re-enabling.
# sb.pairplot(df, x_vars=('prop','other_resp','other_prop'), y_vars='min_offer', height=7, aspect=0.7, kind='reg')
# sb.pairplot(df, x_vars=('cells', 'selfish','count_effort'), y_vars='min_offer', height=7, aspect=0.7)
# sb.pairplot(df, x_vars=('Honesty_Humility', 'Extraversion','Agreeableness'), y_vars='min_offer', height=7, aspect=0.7)

## Determine significant features

In [None]:
from core.utils.preprocessing import df_to_xy, df_to_xydf
from core.models import AcceptanceModel
from core.utils.benchmark import process_model, process_benchmark_cv

# Benchmark each candidate feature on its own: one fresh AcceptanceModel per
# column, scored with process_benchmark_cv, keeping the mean of the returned results.
features = [name for name in df if name != 'min_offer']
res = {}
for name in features:
    is_risk = name == 'risk'
    # 'risk' is the only column extracted with fuse_risk=True; its row is
    # labelled 'risk*' in the result table to make that visible.
    extra_kwargs = {'fuse_risk': True} if is_risk else {}
    x, y = df_to_xy(df, select_columns=[name], df_min=df_min, df_max=df_max, **extra_kwargs)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res['risk*' if is_risk else name] = item_res.mean()

# One row per feature, ordered by ascending avg_loss_ratio.
res_single_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_single_df

       q1
0    0.25
1    0.50
2    0.50
3    0.75
4    0.75
5    0.75
6    0.50
7    0.25
8    0.50
9    0.50
10   0.25
11   1.00
12   0.75
13   0.25
14   0.75
15   0.25
16   1.00
17   1.00
18   0.50
19   0.75
20   0.25
21   0.50
22   0.75
23   0.75
24   0.75
25   0.50
26   0.25
27   0.00
28   0.25
29   0.75
..    ...
149  0.75
150  0.75
151  0.25
152  0.00
153  0.75
154  0.50
155  0.25
156  0.25
157  0.50
158  0.75
159  0.25
160  0.75
161  0.25
162  0.25
163  0.50
164  1.00
165  0.75
166  0.75
167  0.75
168  0.25
169  0.00
170  1.00
171  0.75
172  0.75
173  0.75
174  0.25
175  0.75
176  0.25
177  0.75
178  0.75

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         

       q7
0    1.00
1    1.00
2    0.50
3    0.75
4    1.00
5    0.25
6    0.50
7    0.25
8    0.75
9    0.75
10   0.50
11   0.25
12   1.00
13   0.25
14   0.25
15   0.25
16   1.00
17   1.00
18   0.50
19   0.25
20   0.75
21   0.50
22   0.75
23   0.25
24   1.00
25   1.00
26   0.00
27   0.25
28   0.25
29   0.25
..    ...
149  0.25
150  1.00
151  0.00
152  0.00
153  0.25
154  0.50
155  0.25
156  0.50
157  0.50
158  0.25
159  0.25
160  0.25
161  0.25
162  1.00
163  0.25
164  0.00
165  0.25
166  0.50
167  0.50
168  0.50
169  0.25
170  0.75
171  0.25
172  0.75
173  0.25
174  0.25
175  0.75
176  0.00
177  1.00
178  0.75

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         

      q13
0    0.25
1    0.75
2    0.50
3    0.75
4    0.75
5    0.25
6    0.75
7    0.25
8    1.00
9    0.50
10   0.25
11   1.00
12   1.00
13   0.50
14   0.75
15   0.25
16   1.00
17   1.00
18   0.25
19   0.75
20   0.25
21   0.75
22   1.00
23   0.50
24   0.75
25   0.50
26   0.75
27   0.75
28   0.25
29   1.00
..    ...
149  0.75
150  0.75
151  0.25
152  0.25
153  0.75
154  0.50
155  0.50
156  0.50
157  0.50
158  0.50
159  0.25
160  0.50
161  0.25
162  0.25
163  0.75
164  1.00
165  0.75
166  0.75
167  0.75
168  0.50
169  0.25
170  0.75
171  0.75
172  0.50
173  0.75
174  0.50
175  0.25
176  0.25
177  0.75
178  0.75

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         

      q19
0    1.00
1    0.50
2    0.50
3    0.50
4    0.75
5    0.25
6    0.75
7    0.00
8    0.75
9    0.50
10   0.50
11   0.75
12   0.25
13   0.75
14   0.75
15   0.50
16   0.75
17   0.50
18   0.50
19   0.75
20   0.25
21   0.50
22   0.25
23   0.50
24   0.75
25   0.75
26   0.75
27   0.75
28   0.50
29   0.75
..    ...
149  0.75
150  0.25
151  0.00
152  0.75
153  0.75
154  0.50
155  0.50
156  0.50
157  0.50
158  0.50
159  0.50
160  0.25
161  0.75
162  0.75
163  1.00
164  0.25
165  0.50
166  0.75
167  0.75
168  0.50
169  0.25
170  0.75
171  0.50
172  0.75
173  0.25
174  0.25
175  0.50
176  0.25
177  0.25
178  0.75

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         

      q25
0    0.00
1    0.25
2    0.50
3    0.50
4    0.50
5    0.50
6    0.75
7    0.25
8    0.75
9    0.50
10   0.75
11   0.75
12   0.25
13   0.25
14   0.25
15   0.25
16   0.75
17   1.00
18   0.50
19   0.75
20   0.00
21   0.50
22   0.00
23   0.50
24   0.75
25   0.75
26   0.75
27   0.75
28   0.50
29   1.00
..    ...
149  0.75
150  0.25
151  0.50
152  0.75
153  0.50
154  0.50
155  0.50
156  0.75
157  0.50
158  0.50
159  0.75
160  0.25
161  0.25
162  0.50
163  0.50
164  0.25
165  0.50
166  1.00
167  0.25
168  0.50
169  0.25
170  0.75
171  0.25
172  0.50
173  0.25
174  0.25
175  0.25
176  0.75
177  0.75
178  0.25

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         

     donation_a
0          0.50
1          0.00
2          0.25
3          0.75
4          0.50
5          0.25
6          0.50
7          0.00
8          0.50
9          0.00
10         0.25
11         1.00
12         1.00
13         0.25
14         0.25
15         0.00
16         0.25
17         1.00
18         0.75
19         0.25
20         1.00
21         0.50
22         1.00
23         0.50
24         1.00
25         0.00
26         0.50
27         0.25
28         0.00
29         0.50
..          ...
149        1.00
150        0.75
151        1.00
152        0.75
153        1.00
154        0.50
155        1.00
156        0.50
157        0.50
158        1.00
159        1.00
160        0.00
161        1.00
162        0.25
163        1.00
164        0.00
165        0.75
166        1.00
167        1.00
168        0.00
169        0.75
170        0.25
171        0.50
172        0.00
173        1.00
174        0.50
175        0.25
176        0.00
177        0.00
178        0.00

[178 ro

     age
0    0.2
1    0.2
2    0.4
3    0.4
4    0.4
5    0.6
6    0.0
7    0.8
8    0.2
9    0.8
10   0.0
11   0.8
12   0.0
13   0.2
14   0.2
15   0.2
16   0.0
17   0.4
18   0.2
19   0.2
20   0.6
21   0.2
22   0.2
23   0.0
24   0.8
25   0.0
26   0.4
27   0.2
28   0.6
29   0.4
..   ...
149  0.8
150  0.4
151  0.4
152  0.2
153  0.2
154  0.4
155  0.4
156  0.6
157  0.2
158  0.4
159  0.4
160  0.6
161  0.4
162  0.2
163  0.6
164  0.2
165  0.4
166  0.6
167  0.6
168  0.2
169  0.4
170  0.4
171  0.2
172  0.4
173  0.2
174  1.0
175  0.0
176  0.6
177  0.6
178  0.0

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18          50
19         100
20          50
21         100
22         195
23          50
24         100
25          50
26         10

     almost_genius
0              0.0
1              0.0
2              0.0
3              0.0
4              0.0
5              0.0
6              0.0
7              0.0
8              0.0
9              0.0
10             0.0
11             0.0
12             0.0
13             0.0
14             0.0
15             0.0
16             0.0
17             0.0
18             0.0
19             0.0
20             0.0
21             0.0
22             0.0
23             0.0
24             0.0
25             0.0
26             0.0
27             0.0
28             0.0
29             0.0
..             ...
149            1.0
150            1.0
151            1.0
152            1.0
153            1.0
154            1.0
155            1.0
156            1.0
157            1.0
158            1.0
159            1.0
160            1.0
161            1.0
162            1.0
163            1.0
164            1.0
165            1.0
166            1.0
167            1.0
168            1.0
169         

     India
0      1.0
1      1.0
2      1.0
3      1.0
4      0.0
5      0.0
6      1.0
7      0.0
8      1.0
9      0.0
10     1.0
11     0.0
12     1.0
13     1.0
14     1.0
15     1.0
16     1.0
17     1.0
18     1.0
19     1.0
20     0.0
21     0.0
22     1.0
23     1.0
24     0.0
25     1.0
26     0.0
27     1.0
28     0.0
29     1.0
..     ...
149    0.0
150    1.0
151    0.0
152    1.0
153    0.0
154    0.0
155    0.0
156    0.0
157    0.0
158    0.0
159    1.0
160    0.0
161    0.0
162    0.0
163    0.0
164    0.0
165    0.0
166    0.0
167    0.0
168    0.0
169    0.0
170    0.0
171    0.0
172    0.0
173    0.0
174    0.0
175    0.0
176    0.0
177    0.0
178    0.0

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18       

     asian
0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0
10     0.0
11     0.0
12     0.0
13     0.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     0.0
23     0.0
24     0.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
..     ...
149    0.0
150    0.0
151    0.0
152    0.0
153    0.0
154    0.0
155    0.0
156    0.0
157    0.0
158    0.0
159    0.0
160    0.0
161    0.0
162    0.0
163    0.0
164    0.0
165    0.0
166    0.0
167    0.0
168    0.0
169    0.0
170    0.0
171    0.0
172    0.0
173    0.0
174    0.0
175    0.0
176    0.0
177    0.0
178    0.0

[178 rows x 1 columns]      min_offer
0           80
1           50
2          100
3          100
4           95
5           20
6          150
7           50
8          100
9          100
10         125
11         100
12          50
13          75
14          30
15          90
16          95
17         140
18       

In [None]:
from itertools import combinations

# Exhaustive benchmark of every unordered pair of candidate features.
# Each pair gets a fresh AcceptanceModel scored with process_benchmark_cv;
# the mean of the returned results is stored under the key "colA:colB".
features = list(df)
if 'min_offer' in features:
    features.remove('min_offer')

res = {}
# combinations() never repeats an element of `features` inside one tuple, so
# (column names being unique) no duplicate-column check is needed here.
for cols in combinations(features, 2):
    # select_columns is given a list rather than the tuple produced by combinations
    cols = list(cols)
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per pair, ordered by ascending avg_loss_ratio; show the first ten rows.
res_duo_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_duo_df.head(10)

In [None]:
from itertools import combinations

# Exhaustive benchmark of every unordered triple of candidate features.
# Each triple gets a fresh AcceptanceModel scored with process_benchmark_cv;
# the mean of the returned results is stored under the key "colA:colB:colC".
features = list(df)
if 'min_offer' in features:
    features.remove('min_offer')

res = {}
# combinations() never repeats an element of `features` inside one tuple, so
# (column names being unique) no duplicate-column check is needed here.
for cols in combinations(features, 3):
    # select_columns is given a list rather than the tuple produced by combinations
    cols = list(cols)
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per triple, ordered by ascending avg_loss_ratio; show the first ten rows.
res_trio_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_trio_df.head(10)

In [None]:
from itertools import combinations

# Exhaustive benchmark of every unordered 4-tuple of candidate features.
# Each 4-tuple gets a fresh AcceptanceModel scored with process_benchmark_cv;
# the mean of the returned results is stored under the key "colA:colB:colC:colD".
# The number of model evaluations is C(len(features), 4), so this cell is by
# far the most expensive of the feature-search cells.
features = list(df)
if 'min_offer' in features:
    features.remove('min_offer')

res = {}
# combinations() never repeats an element of `features` inside one tuple, so
# (column names being unique) no duplicate-column check is needed here.
for cols in combinations(features, 4):
    # select_columns is given a list rather than the tuple produced by combinations
    cols = list(cols)
    x, y = df_to_xy(df, select_columns=cols, df_min=df_min, df_max=df_max)
    model = AcceptanceModel()
    item_res = process_benchmark_cv(model, X=x, y=y)
    res[":".join(cols)] = item_res.mean()

# One row per 4-tuple, ordered by ascending avg_loss_ratio; show the first ten rows.
res_quad_df = pd.DataFrame(res).T.sort_values(by=['avg_loss_ratio'])
res_quad_df.head(10)

## Train model with top features

In [None]:
# Feature subset used to train the final model.
top_columns = ['selfish', 'time_spent_prop']
# NOTE(review): the feature-search cells pass df_min / df_max to df_to_xy, this
# call passes min_target / max_target instead — confirm the difference is intended.
x, y = df_to_xy(
    df,
    select_columns=top_columns,
    min_target=20,
    max_target=180,
)


In [None]:
# Ordered (unshuffled) hold-out: the first 60% of the rows train the model,
# the remaining rows are kept for testing.
split = int(0.6 * x.shape[0])
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]


In [None]:
# Fresh AcceptanceModel trained on the training portion of the hold-out split.
# The value returned by fit() is the last expression and is therefore what the cell displays.
model = AcceptanceModel()
model.fit(xTrain, yTrain)

In [None]:
from core.models.metrics import gain_mean, avg_loss_ratio

# Score the fitted model on the held-out rows with the two project metrics.
yPred = model.predict(xTest)
mean_gain = gain_mean(yTest, yPred)
loss_ratio = avg_loss_ratio(yTest, yPred)
print("Mean gain: ", mean_gain)
print("AVG loss ratio: ", loss_ratio)

In [None]:
# Distinct values the model predicted on the test rows (a quick check of how
# varied the predictions are).
unique_predictions = np.unique(yPred)
print("Unique predicted values: ", unique_predictions)