In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import sys
sys.path.insert(0,'..')
from custom_funcs import split

In [3]:
data = pd.read_csv("../Data/choices_exp1_ext.csv")[['sid','choice_x','self_x','other_x','self_y','other_y','self_z','other_z']]
#train, test, iterations = split(data)

# Random Forest

In [2]:
grid = dict(
    max_features=range(1,7),
    n_estimators=range(100,1000),
    max_depth=[None,1,2,3,4,5,6,7,8,9,10,11,12])

candidates = list()

for i in grid['max_features']:
    for j in grid['n_estimators']:
        for k in grid['max_depth']:
            candidates.append((i,j,k))

random.seed(0)
random_candidates = random.sample(candidates,500)

In [4]:
best_error_rf = 1
best_error_vals_rf = None
best_error_stds_rf = None
params_rf = None
i = 0

for hp in random_candidates:
    i += 1
    if i in [10,50,100,150,200,250,300,350,400,450]:
        print(i)
    loss_rf = []
    rf = RandomForestClassifier(max_features=hp[0],
                                n_estimators=hp[1],
                                max_depth=hp[2],
                                random_state=181)
    
    for it in iterations:
        X_train, X_test = it[0].drop(columns=['choice_x']),it[1].drop(columns=['choice_x'])
        y_train, y_test = it[0]['choice_x'], it[1]['choice_x']
        
        rf.fit(X_train, y_train)
        y_pred = rf.predict_proba(X_test)
        
        loss_rf.append(log_loss(y_test,y_pred))
        
    mean_loss = np.mean(loss_rf)
    if mean_loss < best_error_rf:
        best_error_rf = mean_loss
        best_error_vals_rf = loss_rf
        vest_error_stds_rf = np.std(loss_rf)
        params_rf = dict(
            max_features=hp[0],
            n_estimators=hp[1],
            max_depth=hp[2])
            
        print(i, ': ', mean_loss)

1 :  0.4495299058228136
2 :  0.4253882227135116
5 :  0.3744930776897825
10
10 :  0.2569866340000345
20 :  0.2377840059565207
28 :  0.23763784002990515
50


KeyboardInterrupt: 

In [6]:
rf = RandomForestClassifier(random_state=181,
                           n_estimators=params_rf['n_estimators'],
                           max_features=params_rf['max_features'],
                           max_depth=params_rf['max_depth'])

X_train, X_test = train.drop(columns=['choice_x']), test.drop(columns=['choice_x'])
y_train, y_test = train['choice_x'], test['choice_x']

rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)

loss_ml_3 = log_loss(y_test,y_pred)

In [7]:
loss_ml_3

0.2221396571163091

In [10]:
rf = RandomForestClassifier(random_state=181,
                           n_estimators=1000,
                           max_features=4,
                           max_depth=None)

X_train, X_test = train.drop(columns=['choice_x']), test.drop(columns=['choice_x'])
y_train, y_test = train['choice_x'], test['choice_x']

rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)

print(log_loss(y_test,y_pred))

0.2216022352624339


In [19]:
rf = RandomForestClassifier(random_state=181,
                           n_estimators=1000,
                           max_features='auto',
                           max_depth=60)

X_train, X_test = train.drop(columns=['choice_x']), test.drop(columns=['choice_x'])
y_train, y_test = train['choice_x'], test['choice_x']

rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)

print(log_loss(y_test,y_pred))

0.23209784513808507


# Gradient Boosting

In [22]:
grid = dict(
    max_depth=range(1,5),
    n_estimators=range(100,1000,100),
    learning_rate=np.linspace(0.01,1,100))

candidates = list()

for i in grid['max_depth']:
    for j in grid['n_estimators']:
        for k in grid['learning_rate']:
            candidates.append((i,j,k))

random.seed(0)
random_candidates = random.sample(candidates,500)

In [23]:
best_error = 1
best_error_vals = None
best_error_stds = None
params = None
i=0

for hp in random_candidates:
    i+=1
    if i in [10,50,100,150,200,250,300,350,400,450]:
        print(i)
    loss_gb = []
    gb = GradientBoostingClassifier(max_depth=hp[0],
                                    n_estimators=hp[1],
                                    learning_rate=hp[2],
                                    random_state=181)
    
    for it in iterations:
        X_train, X_test = it[0].drop(columns=['choice_x']), it[1].drop(columns=['choice_x'])
        y_train, y_test = it[0]['choice_x'], it[1]['choice_x']
        
        gb.fit(X_train, y_train)
        y_pred = gb.predict_proba(X_test)
        
        loss_gb.append(log_loss(y_test,y_pred))
        
    mean_loss = np.mean(loss_gb)
    if mean_loss < best_error:
        best_error = mean_loss
        best_error_vals = loss_gb
        vest_error_stds = np.std(loss_gb)
        params = dict(
            max_depth=hp[0],
            n_estimators=hp[1],
            learning_rate=hp[2])
            
        print(i,': ', mean_loss)

1 :  0.38079054065728657
2 :  0.2959243660580165
3 :  0.2626669636710764
4 :  0.24420721653979713
10
10 :  0.23617327386518588
28 :  0.23214081906730993


KeyboardInterrupt: 

In [24]:
gb = GradientBoostingClassifier(max_depth=params['max_depth'],
                            n_estimators=params['n_estimators'],
                            learning_rate=params['learning_rate'],
                            random_state=181)

X_train, X_test = train.drop(columns=['choice_x']), test.drop(columns=['choice_x'])
y_train, y_test = train['choice_x'], test['choice_x']

gb.fit(X_train, y_train)
y_pred = gb.predict_proba(X_test)

loss_ml_4 = log_loss(y_test,y_pred)

In [31]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,max_iter=500,
                    hidden_layer_sizes=(100,), random_state=181)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

print(log_loss(y_test,y_pred))

0.33868660862805994


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [32]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [38]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-2,max_iter=1000,
                    hidden_layer_sizes=(100,), random_state=181)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

print(log_loss(y_test,y_pred))

0.336516887143991


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [46]:
data = pd.read_csv("../Data/choices_exp1_ext.csv")[['gid','sid','choice_x','self_x','other_x','self_y','other_y','self_z','other_z']]
temp = data.groupby(['gid']).mean()[['self_x', 'other_x', 'self_y', 'other_y','self_z','other_z']].dropna()

In [47]:
temp['diff_z'] = temp['self_z'] - temp['other_z']
temp['diff_x'] = temp['self_x'] - temp['other_x']
temp['diff_y'] = temp['self_y'] - temp['other_y']

In [48]:
temp.describe()

Unnamed: 0,self_x,other_x,self_y,other_y,self_z,other_z,diff_z,diff_x,diff_y
count,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0
mean,685.384615,635.384615,514.615385,564.615385,600.0,600.0,0.0,50.0,-50.0
std,229.328168,273.123788,229.328168,273.123788,322.566816,292.716786,579.648348,475.055704,475.055704
min,330.0,140.0,140.0,140.0,0.0,0.0,-1100.0,-710.0,-820.0
25%,455.0,465.0,295.0,410.0,382.5,362.5,-425.0,-322.5,-435.0
50%,680.0,680.0,520.0,520.0,600.0,600.0,0.0,-30.0,30.0
75%,905.0,790.0,745.0,735.0,817.5,837.5,425.0,435.0,322.5
max,1060.0,1060.0,870.0,1060.0,1200.0,1200.0,1100.0,820.0,710.0


In [49]:
temp.nunique()

self_x     36
other_x    30
self_y     36
other_y    30
self_z     66
other_z    52
diff_z     70
diff_x     36
diff_y     36
dtype: int64

In [50]:
itt = iterations[0][0]

In [None]:
iterations[0][0].index.to_list()

In [53]:
len(iterations)

5

In [70]:
indc = []
for i in range(len(iterations)):
    indc.append((iterations[i][0].index.to_list(),
                 list(iterations[i][1].index.to_list())))

In [67]:
indc = []
for i in range(len(iterations)):
    indc.append(iterations[i][1].index.to_list())

In [64]:
from sklearn.model_selection import RandomizedSearchCV

In [73]:
params = {'n_estimators': range(1,1000)}

clf = RandomizedSearchCV(GradientBoostingClassifier(),
                         params,cv=iii, random_state=0)

In [74]:
clf.fit(X_train, y_train)

ValueError: No fits were performed. Was the CV iterator empty? Were there no candidates?

In [71]:
iii = iter(indc)

In [None]:
for i in iii:
    print(i)

In [75]:
import numpy as np
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))


[2 3] [0 1]
[0 1] [2 3]


In [77]:
aa = kf.split(X)
for i in aa:
    print(i)

(array([2, 3]), array([0, 1]))
(array([0, 1]), array([2, 3]))


In [94]:
def set_test_fold(row):
    if row.index in iterations[0][1].index.to_list():
        return 0
    elif row.index in iterations[1][1].index.to_list():
        return 1
    elif row.index in iterations[2][1].index.to_list():
        return 2
    elif row.index in iterations[3][1].index.to_list():
        return 3
    else:
        return 4

In [95]:
train['idx'] = train.index
test_fold = train.apply(set_test_fold, axis=1)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [97]:
train.drop(columns=['idx'])

Unnamed: 0,choice_x,self_x,other_x,self_y,other_y,self_z,other_z,sid_12010050501,sid_12010050502,sid_12010050603,...,sid_302010050502,sid_302010050705,sid_312010050501,sid_312010050502,sid_312010050705,sid_322010050501,sid_332010050501,sid_342010050501,sid_352010050501,sid_362010050501
15446,1,470,730,190,1010,610.0,590.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11617,0,870,140,870,520,730.0,660.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5212,1,700,760,500,440,0.0,0.0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9038,1,790,600,410,600,0.0,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11349,1,960,500,780,160,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390,1,420,1040,240,700,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6933,1,690,770,510,430,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4311,1,520,870,140,870,0.0,1010.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18612,1,890,520,850,140,1030.0,380.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [98]:
from sklearn.model_selection import PredefinedSplit
ps = PredefinedSplit(test_fold)

In [102]:
params = {'n_estimators': range(1,1000)}

clf = RandomizedSearchCV(GradientBoostingClassifier(),
                         params,cv=ps, random_state=0, n_jobs=-1, verbose=11)

In [100]:
X_train, X_test = train.drop(columns=['choice_x']), test.drop(columns=['choice_x'])
y_train, y_test = train['choice_x'], test['choice_x']

In [103]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([3, 4, ..., 4, 0])),
                   estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'n_estimators': range(1, 1000)},
                   random_state=0, verbose=11)

In [105]:
clf.best_params_

{'n_estimators': 878}