In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
import sys
sys.path.insert(0,'..')
from custom_funcs import split

In [2]:
# Columns kept from the raw CSV: subject id, the choice column and the
# self_*/other_* columns for options x, y and z.
CHOICE_COLUMNS = [
    'sid', 'choice_x',
    'self_x', 'other_x',
    'self_y', 'other_y',
    'self_z', 'other_z',
]
data = pd.read_csv("../Data/choices_exp1_ext.csv")[CHOICE_COLUMNS]

# `split` is a project helper from custom_funcs. It returns four objects:
# the train and test frames, `iterations`, and `ps`, which is later passed as
# the `cv=` argument of the hyper-parameter searches.
# NOTE(review): the meaning of `iterations` is not visible here — confirm in custom_funcs.
train, test, iterations, ps = split(data)

In [3]:
# Features: every column except the target 'choice_x'.
# NOTE(review): 'sid' therefore stays in the feature matrix — confirm that
# using the subject id as a predictor is intended.
X_train = train.drop(columns=['choice_x'])
X_test = test.drop(columns=['choice_x'])

# Target: the 'choice_x' column of each partition.
y_train = train['choice_x']
y_test = test['choice_x']

# Gradient Boosting

In [None]:
# Randomised hyper-parameter search over a probability-calibrated gradient
# boosting classifier, scored by negative log loss on the folds given by `ps`.
# NOTE(review): newer scikit-learn releases renamed CalibratedClassifierCV's
# `base_estimator` argument to `estimator` — confirm the installed version
# still accepts the spelling used here.
calibrated_gb = CalibratedClassifierCV(
    base_estimator=GradientBoostingClassifier(random_state=181),
)

# The `base_estimator__` prefix routes each setting to the wrapped booster.
param_grid = dict(
    base_estimator__n_estimators=range(100, 1000),               # 100 .. 999
    base_estimator__max_depth=range(1, 20),                      # 1 .. 19
    base_estimator__learning_rate=np.linspace(0.001, 1, 1000),   # 0.001 .. 1.0, step 0.001
)

clf = RandomizedSearchCV(
    estimator=calibrated_gb,
    param_distributions=param_grid,
    n_iter=500,               # number of sampled configurations
    scoring='neg_log_loss',
    cv=ps,
    n_jobs=-1,                # use every available core
    verbose=11,               # logs START/END of every single fit
    random_state=181,
)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[CV 2/5; 1/500] START base_estimator__learning_rate=0.148, base_estimator__max_depth=2, base_estimator__n_estimators=697
[CV 2/5; 1/500] END base_estimator__learning_rate=0.148, base_estimator__max_depth=2, base_estimator__n_estimators=697;, score=-0.244 total time=  54.0s
[CV 2/5; 3/500] START base_estimator__learning_rate=0.489, base_estimator__max_depth=11, base_estimator__n_estimators=349
[CV 2/5; 3/500] END base_estimator__learning_rate=0.489, base_estimator__max_depth=11, base_estimator__n_estimators=349;, score=-0.287 total time= 2.7min
[CV 4/5; 4/500] START base_estimator__learning_rate=0.064, base_estimator__max_depth=19, base_estimator__n_estimators=590
[CV 4/5; 4/500] END base_estimator__learning_rate=0.064, base_estimator__max_depth=19, base_estimator__n_estimators=590;, score=-0.235 total time= 8.5min




[CV 2/5; 2/500] START base_estimator__learning_rate=0.886, base_estimator__max_depth=18, base_estimator__n_estimators=829
[CV 2/5; 2/500] END base_estimator__learning_rate=0.886, base_estimator__max_depth=18, base_estimator__n_estimators=829;, score=-0.329 total time= 2.6min
[CV 5/5; 3/500] START base_estimator__learning_rate=0.489, base_estimator__max_depth=11, base_estimator__n_estimators=349
[CV 5/5; 3/500] END base_estimator__learning_rate=0.489, base_estimator__max_depth=11, base_estimator__n_estimators=349;, score=-0.289 total time= 2.8min
[CV 3/5; 5/500] START base_estimator__learning_rate=0.376, base_estimator__max_depth=10, base_estimator__n_estimators=442
[CV 3/5; 5/500] END base_estimator__learning_rate=0.376, base_estimator__max_depth=10, base_estimator__n_estimators=442;, score=-0.240 total time= 3.2min
[CV 1/5; 6/500] START base_estimator__learning_rate=0.615, base_estimator__max_depth=10, base_estimator__n_estimators=750
[CV 1/5; 6/500] END base_estimator__learning_rate=

ase_estimator__max_depth=14, base_estimator__n_estimators=797;, score=-0.266 total time= 3.6min
[CV 3/5; 17/500] START base_estimator__learning_rate=0.585, base_estimator__max_depth=2, base_estimator__n_estimators=335
[CV 3/5; 17/500] END base_estimator__learning_rate=0.585, base_estimator__max_depth=2, base_estimator__n_estimators=335;, score=-0.283 total time=  30.6s
[CV 1/5; 19/500] START base_estimator__learning_rate=0.363, base_estimator__max_depth=14, base_estimator__n_estimators=268
[CV 1/5; 19/500] END base_estimator__learning_rate=0.363, base_estimator__max_depth=14, base_estimator__n_estimators=268;, score=-0.246 total time= 2.9min
[CV 4/5; 19/500] START base_estimator__learning_rate=0.363, base_estimator__max_depth=14, base_estimator__n_estimators=268
[CV 4/5; 19/500] END base_estimator__learning_rate=0.363, base_estimator__max_depth=14, base_estimator__n_estimators=268;, score=-0.238 total time= 2.9min
[CV 2/5; 22/500] START base_estimator__learning_rate=0.74, base_estimato

r__max_depth=6, base_estimator__n_estimators=304
[CV 2/5; 22/500] END base_estimator__learning_rate=0.74, base_estimator__max_depth=6, base_estimator__n_estimators=304;, score=-0.440 total time= 1.3min
[CV 5/5; 22/500] START base_estimator__learning_rate=0.74, base_estimator__max_depth=6, base_estimator__n_estimators=304
[CV 5/5; 22/500] END base_estimator__learning_rate=0.74, base_estimator__max_depth=6, base_estimator__n_estimators=304;, score=-0.470 total time= 1.3min
[CV 1/5; 24/500] START base_estimator__learning_rate=0.858, base_estimator__max_depth=9, base_estimator__n_estimators=291
[CV 1/5; 24/500] END base_estimator__learning_rate=0.858, base_estimator__max_depth=9, base_estimator__n_estimators=291;, score=-0.479 total time= 1.9min
[CV 4/5; 24/500] START base_estimator__learning_rate=0.858, base_estimator__max_depth=9, base_estimator__n_estimators=291
[CV 4/5; 24/500] END base_estimator__learning_rate=0.858, base_estimator__max_depth=9, base_estimator__n_estimators=291;, scor

In [None]:
# Held-out log loss of the best configuration found by the search.
# The search above is built with the default refit=True, so `best_estimator_`
# is already fitted on all of (X_train, y_train) once `clf.fit` returns;
# no additional fit is needed before predicting.
y_pred = clf.best_estimator_.predict_proba(X_test)

# Pass the fitted class labels explicitly: the columns of `y_pred` follow
# `classes_`, and log_loss would otherwise infer the label set from y_test,
# which fails if the test split happens to lack one of the classes.
print(log_loss(y_test, y_pred, labels=clf.best_estimator_.classes_))

In [14]:
# Randomised hyper-parameter search over a plain (uncalibrated) gradient
# boosting classifier, scored by negative log loss on the folds given by `ps`.
param_grid = dict(
    n_estimators=range(100, 1000),               # 100 .. 999
    max_depth=range(1, 20),                      # 1 .. 19
    learning_rate=np.linspace(0.001, 1, 1000),   # 0.001 .. 1.0, step 0.001
)

clf = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=181),
    param_distributions=param_grid,
    n_iter=500,               # number of sampled configurations
    scoring='neg_log_loss',
    cv=ps,
    n_jobs=-1,                # use every available core
    verbose=11,               # logs START/END of every single fit
    random_state=181,
)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 1/5; 2/500] START learning_rate=0.886, max_depth=18, n_estimators=829.......
[CV 1/5; 1/500] START learning_rate=0.148, max_depth=2, n_estimators=697........
[CV 3/5; 1/500] START learning_rate=0.148, max_depth=2, n_estimators=697........
[CV 2/5; 2/500] START learning_rate=0.886, max_depth=18, n_estimators=829.......
[CV 3/5; 2/500] START learning_rate=0.886, max_depth=18, n_estimators=829.......
[CV 2/5; 1/500] START learning_rate=0.148, max_depth=2, n_estimators=697........
[CV 4/5; 1/500] START learning_rate=0.148, max_depth=2, n_estimators=697........
[CV 5/5; 1/500] START learning_rate=0.148, max_depth=2, n_estimators=697........
[CV 3/5; 1/500] END learning_rate=0.148, max_depth=2, n_estimators=697;, score=-0.251 total time=  14.2s
[CV 4/5; 2/500] START learning_rate=0.886, max_depth=18, n_estimators=829.......
[CV 2/5; 1/500] END learning_rate=0.148, max_depth=2, n_estimators=697;, score=-0.250 total time=  14.

RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([3, 4, ..., 4, 0])),
                   estimator=GradientBoostingClassifier(random_state=181),
                   n_iter=500, n_jobs=-1,
                   param_distributions={'learning_rate': array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018,
       0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027,...
       0.955, 0.956, 0.957, 0.958, 0.959, 0.96 , 0.961, 0.962, 0.963,
       0.964, 0.965, 0.966, 0.967, 0.968, 0.969, 0.97 , 0.971, 0.972,
       0.973, 0.974, 0.975, 0.976, 0.977, 0.978, 0.979, 0.98 , 0.981,
       0.982, 0.983, 0.984, 0.985, 0.986, 0.987, 0.988, 0.989, 0.99 ,
       0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.997, 0.998, 0.999,
       1.   ]),
                                        'max_depth': range(1, 20),
                                        'n_estimators': range(100, 1000)},
                   random_stat

In [15]:
# Hyper-parameter combination that achieved the best mean cross-validated
# score (scoring='neg_log_loss') among the sampled candidates.
clf.best_params_

{'n_estimators': 916, 'max_depth': 4, 'learning_rate': 0.094}

In [16]:
from joblib import dump, load

# Persist the whole fitted search object (including cv_results_ and the
# refitted best estimator) next to the notebook, so the long search does not
# have to be repeated. The path is relative to the current working directory.
dump(value=clf, filename='ML_ind_v1.joblib')

['ML_ind_v1.joblib']

In [17]:
# Held-out log loss of the best configuration found by the search.
# The search is built with the default refit=True, so `best_estimator_` is
# already fitted on all of (X_train, y_train) once `clf.fit` returns;
# no additional fit is needed before predicting.
y_pred = clf.best_estimator_.predict_proba(X_test)

# Pass the fitted class labels explicitly: the columns of `y_pred` follow
# `classes_`, and log_loss would otherwise infer the label set from y_test,
# which fails if the test split happens to lack one of the classes.
print(log_loss(y_test, y_pred, labels=clf.best_estimator_.classes_))

0.2140503175453289


In [18]:
# Mean cross-validated score of the best candidate. The search uses
# scoring='neg_log_loss', so this is the negated log loss (closer to 0 is better).
clf.best_score_

-0.2320358484861777

In [26]:
# Tabulate the per-candidate search results, best-ranked candidate first.
df = pd.DataFrame(clf.cv_results_).sort_values(by=['rank_test_score'])

In [23]:
# Raw per-candidate results of the search (fit times, scores, ranks, ...).
# NOTE(review): displaying the dict prints every array in full for all 500
# candidates; a sorted DataFrame `.head()` would be a more compact view.
clf.cv_results_

{'mean_fit_time': array([ 14.36560764,  51.035922  ,  45.00428009, 149.91994662,
         56.18153615,  92.29365058,  70.74137282,  21.79216189,
         44.17221518, 134.92999396,  28.79693289,  32.51634827,
         19.20635271,  65.36025224,   5.49210439,  22.49132323,
          8.81122513,  86.26028008,  50.68395085,  24.08534846,
         49.40235229,  23.42036119, 105.89049768,  33.37159166,
         17.86035523,  55.01842566,  20.55641966,  64.60928502,
         41.33049421,  21.62532301,  31.99762168,  92.6123764 ,
         67.5969409 ,  27.06029854,  97.27372837,   9.40900278,
         41.63827243,  50.76785679,  37.86809001,  36.50813274,
         43.8796401 ,  21.89473519,  38.18453135, 103.15220046,
         71.46499381,  13.52498455,  47.34004898,  95.4946682 ,
         98.45820637,  50.70128136,  75.33843889,   5.62535086,
         81.54758668,  43.44340501,  67.44644718,  69.26105738,
         64.78833675,  38.570154  ,   7.95242772,  55.83999262,
         64.02861085,  