In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gbor.main import BoostedOrdinal
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import cross_val_score, RepeatedKFold

# Load the wine-quality data set used by every experiment below.
# NOTE: the variable keeps its historical name `wine_red`, but the file that is
# actually read is the WHITE-wine data; swap the two read_csv lines to run the
# notebook on the red-wine data instead.
#wine_red = pd.read_csv('../data/winequality-red.csv', sep = ';')
wine_red = pd.read_csv('../data/winequality-white.csv', sep = ';')
# Shift the ordinal target so that the lowest observed quality grade is class 0.
wine_red['quality'] = wine_red['quality'] - np.min(wine_red['quality'])

# Feature matrix (every column except the target) and integer class labels.
X, y = wine_red.drop(columns = ['quality']).to_numpy(), wine_red['quality'].to_numpy(dtype = 'int')
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, stratify = y)

# 10 x 5-fold repeated cross-validation shared by all models.
# The seed is fixed on purpose: with random_state=None the splitter draws new
# folds on every call to split(), so two cross_val_score runs would be scored on
# different partitions. That makes results non-reproducible and breaks any
# fold-by-fold (paired) comparison of the resulting score arrays.
cv = RepeatedKFold(n_repeats = 10, n_splits = 5, random_state = 42)

In [30]:
# Boosted ordinal model with depth-6 regression trees as base learners.
# Early stopping: up to 1000 boosting iterations, stopping after 10 iterations
# without improvement, monitored on a non-stratified 20% validation split.
new_gbor = BoostedOrdinal(
    base_learner=DecisionTreeRegressor(max_depth=6),
    max_iter=1000,
    n_iter_no_change=10,
    lr_g=1e-1,
    lr_theta=1e-4,
    validation_fraction=0.2,
    validation_stratify=False,
)

# Score the untuned model with the shared repeated-CV splitter and time the run.
start_time = time.time()
my_cv = cross_val_score(new_gbor, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
end_time = time.time()
print(f'time: {round(end_time - start_time, 1)} sec')

print(my_cv)

time: 14.2 sec
[0.5377551  0.56326531 0.58877551 0.58529111 0.56996936 0.55
 0.57040816 0.58265306 0.57814096 0.56792646 0.55714286 0.5744898
 0.57653061 0.57303371 0.57201226 0.57346939 0.57653061 0.57755102
 0.5546476  0.57201226 0.5877551  0.6        0.57653061 0.56179775
 0.5587334  0.57755102 0.55408163 0.59081633 0.57099081 0.59856997
 0.60204082 0.59081633 0.58061224 0.5485189  0.57405516 0.56632653
 0.57346939 0.57346939 0.57916241 0.57405516 0.58469388 0.55204082
 0.57142857 0.57814096 0.56384065 0.5877551  0.55102041 0.58673469
 0.59856997 0.5587334 ]


In [31]:
# Average accuracy of the untuned boosted ordinal model over all CV folds.
np.mean(my_cv)

0.5734783306581059

In [27]:
# Nested cross-validation: the outer cross_val_score loop scores a GridSearchCV
# that tunes tree depth and the `lr_g` learning rate on each outer training
# split (the inner search uses GridSearchCV's default CV, since `cv` is not set).
# NOTE(review): the scores recorded for this cell are far below those of the
# untuned model; worth checking whether BoostedOrdinal handles clone/set_params
# for the nested `base_learner__*` parameter correctly -- TODO confirm.
parameter_grid2 = {
    'base_learner__max_depth': [3, 6, 9],
    'lr_g': [1e-1, 1e-2, 1e-3],
}
new_gridsearch = GridSearchCV(
    estimator=new_gbor,
    param_grid=parameter_grid2,
    scoring='accuracy',
    n_jobs=-1,
)

# Time the full nested run (every outer fold fits the whole grid).
start_time = time.time()
my_cv2 = cross_val_score(
    new_gridsearch, X, y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
end_time = time.time()
print(f'time: {round(end_time - start_time, 1)} sec')

print(my_cv2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  6.2min


time: 572.9 sec
[0.06428571 0.18163265 0.42244898 0.27477017 0.39938713 0.37653061
 0.26836735 0.40612245 0.39836568 0.46067416 0.40918367 0.4755102
 0.45204082 0.05720123 0.22778345 0.45612245 0.17653061 0.44693878
 0.17875383 0.37180797 0.45306122 0.05918367 0.28367347 0.19305414
 0.46271706 0.30204082 0.38469388 0.26836735 0.24923391 0.30132789
 0.22142857 0.41836735 0.45816327 0.47088866 0.30439224 0.1744898
 0.30816327 0.38877551 0.46578141 0.05107252 0.40918367 0.02346939
 0.04285714 0.19203269 0.18896834 0.43877551 0.14897959 0.47346939
 0.43207354 0.18283963]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.5min finished


In [4]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# Search space for the randomized search: tree depth is drawn from a discrete
# list, the `lr_g` learning rate from a log-uniform distribution on [1e-3, 1e-1].
parameter_distros = {
    'base_learner__max_depth': [3, 4, 5, 6, 7, 8, 9]
    , 'lr_g': loguniform(1e-3, 1e-1)
}
new_randomsearch = RandomizedSearchCV(
    new_gbor
    , parameter_distros
    , scoring = 'accuracy'
    , n_iter = 10
    # Fixed seed so the 10 sampled candidates are identical on every run and in
    # every outer fold; without it each fit draws a different candidate set and
    # the resulting scores cannot be reproduced.
    , random_state = 42
)

# Nested cross-validation: each outer fold runs the whole randomized search on
# its training split and is scored on its held-out split.
# NOTE(review): the scores recorded below this cell look like they were produced
# on a different, smaller data set than the one currently loaded -- re-run the
# cell before comparing against the other models.
start_time = time.time()
my_cv3 = cross_val_score(
    new_randomsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  3.0min


time: 298.3 sec
[0.603125   0.671875   0.571875   0.525      0.5799373  0.621875
 0.6        0.578125   0.546875   0.63322884 0.59375    0.6125
 0.571875   0.61875    0.57366771 0.58125    0.634375   0.58125
 0.58125    0.59561129 0.61875    0.553125   0.575      0.646875
 0.5862069  0.61875    0.6125     0.596875   0.63125    0.5862069
 0.5875     0.56875    0.596875   0.55625    0.61755486 0.6
 0.534375   0.59375    0.615625   0.63322884 0.625      0.634375
 0.5625     0.63125    0.60815047 0.6        0.578125   0.621875
 0.575      0.57053292]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.0min finished


In [5]:
#from scipy import stats

# Perform paired t-test
#t_stat, p_value = stats.ttest_rel(my_cv, my_cv2)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [14]:
from mord import LogisticIT
# Baseline: mord's LogisticIT ordinal model with alpha set to 0.0.
mord_model = LogisticIT(alpha=0.0)

# Score the baseline with the same repeated-CV splitter and time the run.
start_time = time.time()
my_cv4 = cross_val_score(
    mord_model, X, y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
end_time = time.time()
print(f'time: {round(end_time - start_time, 1)} sec')

print(my_cv4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   28.7s


time: 42.1 sec
[0.49489796 0.55408163 0.5255102  0.53524004 0.52298264 0.5244898
 0.53061224 0.53571429 0.53421859 0.51174668 0.51122449 0.53469388
 0.52959184 0.52706844 0.52400409 0.51530612 0.52959184 0.53469388
 0.54954035 0.50663943 0.52959184 0.52959184 0.50918367 0.53932584
 0.51378958 0.50408163 0.54489796 0.54489796 0.51685393 0.52400409
 0.53367347 0.52653061 0.49183673 0.53013279 0.54954035 0.49591837
 0.52857143 0.55918367 0.50766088 0.53217569 0.52755102 0.53469388
 0.50918367 0.51174668 0.5505618  0.51020408 0.52857143 0.53877551
 0.52911134 0.51787538]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.0s finished


In [32]:
# Mean CV accuracy: mord LogisticIT baseline first, untuned boosted model second.
# (The grid-search and randomized-search means, my_cv2 / my_cv3, are left out.)
(np.mean(my_cv4), np.mean(my_cv))

(0.5260313105834775, 0.5734783306581059)

In [9]:
from scipy import stats

# Paired t-test between the LogisticIT baseline and the grid-searched model.
# ttest_rel matches the i-th score of one array with the i-th score of the
# other, so the comparison is only meaningful if both arrays were produced on
# identical CV folds in the same order -- verify the splitter is seeded.
paired_result = stats.ttest_rel(my_cv4, my_cv2)
t_stat, p_value = paired_result
print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


Paired t-test: t-statistic = -2.099393074140664, p-value = 0.04095599477173379


In [None]:
#t_stat, p_value = stats.ttest_rel(my_cv2, my_cv3)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [21]:
# Class distribution of the shifted quality target: the distinct labels in `y`
# and the number of samples carrying each label.
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([  20,  163, 1457, 2198,  880,  175,    5], dtype=int64))