In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gbor.main import BoostedOrdinal
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import cross_val_score, RepeatedKFold

wine_red = pd.read_csv('../data/winequality-red.csv', sep = ';')
#wine_red = pd.read_csv('../data/winequality-white.csv', sep = ';')
wine_red['quality'] = wine_red['quality'] - np.min(wine_red['quality'])

X, y = wine_red.drop(columns = ['quality']).to_numpy(), wine_red['quality'].to_numpy(dtype = 'int')
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, stratify = y)

cv = RepeatedKFold(n_repeats = 5, n_splits = 10)

In [10]:
new_gbor = BoostedOrdinal(
    n_iter_no_change = 10, max_iter = 10000
    , base_learner = DecisionTreeRegressor(max_depth = 6)
    , lr_g = 1e-1
    , lr_theta = 1#e-4
    , validation_stratify = False
    , validation_fraction = 0.2
    , reltol = 0.0
)
start_time = time.time()
my_cv = cross_val_score(
    new_gbor, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = 10
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv)
print(my_cv.mean())

time: 6.4 sec
[0.58125    0.6        0.5875     0.60625    0.65625    0.66875
 0.65625    0.6125     0.64375    0.62893082 0.64375    0.58125
 0.6        0.54375    0.65625    0.61875    0.68125    0.725
 0.56875    0.64779874 0.5875     0.66875    0.6625     0.675
 0.5875     0.6        0.56875    0.5625     0.65625    0.6918239
 0.59375    0.63125    0.65625    0.60625    0.6        0.6375
 0.61875    0.63125    0.6375     0.58490566 0.70625    0.625
 0.63125    0.6        0.61875    0.63125    0.625      0.61875
 0.5875     0.58490566]
0.6238922955974843


In [12]:
parameter_grid2 = {'base_learner__max_depth': [3, 6, 9], 'lr_g': [1e0, 1e-1, 1e-2]}
new_gridsearch = GridSearchCV(
    new_gbor
    , parameter_grid2
    , scoring = 'accuracy'
    , n_jobs = 1
)
start_time = time.time()
my_cv2 = cross_val_score(
    new_gridsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = 10
    , verbose = 2
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv2)
print(my_cv2.mean())

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  7.6min


time: 840.7 sec
[0.60625    0.56875    0.61875    0.61875    0.55       0.66875
 0.5875     0.65       0.59375    0.60377358 0.675      0.575
 0.6625     0.5875     0.575      0.6625     0.56875    0.60625
 0.60625    0.62893082 0.56875    0.65625    0.625      0.59375
 0.6375     0.64375    0.575      0.575      0.61875    0.64150943
 0.6125     0.6        0.59375    0.60625    0.575      0.58125
 0.625      0.63125    0.64375    0.59119497 0.61875    0.5875
 0.6        0.63125    0.59375    0.65       0.61875    0.61875
 0.61875    0.69811321]
0.6128954402515723


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 14.0min finished


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

parameter_distros = {
    'base_learner__max_depth': [3, 4, 5, 6, 7, 8, 9]
    , 'lr_g': loguniform(1e-2, 1e0)
}
new_randomsearch = RandomizedSearchCV(
    new_gbor
    , parameter_distros
    , scoring = 'accuracy'
    , n_iter = 10
    , n_jobs = 1
)
start_time = time.time()
my_cv3 = cross_val_score(
    new_randomsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = 10
    , verbose = 2
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv3)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  4.1min


time: 571.5 sec
[0.6125     0.5625     0.66875    0.6125     0.6375     0.59375
 0.61875    0.58125    0.5875     0.61006289 0.63125    0.60625
 0.56875    0.6        0.6625     0.64375    0.60625    0.56875
 0.63125    0.56603774 0.6125     0.55       0.56875    0.5875
 0.6625     0.65625    0.66875    0.56875    0.56875    0.66037736
 0.63125    0.63125    0.53125    0.5875     0.59375    0.6375
 0.58125    0.675      0.55625    0.6163522  0.5875     0.575
 0.53125    0.63125    0.56875    0.6375     0.575      0.59375
 0.6625     0.67295597]


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:  9.5min finished


In [14]:
print(my_cv3.mean())

0.6070157232704403


In [None]:
#from scipy import stats

# Perform paired t-test
#t_stat, p_value = stats.ttest_rel(my_cv, my_cv2)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [15]:
from mord import LogisticIT
mord_model = LogisticIT(alpha = 0.0)

start_time = time.time()
my_cv4 = cross_val_score(
    mord_model, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.7s


time: 7.5 sec
[0.63125    0.59375    0.6125     0.56875    0.65       0.58125
 0.56875    0.575      0.575      0.59119497 0.55625    0.625
 0.625      0.6375     0.58125    0.63125    0.59375    0.5875
 0.53125    0.53459119 0.65       0.6        0.55       0.58125
 0.5125     0.625      0.65       0.59375    0.575      0.54716981
 0.64375    0.6125     0.55625    0.63125    0.63125    0.5875
 0.48125    0.56875    0.59375    0.57232704 0.61875    0.575
 0.63125    0.54375    0.60625    0.58125    0.5625     0.63125
 0.575      0.58490566]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.3s finished


In [16]:
my_cv4.mean(), my_cv.mean(), my_cv2.mean(), my_cv3.mean()

(0.5904787735849056,
 0.6238922955974843,
 0.6128954402515723,
 0.6070157232704403)

In [19]:
from scipy import stats

t_stat, p_value = stats.ttest_rel(my_cv, my_cv2)
print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


Paired t-test: t-statistic = 1.6182284563048241, p-value = 0.11203328470181405
