In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gbor.main import BoostedOrdinal
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import cross_val_score, RepeatedKFold

wine_red = pd.read_csv('../data/winequality-red.csv', sep = ';')
#wine_red = pd.read_csv('../data/winequality-white.csv', sep = ';')
wine_red['quality'] = wine_red['quality'] - np.min(wine_red['quality'])

X, y = wine_red.drop(columns = ['quality']).to_numpy(), wine_red['quality'].to_numpy(dtype = 'int')
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, stratify = y)

cv = RepeatedKFold(n_repeats = 10, n_splits = 5)

In [13]:
new_gbor = BoostedOrdinal(
    n_iter_no_change = 10, max_iter = 1000
    , base_learner = DecisionTreeRegressor(max_depth = 6)
    , lr_g = 1e-1
    , lr_theta = 5e-4
    , validation_stratify = False
    , validation_fraction = 0.2
)
start_time = time.time()
my_cv = cross_val_score(
    new_gbor, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv)

time: 5.5 sec
[0.64375    0.58125    0.678125   0.6125     0.63636364 0.640625
 0.628125   0.615625   0.64375    0.5799373  0.6375     0.696875
 0.6125     0.575      0.63009404 0.61875    0.634375   0.596875
 0.6375     0.65517241 0.61875    0.6875     0.578125   0.6125
 0.62382445 0.63125    0.64375    0.6125     0.703125   0.60188088
 0.671875   0.56875    0.621875   0.671875   0.60188088 0.640625
 0.58125    0.590625   0.675      0.59561129 0.6375     0.640625
 0.6        0.6125     0.63009404 0.6        0.6        0.640625
 0.6625     0.62695925]


In [14]:
my_cv.mean()

0.6267613636363637

In [None]:
parameter_grid2 = {'base_learner__max_depth': [3, 6, 9], 'lr_g': [1e-1, 1e-2, 1e-3]}
new_gridsearch = GridSearchCV(
    new_gbor
    , parameter_grid2
    , scoring = 'accuracy'
    , n_jobs = -1
)
start_time = time.time()
my_cv2 = cross_val_score(
    new_gridsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv2)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

parameter_distros = {
    'base_learner__max_depth': [3, 4, 5, 6, 7, 8, 9]
    , 'lr_g': loguniform(1e-3, 1e-1)
}
new_randomsearch = RandomizedSearchCV(
    new_gbor
    , parameter_distros
    , scoring = 'accuracy'
    , n_iter = 10
)
start_time = time.time()
my_cv3 = cross_val_score(
    new_randomsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv3)

In [None]:
#from scipy import stats

# Perform paired t-test
#t_stat, p_value = stats.ttest_rel(my_cv, my_cv2)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [15]:
from mord import LogisticIT
mord_model = LogisticIT(alpha = 0.0)

start_time = time.time()
my_cv4 = cross_val_score(
    mord_model, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.3s


time: 11.6 sec
[0.603125   0.56875    0.6375     0.571875   0.56426332 0.6125
 0.571875   0.590625   0.5875     0.59561129 0.596875   0.59375
 0.6        0.596875   0.56426332 0.58125    0.525      0.565625
 0.6375     0.63009404 0.609375   0.58125    0.596875   0.60625
 0.5799373  0.615625   0.578125   0.615625   0.559375   0.59561129
 0.59375    0.584375   0.55625    0.609375   0.56739812 0.603125
 0.54375    0.56875    0.628125   0.5830721  0.65625    0.584375
 0.59375    0.571875   0.55172414 0.6125     0.571875   0.56875
 0.596875   0.59874608]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.5s finished


In [16]:
my_cv4.mean(), my_cv.mean()#, my_cv2.mean(), my_cv3.mean()

(0.589551920062696, 0.6267613636363637)

In [17]:
from scipy import stats

t_stat, p_value = stats.ttest_rel(my_cv4, my_cv)
print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


Paired t-test: t-statistic = -7.268203255014478, p-value = 2.551137812715156e-09


In [None]:
#t_stat, p_value = stats.ttest_rel(my_cv2, my_cv3)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [None]:
np.unique(y, return_counts=True)