In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gbor.main import BoostedOrdinal
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import time
from sklearn.model_selection import cross_val_score, RepeatedKFold

#wine_red = pd.read_csv('../data/winequality-red.csv', sep = ';')
wine_red = pd.read_csv('../data/winequality-white.csv', sep = ';')
wine_red['quality'] = wine_red['quality'] - np.min(wine_red['quality'])

X, y = wine_red.drop(columns = ['quality']).to_numpy(), wine_red['quality'].to_numpy(dtype = 'int')
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, stratify = y)

cv = RepeatedKFold(n_repeats = 5, n_splits = 10)

In [2]:
new_gbor = BoostedOrdinal(
    n_iter_no_change = 10, max_iter = 1000
    , base_learner = DecisionTreeRegressor(max_depth = 6)
    , lr_g = 1e-1
    , lr_theta = 5#e-4
    , validation_stratify = False
    , validation_fraction = 0.2
)
start_time = time.time()
my_cv = cross_val_score(
    new_gbor, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv)

time: 7.9 sec
[0.58979592 0.58367347 0.58979592 0.5877551  0.55510204 0.5877551
 0.56122449 0.5755102  0.53578732 0.55010225 0.58163265 0.55510204
 0.57142857 0.57959184 0.54285714 0.56938776 0.57142857 0.54081633
 0.55623722 0.55623722 0.55918367 0.56938776 0.55306122 0.58367347
 0.57346939 0.58367347 0.54693878 0.6122449  0.55623722 0.55623722
 0.58571429 0.54897959 0.6244898  0.56530612 0.56122449 0.57959184
 0.6        0.56530612 0.54601227 0.56850716 0.55510204 0.58979592
 0.54897959 0.54897959 0.59387755 0.55102041 0.51632653 0.57142857
 0.6196319  0.56441718]


In [3]:
my_cv.mean()

0.5688003839572638

In [4]:
parameter_grid2 = {'base_learner__max_depth': [3, 6, 9], 'lr_g': [1e-1, 1e-2, 1e-3]}
new_gridsearch = GridSearchCV(
    new_gbor
    , parameter_grid2
    , scoring = 'accuracy'
    , n_jobs = -1
)
start_time = time.time()
my_cv2 = cross_val_score(
    new_gridsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  9.1min


time: 834.2 sec
[0.52244898 0.54081633 0.52653061 0.54489796 0.57142857 0.57755102
 0.49795918 0.55102041 0.56237219 0.52760736 0.51836735 0.56530612
 0.51428571 0.5244898  0.56530612 0.55714286 0.54897959 0.53469388
 0.57873211 0.55010225 0.54489796 0.56938776 0.52857143 0.54081633
 0.46734694 0.56326531 0.54693878 0.57346939 0.54805726 0.5603272
 0.56326531 0.54693878 0.51836735 0.56938776 0.54897959 0.50204082
 0.53877551 0.5877551  0.52556237 0.55214724 0.53061224 0.51632653
 0.57142857 0.55102041 0.55714286 0.47346939 0.52653061 0.60612245
 0.53169734 0.57055215]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 13.9min finished


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

parameter_distros = {
    'base_learner__max_depth': [3, 4, 5, 6, 7, 8, 9]
    , 'lr_g': loguniform(1e-3, 1e-1)
}
new_randomsearch = RandomizedSearchCV(
    new_gbor
    , parameter_distros
    , scoring = 'accuracy'
    , n_iter = 10
)
start_time = time.time()
my_cv3 = cross_val_score(
    new_randomsearch, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  7.1min


time: 741.4 sec
[0.52244898 0.53061224 0.53265306 0.5755102  0.52040816 0.58367347
 0.55510204 0.54081633 0.53783231 0.55214724 0.57959184 0.52244898
 0.52244898 0.56326531 0.54285714 0.51836735 0.56530612 0.5
 0.57668712 0.60327198 0.4877551  0.57755102 0.51428571 0.56938776
 0.49795918 0.52244898 0.55306122 0.53265306 0.57259714 0.54192229
 0.53469388 0.5244898  0.49387755 0.59183673 0.55510204 0.55102041
 0.5877551  0.51020408 0.55010225 0.55214724 0.53061224 0.54285714
 0.5877551  0.55306122 0.5755102  0.56938776 0.56734694 0.51836735
 0.58282209 0.55623722]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.4min finished


In [6]:
#from scipy import stats

# Perform paired t-test
#t_stat, p_value = stats.ttest_rel(my_cv, my_cv2)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [7]:
from mord import LogisticIT
mord_model = LogisticIT(alpha = 0.0)

start_time = time.time()
my_cv4 = cross_val_score(
    mord_model, X, y, cv = cv
    , scoring = 'accuracy'
    , n_jobs = -1
    , verbose = 1
)
end_time = time.time()
print('time: {} sec'.format(round(end_time - start_time, 1)))

print(my_cv4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   30.6s


time: 47.2 sec
[0.50204082 0.54489796 0.53469388 0.51020408 0.50612245 0.50612245
 0.53877551 0.52244898 0.56441718 0.53578732 0.54897959 0.54897959
 0.53673469 0.50816327 0.51020408 0.54489796 0.49183673 0.51632653
 0.51533742 0.54805726 0.4877551  0.49183673 0.48367347 0.55918367
 0.56530612 0.58571429 0.5244898  0.54285714 0.48670757 0.52147239
 0.46938776 0.52244898 0.58979592 0.52040816 0.48571429 0.51836735
 0.59795918 0.52653061 0.49488753 0.52351738 0.51836735 0.52244898
 0.5122449  0.54081633 0.51632653 0.53877551 0.55306122 0.52653061
 0.5194274  0.50306748]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   47.1s finished


In [8]:
my_cv4.mean(), my_cv.mean(), my_cv2.mean(), my_cv3.mean()

(0.5256821501606777,
 0.5688003839572638,
 0.5442247819373148,
 0.5470051333416802)

In [13]:
from scipy import stats

t_stat, p_value = stats.ttest_rel(my_cv2, my_cv3)
print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


Paired t-test: t-statistic = -0.5295268103868339, p-value = 0.5988311391917884


In [10]:
#t_stat, p_value = stats.ttest_rel(my_cv2, my_cv3)
#print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")


In [11]:
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([  20,  163, 1457, 2198,  880,  175,    5], dtype=int64))