In [1]:
folder = 'C:/Users/alire/OneDrive/data/asmahani_github/ordinal-boost/wine_quality'
filename_red, filename_white = 'winequality-red.csv', 'winequality-white.csv'

import os
import pandas as pd

def load_data(folder, filename):
    csv_path = os.path.join(folder, filename)
    return pd.read_csv(csv_path, sep = ';')

red_wine = load_data(folder, filename_red)
white_wine = load_data(folder, filename_white)

red_wine['quality'] = red_wine['quality'] - 3
white_wine['quality'] = white_wine['quality'] - 3

print(f'Shape of red wine: {red_wine.shape}')
print(f'Shape of white wine: {white_wine.shape}')

red_wine.head()
#print(white_wine.head())

X_red, y_red = red_wine.drop(columns = ['quality']).to_numpy(), red_wine['quality'].to_numpy(dtype = 'int')
X_white, y_white = white_wine.drop(columns = ['quality']).to_numpy(), white_wine['quality'].to_numpy(dtype = 'int')

Shape of red wine: (1599, 12)
Shape of white wine: (4898, 12)


In [2]:
#from gbor.main import BoostedOrdinal
import sys
sys.path.append('../locallib')
from main_v2 import BoostedOrdinal

gbor = BoostedOrdinal(predict_type = 'probs').fit(X_red, y_red)
gbor.predict(X_red[:10]).shape

(10, 6)

In [37]:
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, KFold
#my_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2,
#    random_state=123)
my_cv = RepeatedKFold(n_splits=2, n_repeats=10,
    #random_state=123
)
#my_cv = KFold(n_splits=5, shuffle=True, 
#              random_state=123
#              )
#my_cv = 10


In [38]:
# Create a scorer object for use in scikit-learn
from main_v2 import concordance_index
from sklearn.metrics import make_scorer
concordance_index_scorer = make_scorer(concordance_index, greater_is_better=True)

In [39]:
from sklearn.model_selection import cross_val_score

scores_red_baseline = cross_val_score(
    BoostedOrdinal(predict_type='latent'), 
    X_red, y_red, cv = my_cv, 
    scoring = concordance_index_scorer,
    n_jobs = -1
)
scores_red_baseline.mean()

0.8122921649358906

In [40]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
#rfc = GradientBoostingClassifier()
rfc = GradientBoostingRegressor()
scores_rfc_red = cross_val_score(
    rfc, X_red, y_red, cv = my_cv, 
    scoring = concordance_index_scorer,
    n_jobs = -1
)
scores_rfc_red.mean()

0.8088403401493436

In [41]:
# t-test of the two models
from scipy.stats import ttest_rel
ttest_rel(scores_red_baseline, scores_rfc_red)

TtestResult(statistic=1.2836430916182746, pvalue=0.21470179339697945, df=19)

In [42]:
from sklearn.metrics import log_loss, make_scorer

def my_neg_log_loss_scorer(y_true, y_pred):
    return -log_loss(y_true, y_pred, labels=[0, 1, 2, 3, 4, 5])

custom_scorer = make_scorer(my_neg_log_loss_scorer, greater_is_better=True, response_method='predict')


In [43]:
scores_red_baseline_log_loss = cross_val_score(
    BoostedOrdinal(predict_type='probs', n_class=6), 
    X_red, y_red, cv = my_cv, 
    #scoring = 'neg_log_loss',
    scoring = custom_scorer,
    n_jobs = 1
)
scores_red_baseline_log_loss.mean()

-0.9675174922948793

In [45]:
rfc = GradientBoostingClassifier()
scores_rfc_red_log_loss = cross_val_score(
    rfc, X_red, y_red, cv = my_cv, 
    scoring = 'neg_log_loss',
    #scoring = custom_scorer,
    n_jobs = 1
)
scores_rfc_red_log_loss.mean()

-1.1072253961349905

In [None]:
scores_red_baseline_log_loss

In [29]:
obj = BoostedOrdinal(n_class=6).fit(X_red, y_red)

In [20]:
probs = obj.predict_proba(X_red)

In [21]:
import numpy as np

assert np.all(probs >= 0) and np.all(probs <= 1), "Probabilities must be between 0 and 1."
assert np.allclose(probs.sum(axis=1), 1), "Probabilities for each sample must sum to 1."


In [None]:
np.log(np.min(probs))

In [None]:
obj.n_class

In [None]:
probs.shape

In [None]:
from sklearn.metrics import log_loss
print("Log loss:", log_loss(y_red, probs))


In [26]:
assert np.all(np.isfinite(probs)), "Probabilities must be finite (no NaN or inf)."

In [3]:
from sklearn.utils.estimator_checks import check_estimator
check_estimator(BoostedOrdinal())  # Will raise detailed errors if something is missing

AssertionError: `BoostedOrdinal.fit()` does not set the `n_features_in_` attribute. You might want to use `sklearn.utils.validation.validate_data` instead of `check_array` in `BoostedOrdinal.fit()` which takes care of setting the attribute.

In [4]:
from sklearn.base import is_classifier, is_regressor
is_classifier(BoostedOrdinal()), is_regressor(BoostedOrdinal())

(False, False)

In [5]:
print(is_classifier(BoostedOrdinal()))

False
