In [103]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy.stats import norm

class BoostedOrdinal(BaseEstimator, RegressorMixin):
    def __init__(self, base_learner = DecisionTreeRegressor(), max_iter=100, tol=1e-4):
        self.base_learner = base_learner
        self.max_iter = max_iter
        self.tol = tol
    
    def _validate_ordinal(arr):
        """
        Check if the unique values in a numpy integer vector are 0, 1, ..., M with M >= 2.
    
        Parameters:
        arr (numpy.ndarray): Input numpy integer vector.
    
        Returns:
        bool: True if unique values are 0, 1, ..., M with M >= 2, False otherwise.
        """
        if not isinstance(arr, np.ndarray):
            raise ValueError("Input must be a numpy array")
        if arr.dtype.kind not in {'i', 'u'}:
            raise ValueError("Input array must contain integers")
    
        unique_values = np.unique(arr)
        
        if unique_values[0] != 0:
            return []
        
        M = unique_values[-1]

        if M < 2:
            return []
        
        expected_values = np.arange(M + 1)

        if np.array_equal(unique_values, expected_values):
            #return M + 1
            return [np.where(arr == m) for m in unique_values]
        else:
            return []
    
    def _initialize_thresholds(y):
        # Calculate the initial threshold vector
        n_samples = len(y)
        n_class = np.max(y) + 1
        P = np.array([np.sum(y == i) for i in range(n_class)]) / n_samples
        return norm.ppf(np.cumsum(P[:-1]))
    
    def _pad_thresholds(theta):
        return np.insert(theta, [0, theta.size], [-np.inf, np.inf])
    
    def _derivative_threshold(X, ylist, thresh, g):
        thresh_padded = BoostedOrdinal._pad_thresholds(thresh)
        M = len(thresh)
        ret = []
        for m in range(M):
            S_m = ylist[m]
            S_mp1 = ylist[m+1]
            v1 = np.sum(norm.pdf(thresh_padded[m+1] - g[S_m]) / (norm.cdf(thresh_padded[m+1] - g[S_m]) - norm.cdf(thresh_padded[m] - g[S_m])))
            v2 = np.sum(norm.pdf(thresh_padded[m+1] - g[S_mp1]) / (norm.cdf(thresh_padded[m+2] - g[S_mp1]) - norm.cdf(thresh_padded[m+1] - g[S_mp1])))
            ret.append(-v1 + v2)
        return ret

    def _derivative_g(X, y, thresh, g):
        thresh_padded = BoostedOrdinal._pad_thresholds(thresh)
        ret = (norm.pdf(thresh_padded[y] - g) - norm.pdf(thresh_padded[y+1] - g)) / (norm.cdf(thresh_padded[y+1] - g) - norm.cdf(thresh_padded[y] - g))
        return ret


In [63]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Create a sample dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=3, n_informative=5, random_state=0)
#indices = BoostedOrdinal._validate_ordinal(y)
#print(len(indices))

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

3


In [104]:
my_indices = BoostedOrdinal._validate_ordinal(y_train)
theta_init = BoostedOrdinal._initialize_thresholds(y_train)
g_init = np.zeros(X_train.shape[0])
#BoostedOrdinal._derivative_threshold(X_train, my_indices, theta_init, g_init)
BoostedOrdinal._derivative_g(X_train, y_train, theta_init, g_init)

array([ 0.00969938,  0.00969938, -1.08750399, -1.08750399, -1.08750399,
        1.10241312, -1.08750399,  1.10241312, -1.08750399, -1.08750399,
       -1.08750399, -1.08750399, -1.08750399,  0.00969938,  0.00969938,
       -1.08750399,  0.00969938,  0.00969938, -1.08750399,  0.00969938,
        1.10241312, -1.08750399,  1.10241312, -1.08750399, -1.08750399,
        0.00969938,  1.10241312,  1.10241312,  1.10241312,  1.10241312,
        1.10241312, -1.08750399, -1.08750399,  0.00969938,  1.10241312,
        0.00969938,  0.00969938,  1.10241312,  0.00969938,  1.10241312,
       -1.08750399,  0.00969938, -1.08750399,  1.10241312, -1.08750399,
       -1.08750399,  0.00969938,  1.10241312,  1.10241312,  1.10241312,
       -1.08750399, -1.08750399, -1.08750399,  1.10241312,  1.10241312,
        0.00969938,  0.00969938,  1.10241312,  1.10241312,  1.10241312,
        0.00969938, -1.08750399,  1.10241312,  1.10241312, -1.08750399,
       -1.08750399, -1.08750399, -1.08750399,  0.00969938,  0.00

In [72]:
theta_init

array([-0.42614801,  0.44682697])

In [73]:
len(my_indices)

3

In [84]:
norm.cdf(np.inf)

1.0