In [1]:
import random
import numpy as np
import pandas as pd
from deap import gp as deap_gp
import gp
from data import get_embeddings

In [2]:
seed = 1126
random.seed(seed)

In [3]:
class config:
    def __init__(
        self,
        algorithm,
        embedding_type,
        dimension,
        population_size,
        crossover_method,
        cross_prob,
        mut_prob,
        num_generations,
        num_evaluations,
        debug,
    ):
        self.algorithm = algorithm
        self.embedding_type = embedding_type
        self.dimension = dimension
        self.population_size = population_size
        self.crossover_method = crossover_method
        self.cross_prob = cross_prob
        self.mut_prob = mut_prob
        self.num_generations = num_generations
        self.num_evaluations = num_evaluations
        self.debug = debug

In [4]:
Config = config("simple_gp",
        "word2vec",
        10,
        100,
        "cx_random",
        0.9,
        0.1,
        100,
        1000,
        False)

In [16]:
data, embeddings, embedding_model = get_embeddings(
        Config.embedding_type, Config.dimension
    )

cx_method = gp.get_cx_num(Config.crossover_method)

# Initialize instance weights
data["weights"] = 1.0 / len(data)
data["weights_update"] = 1.0 / len(data)

iboost = 10  # Boosting interval
ensemble = []  # Ensemble to store the best individuals
loss = "linear"
learning_rate = 1.0
sample_weight = np.array(data["weights_update"])
num_ensemble = 20




In [None]:
gpab = gp.GP(
    Config.algorithm,
    Config.embedding_type,
    Config.dimension,
    Config.population_size,
    cx_method,
    Config.cross_prob,
    Config.mut_prob,
    Config.num_generations,
    Config.num_evaluations,
    data,
    embeddings,
)
gpab.initialize_pop()

In [25]:
def get_X(trees):
        X_list = np.array([np.array([trees.embeddings[char] for char in words]) for words in trees.inputword])
        return X_list

def get_predict(trees, individual):
        func = deap_gp.compile(individual, gpab.pset)
        y_pred_list = np.array([func(*np.array([trees.embeddings[char] for char in words])) for words in trees.inputword])
        return y_pred_list

def get_y(trees):
    y_true_list = np.array(np.array([trees.embeddings[char] for char in trees.realword]))
    return y_true_list

In [7]:
iboost = 1

## Adaboost Regressor

In [35]:
gpab.n_gen = 0
# while gpab.n_gen < gpab.max_gen:
while gpab.n_gen < 2:
    gpab.select()

    epsilon = np.finfo(sample_weight.dtype).eps
    zero_weight_mask = sample_weight == 0.0

    # Boosting
    if gpab.n_gen % iboost == 0:
        # for iboost in range(len(num_ensemble)):

        # Get the best individual
        best_ind = max(gpab.pop, key=lambda x: x.fitness.values)
        # Avoid extremely small weights
        sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None)
        sample_weight[zero_weight_mask] = 0.0

        # Boosting step
        X = get_X(gpab)
        y = get_y(gpab)
        sample_weight, estimator_weight, estimator_error = boosting(iboost, gpab, best_ind, X, y, sample_weight, learning_rate, loss)

        # Early termination
        if sample_weight is None:
            break

        # Stop if error is zero
        if estimator_error == 0:
            break

        sample_weight_sum = np.sum(sample_weight)

        if not np.isfinite(sample_weight_sum):
            warnings.warn(
                (
                    "Sample weights have reached infinite values,"
                    f" at iteration {iboost}, causing overflow. "
                    "Iterations stopped. Try lowering the learning rate."
                ),
                stacklevel=2,
            )
            break

        # Stop if the sum of sample weights has become non-positive
        if sample_weight_sum <= 0:
            break

        if iboost < num_ensemble - 1:
            # Normalize
            sample_weight /= sample_weight_sum

        print("Sample weight: ", sample_weight)

        # Update the population dataset
        select_new_data = np.random.uniform(0, 1, len(data))
        data["cumulative_weights"] = data["weights_update"].cumsum()
        # Find the indices of the closest rows in cumulative_weights for each value in select_new_data
        indices = np.digitize(select_new_data, data["cumulative_weights"])
        # Create new dataset by selecting rows from original dataset based on indices
        new_dataset = data.iloc[indices].reset_index(drop=True)

        gpab.data = new_dataset
        # Evaluate the entire population
        fitnesses = map(gpab.toolbox.evaluate, gpab.pop)
        for ind, fit in zip(gpab.pop, fitnesses):
            ind.fitness.values = fit
    print("Generation: ", gpab.n_gen)
    gpab.n_gen += 1

Sample weight:  [0.00042234 0.00035713 0.00043287 ... 0.0004323  0.00047308 0.00042408]


In [15]:
sample_weight

array([0.00037467, 0.00037467, 0.00037467, ..., 0.00037467, 0.00037467,
       0.00037467])

In [31]:
def boosting(iboost, gpab, best_ind, X, y, sample_weight, learning_rate, loss):
    y_pred = get_predict(gpab, best_ind)

    error_vect = np.linalg.norm(y - y_pred, axis=1)
    sample_mask = sample_weight > 0
    masked_sample_weight = sample_weight[sample_mask]
    masked_error_vector = error_vect[sample_mask]

    error_max = masked_error_vector.max()
    if error_max != 0:
        masked_error_vector /= error_max

    if loss == "square":
        masked_error_vector **= 2
    elif loss == "exponential":
        masked_error_vector = 1.0 - np.exp(-masked_error_vector)

    # Culcalate the average loss
    estimator_error = (masked_sample_weight * masked_error_vector).sum()
    if estimator_error <= 0:
        # Stop if fit is perfect
        return sample_weight, 1.0, 0.0
    elif estimator_error >= 0.5:
        # Discard the estimator if worse than random guessing and it isn't the only one
        if len(ensemble) > 0:
            ensemble.pop(-1)
        return None, None, None

    beta = estimator_error / (1.0 - estimator_error)

    # Boost weight using AdaBoost.R2 algorithm
    estimator_weight = learning_rate * np.log(1.0 / beta)

    if not iboost == num_ensemble - 1:
        sample_weight[sample_mask] *= np.power(
        beta, (1.0 - masked_error_vector) * learning_rate
    )

    return sample_weight, estimator_weight, estimator_error

### Try

In [None]:

sample_weight = np.array(data["weights_update"])
sample_weight.shape

(2669,)

In [None]:
y_pred = get_predict(gpab)
y = get_y(gpab)

In [290]:
# line 1
error_vect = np.linalg.norm(y - y_pred, axis=1)
error_vect.shape

(2669,)

In [297]:
# line 2
sample_mask = sample_weight > 0
sample_mask.shape

(2669,)

In [298]:
# line 3
masked_sample_weight = sample_weight[sample_mask]
masked_sample_weight.shape

(2669,)

In [299]:
# line 4
masked_error_vector = error_vect[sample_mask]
masked_error_vector.shape

(2669,)

In [300]:
# line 5
error_max = masked_error_vector.max()
error_max.shape

()

In [301]:
error_max

1082835.4

In [302]:
# line 6
if error_max != 0:
    masked_error_vector /= error_max
masked_error_vector

array([4.8427035e-05, 4.3844911e-06, 1.7800233e-04, ..., 1.4145022e-03,
       6.5471912e-05, 9.6160511e-06], dtype=float32)

In [305]:
# line 7
if loss == "square":
    masked_error_vector **= 2
elif loss == "exponential":
    masked_error_vector = 1.0 - np.exp(-masked_error_vector)

In [314]:
# line 8
estimator_error = (masked_sample_weight * masked_error_vector).sum()
estimator_error

0.0006279703240808366

In [307]:
# line 9
# Calculate the average loss
if estimator_error <= 0:
    # Stop if fit is perfect
    return sample_weight
elif estimator_error >= 0.5:
    # Discard the estimator if worse than random guessing and it isn't the only one
    if len(ensemble) > 0:
        ensemble.pop(-1)
    return None, None, None


SyntaxError: 'return' outside function (2712928142.py, line 3)

In [315]:
# line 10
beta = estimator_error / (1.0 - estimator_error)
beta

0.0006283649186024124

In [316]:
# line 11
estimator_weight = learning_rate * np.log(1.0 / beta)
estimator_weight

7.372389479680931

In [313]:
# line 12
if not iboost == len(ensemble) - 1:
    sample_weight[sample_mask] *= np.power(
        beta, (1.0 - masked_error_vector) * learning_rate
    )

In [None]:
# line 13
return sample_weight, estimator_weight, estimator_error

In [172]:
class GPABRegressor():
    """
    Parameters
    ----------
    estimator: deap.gp object
        The base estimator from which the boosted ensemble is built.

    n_estimators: int
        The number of estimators to train, a.k.a. the number of population of GP trees.

    learning_rate: float, default=1.0
        The learning rate of the boosting algorithm.

    loss: {'linear', 'square', 'exponential'}, optional
        The loss function to use when updating the weights after each iteration.

    Attributes
    ----------
    estimators_: estimator
        The base estimator from which the ensemble is grown.

    estimators_: list of regressors
        The collection of fitted sub-estimators.

    estimator_weights_: array-like of shape (n_estimators,)
        Weights for each estimator in the boosted ensemble.

    """
    def __init__(self, estimator, n_estimators, learning_rate=1, loss="linear"):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.loss = loss


    def fit(self, X, y, sample_weight=None):
        """
        Build a boosted ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X: array
            The training input samples (length = 5)

        y: array
            The target values (real numbers)

        sample_weight: array
            The sample weights. If None, the sample weights are initialized to 1 / n_samples.

        Returns
        -------
        self: object
            Fitted estimator.
        """


        # sample_weight /= sample_weight.sum()

        # Clear any previous fit
        # self.estimators_ = []
        # self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        # self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
        epsilon = np.finfo(sample_weight).eps




        zero_weight_mask = sample_weight == 0
        for iboost in range(self.n_estimators):

            # Avoid extremely small sample weight
            sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None)
            sample_weight[zero_weight_mask] = 0.0

            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(iboost, X, y, sample_weight)

            # Early stopping
            if sample_weight is None:
                break
            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)
            if not np.isfinite(sample_weight_sum):
                warnings.warn(
                    (
                        "Sample weights have reached infinite values,"
                        f" at iteration {iboost}, causing overflow. "
                        "Iterations stopped. Try lowering the learning rate."
                    ),
                    stacklevel=2,
                )
                break

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize the sample weights
                sample_weight /= sample_weight_sum

        return self

    def _boost(self, iboost, X, y, sample_weight):
        """
        Implement a single boost iteration.

        Perform a single boost according to the AdaBoost.R2 algorithm and return the updated sample weights.

        Parameters
        ----------
        iboost: int
            The current boosting iteration.

        X: array
            The training input samples (length = 5)

        y: array
            The target values (real numbers)

        sample_weight: array
            The current sample weights.

        Returns
        -------
        sample_weight: array
            The updated sample weights.

        estimator_weight: float
            The weight of the estimator.

        estimator_error: float
            The error of the estimator.
        """
        estimator = self.estimator

        # Weighted sampling of the training data with replacement
        # boostrap_idx = np.random.choice(random_state, size=len(X), replace=True, p=sample_weight)


        # Fit on the bootstrapped sample and obtain the predictions
        # estimator.fit(X, y)
        # estimator.select()
        y_pred = self.predict(estimator)

        error_vect = np.abs(y_pred - y)
        sample_mask = sample_weight > 0
        masked_sample_weight = sample_weight[sample_mask]
        masked_error_vector = error_vect[sample_mask]

        error_max = np.max(error_vect[sample_mask])
        if error_max != 0:
            masked_error_vector /= error_max

        if self.loss == "square":
            masked_error_vector **= 2
        elif self.loss == "exponential":
            masked_error_vector = 1.0 - np.exp(-masked_error_vector)

        # Calculate the average loss
        estimator_error = (masked_sample_weight * masked_error_vector).sum()

        if estimator_error <= 0:
            # Stop if fit is perfect
            return sample_weight, 1.0, 0.0
        elif estimator_error >= 0.5:
            # Discard current estimator only if it isn't the only one
            if len(self.estimators_) > 1:
                self.estimators_.pop(-1)
            return None, None, None

        beta = estimator_error / (1.0 - estimator_error)

        # Boost weight using AdaBoost.R2 algo
        estimator_weight = self.learning_rate * np.log(1.0 / beta)

        if not iboost == self.n_estimators - 1:
            # Update the sample weights
            sample_weight[sample_mask] *= np.power(beta, 1.0 - masked_error_vector * self.learning_rate)

    # def _get_median_predict(self, X, limit):
        # Evaluate predictions of all estimators (ensemble)
        # predictions = np.array([estimator.predict(X) for estimator in self.estimators_[:limit]])
        # predictions = np.array([func(*np.array([gpab.embeddings[char] for char in words])) for words in gpab.inputword])

        # # Sort the predictions
        # sorted_idx = np.argsort(predictions, axis=1)

        # # Find index of median prediction for each sample
        # weight_cdf = np.cumsum(self.estimator_weights_[:limit], axis=1)
        # median_or_above = weight_cdf >= 0.5
        # median_idx = np.argmax(median_or_above, axis=1)

        # median_estimators = sorted_idx[np.arange(len(X), median_idx)]

        # # Return the median prediction
        # return predictions[np.arange(len(X)), median_estimators]

    def predict(self, X):
        """
        Predict the target values.

        Parameters
        ----------
        X: array
            The input samples.

        Returns
        -------
        y_pred: array
            The predicted target values.
        """
        # Get the median prediction
        return get_predict(estimator)



In [173]:
regr_1 = GPABRegressor(estimator=gpab, n_estimators=gpab.pop_size, learning_rate=1.0, loss="linear")

In [174]:
regr_1.estimator, regr_1.n_estimators, regr_1.learning_rate, regr_1.loss

(<gp.GP at 0x7f53af1aaf40>, 100, 1.0, 'linear')

In [189]:
X = get_X(gpab)
y = get_y(gpab)
data["weights_update"] = 1.0 / len(data)

In [None]:
data["weights_update"]

In [None]:


regr_1.fit(X, y, )

In [21]:
gpab.inputword

69542                [teary, adler, tells, of, family]
142293    [ethiopia, tigray, refugees, sudan, eritrea]
188137           [vic, corruption, fighter, tells, of]
123849                [fowler, fury, set, for, crisis]
18781                 [love, pleads, guilty, to, drug]
                              ...                     
166473      [tendulkar, confident, ahead, of, special]
265261               [russia, to, build, reactors, in]
17271           [hotel, for, former, academy, cinemas]
7032             [voss, out, fletcher, faces, nervous]
52567       [abbott, backs, morrisons, asylum, seeker]
Name: 0, Length: 2669, dtype: object

In [20]:
gpab.inputword.iloc[0]

['teary', 'adler', 'tells', 'of', 'family']

In [39]:
str(one_tree)

'protected_div(square(c), b)'

In [25]:
one_tree = gpab.pop[0]
func = deap_gp.compile(one_tree, gpab.pset)
func

<function <lambda>(a, b, c, d, e)>

In [49]:
one_sentence = gpab.inputword.iloc[0]
one_sentence

['teary', 'adler', 'tells', 'of', 'family']

In [118]:
X = np.array([gpab.embeddings[char] for char in one_sentence])
X.shape

(5, 10)

In [69]:
for words in gpab.inputword:
    X = np.array([gpab.embeddings[char] for char in words])
    y_pred = func(*X)
y_pred

array([-4.3991321e-01, -1.2687312e-02,  3.8116419e-01,  4.7464702e-02,
       -6.3511804e-02, -3.0491473e-03,  6.4744306e-04,  1.7864481e+00,
       -4.4006062e-01,  2.3497958e-03], dtype=float32)

In [73]:
y_pred_list = np.array([func(*np.array([gpab.embeddings[char] for char in words])) for words in gpab.inputword])
y_pred_list.shape

(2669, 10)

In [79]:
len([gpab.embeddings[char] for char in gpab.realword])

2669

In [81]:
y_true_list = np.array(np.array([gpab.embeddings[char] for char in gpab.realword]))
y_true_list.shape

(2669, 10)

In [100]:
len(gpab.inputword)

2669

In [115]:
def update_weight(trees):
    differences = np.zeros((len(trees.inputword)))
    num_data = 0
    for idx, tree in enumerate(trees.pop):
        func = deap_gp.compile(tree, trees.pset)
        # print(f"tree: {tree}")

        y_preds_for_one_tree = np.array([func(*np.array([trees.embeddings[char] for char in words])) for words in trees.inputword])

        y_trues_for_one_tree = np.array(np.array([trees.embeddings[char] for char in trees.realword]))

        # Calculate the difference
        differences_for_one_tree = np.linalg.norm((y_preds_for_one_tree - y_trues_for_one_tree), axis=1)
        # print(f"differences_for_one_tree: {differences_for_one_tree.shape}")

        differences[num_data] = differences_for_one_tree[0]

        num_data += 1

    print(f"differences: {differences.shape}")

    # Find the supremum (maximum) of these differences
    D = np.max(differences)
    print(f"D: {D}")

    L_1 = differences / D
    L_2 = np.square(differences) / np.square(D)
    L_3 = 1 - np.exp(-differences / D)
    print(f"L_1: {L_1.shape}, L_2: {L_2.shape}, L_3: {L_3.shape}")
    L = np.mean(np.stack((L_1, L_2, L_3), axis=0), axis=0)
    print(f"L: {L.shape}")

    return L

result_L = update_weight(gpab)
result_L

differences: (2669,)
D: 11136956.0
L_1: (2669,), L_2: (2669,), L_3: (2669,)
L: (2669,)


array([1.04124756e-05, 1.67119513e-05, 1.08809853e-06, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [117]:
beta = result_L / (1 - result_L)
beta.shape

(2669,)

In [None]:
data["weights_update"] = data["weights"] * np.exp(beta * )