In [65]:
import random
import numpy as np
from deap import gp as deap_gp
import gp
from data import get_embeddings

In [2]:
seed = 1126
random.seed(seed)

In [3]:
class config:
    def __init__(
        self,
        algorithm,
        embedding_type,
        dimension,
        population_size,
        crossover_method,
        cross_prob,
        mut_prob,
        num_generations,
        num_evaluations,
        debug,
    ):
        self.algorithm = algorithm
        self.embedding_type = embedding_type
        self.dimension = dimension
        self.population_size = population_size
        self.crossover_method = crossover_method
        self.cross_prob = cross_prob
        self.mut_prob = mut_prob
        self.num_generations = num_generations
        self.num_evaluations = num_evaluations
        self.debug = debug

In [4]:
Config = config("simple_gp",
        "word2vec",
        10,
        100,
        "cx_random",
        0.9,
        0.1,
        100,
        1000,
        False)

In [6]:
data, embeddings, embedding_model = get_embeddings(
        Config.embedding_type, Config.dimension
    )

cx_method = gp.get_cx_num(Config.crossover_method)

# Initialize instance weights
data["weights"] = 1.0 / len(data)
data["weights_update"] = 1.0 / len(data)

boosting_interval = 10  # Boosting interval

ensemble = []  # Ensemble to store the best individuals

gpab = gp.GP(
    Config.algorithm,
    Config.embedding_type,
    Config.dimension,
    Config.population_size,
    cx_method,
    Config.cross_prob,
    Config.mut_prob,
    Config.num_generations,
    Config.num_evaluations,
    data,
    embeddings,
)
gpab.initialize_pop()



In [122]:
def get_X(trees):
        X_list = np.array([np.array([trees.embeddings[char] for char in words]) for words in trees.inputword])
        return X_list

def get_y(trees):
    y_true_list = np.array(np.array([trees.embeddings[char] for char in trees.realword]))
    return y_true_list

In [128]:
from sklearn.base import BaseEstimator, RegressorMixin
import copy
from deap import tools

class EvolvingTreeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, trees):
        self.trees = trees

    def fit(self, X, y):
        # Assuming `trees` is an object that contains your population (pop),
        # toolbox, and other necessary components for evolution.

        # Your evolution logic here
        candidates = tools.selRandom(self.trees.pop, 3)
        sorted_candidates = sorted(candidates, key=lambda x: x.fitness.values)  # Small to large

        parent1, parent2 = copy.deepcopy(candidates[0]), copy.deepcopy(candidates[1])
        offspring = self.trees.toolbox.crossover(parent1, parent2)

        offspring = self.trees.toolbox.mutate(offspring[0])

        offspring[0].fitness.values = self.trees.toolbox.evaluate(offspring[0], self.trees.realword)
        if offspring[0].fitness.values > sorted_candidates[2].fitness.values:
            idx = self.trees.pop.index(sorted_candidates[2])
            self.trees.pop[idx] = offspring[0]

        # Note: You need to adapt this method to work with X and y if they are to be used.
        return self

    def predict(self, X):
        # Implement prediction logic based on evolved trees
        # This is a placeholder; you need to adapt it to your specific case.
        return np.zeros(len(X))

In [136]:
# Assuming `trees` is your evolutionary algorithm setup
evolving_tree_regressor = EvolvingTreeRegressor(gpab)

# Create AdaBoostRegressor with the custom base estimator
regr = GPABRegressor(evolving_tree_regressor, n_estimators=gpab.pop_size)

# Fit AdaBoostRegressor
regr.fit(X, y)

AttributeError: 'NoneType' object has no attribute 'sum'

## Adaboost Regressor

In [135]:
class GPABRegressor():
    """
    Parameters
    ----------
    estimator: deap.gp object
        The base estimator from which the boosted ensemble is built.

    n_estimators: int
        The number of estimators to train, a.k.a. the number of population of GP trees.

    learning_rate: float, default=1.0
        The learning rate of the boosting algorithm.

    loss: {'linear', 'square', 'exponential'}, optional
        The loss function to use when updating the weights after each iteration.

    Attributes
    ----------
    estimators_: estimator
        The base estimator from which the ensemble is grown.

    estimators_: list of regressors
        The collection of fitted sub-estimators.

    estimator_weights_: array-like of shape (n_estimators,)
        Weights for each estimator in the boosted ensemble.

    """
    def __init__(self, estimator, n_estimators, learning_rate=1, loss="linear"):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.loss = loss


    def fit(self, X, y, sample_weight=None):
        """
        Build a boosted ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X: array
            The training input samples (length = 5)

        y: array
            The target values (real numbers)

        sample_weight: array
            The sample weights. If None, the sample weights are initialized to 1 / n_samples.

        Returns
        -------
        self: object
            Fitted estimator.
        """
        sample_weight /= sample_weight.sum()
        print(f"intial sample weight: {sample_weight}")
        # Clear any previous fit
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
        epsilon = np.finfo(sample_weight.dtype).eps

        zero_weight_mask = sample_weight == 0
        for iboost in range(self.n_estimators):
            # Avoid extremely small sample weight
            sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None)
            sample_weight[zero_weight_mask] = 0.0

            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(iboost, X, y, sample_weight)

            # Early stopping
            if sample_weight is None:
                break
            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)
            if not np.isfinite(sample_weight_sum):
                warnings.warn(
                    (
                        "Sample weights have reached infinite values,"
                        f" at iteration {iboost}, causing overflow. "
                        "Iterations stopped. Try lowering the learning rate."
                    ),
                    stacklevel=2,
                )
                break

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize the sample weights
                sample_weight /= sample_weight_sum

        return self

    def _boost(self, iboost, X, y, sample_weight):
        """
        Implement a single boost iteration.

        Perform a single boost according to the AdaBoost.R2 algorithm and return the updated sample weights.

        Parameters
        ----------
        iboost: int
            The current boosting iteration.

        X: array
            The training input samples (length = 5)

        y: array
            The target values (real numbers)

        sample_weight: array
            The current sample weights.

        Returns
        -------
        sample_weight: array
            The updated sample weights.

        estimator_weight: float
            The weight of the estimator.

        estimator_error: float
            The error of the estimator.
        """
        estimator = self.estimator

        # Weighted sampling of the training data with replacement
        boostrap_idx = np.random.choice(size=len(X), replace=True, p=sample_weight)

        # Fit on the bootstrapped sample and obtain the predictions
        estimator.fit(X, y)
        y_pred = estimator.predict(X)

        error_vect = np.abs(y_pred - y_true)
        sample_mask = sample_weight > 0
        masked_sample_weight = sample_weight[sample_mask]
        masked_error_vector = error_vect[sample_mask]

        error_max = np.max(error_vect[sample_mask])
        if error_max != 0:
            masked_error_vector /= error_max

        if self.loss == "square":
            masked_error_vector **= 2
        elif self.loss == "exponential":
            masked_error_vector = 1.0 - np.exp(-masked_error_vector)

        # Calculate the average loss
        estimator_error = (masked_sample_weight * masked_error_vector).sum()

        if estimator_error <= 0:
            # Stop if fit is perfect
            return sample_weight, 1.0, 0.0
        elif estimator_error >= 0.5:
            # Discard current estimator only if it isn't the only one
            if len(self.estimators_) > 1:
                self.estimators_.pop(-1)
            return None, None, None

        beta = estimator_error / (1.0 - estimator_error)

        # Boost weight using AdaBoost.R2 algo
        estimator_weight = self.learning_rate * np.log(1.0 / beta)

        if not iboost == self.n_estimators - 1:
            # Update the sample weights
            sample_weight[sample_mask] *= np.power(beta, 1.0 - masked_error_vector * self.learning_rate)

    def _get_median_predict(self, X, limit):
        # Evaluate predictions of all estimators (ensemble)
        predictions = np.array([estimator.predict(X) for estimator in self.estimators_[:limit]])

        # Sort the predictions
        sorted_idx = np.argsort(predictions, axis=1)

        # Find index of median prediction for each sample
        weight_cdf = np.cumsum(self.estimator_weights_[:limit], axis=1)
        median_or_above = weight_cdf >= 0.5
        median_idx = np.argmax(median_or_above, axis=1)

        median_estimators = sorted_idx[np.arange(len(X), median_idx)]

        # Return the median prediction
        return predictions[np.arange(len(X)), median_estimators]

    def predict(self, X):
        """
        Predict the target values.

        Parameters
        ----------
        X: array
            The input samples.

        Returns
        -------
        y_pred: array
            The predicted target values.
        """
        # Get the median prediction
        return self._get_median_predict(X, len(self.estimators_))



In [126]:
regr_1 = AdaboostRegressor(estimator=gpab, n_estimators=100, learning_rate=1.0, loss="linear")

In [127]:
X = get_X(gpab)
y = get_y(gpab)
regr_1.fit(X, y)

AttributeError: 'NoneType' object has no attribute 'sum'

In [21]:
gpab.inputword

69542                [teary, adler, tells, of, family]
142293    [ethiopia, tigray, refugees, sudan, eritrea]
188137           [vic, corruption, fighter, tells, of]
123849                [fowler, fury, set, for, crisis]
18781                 [love, pleads, guilty, to, drug]
                              ...                     
166473      [tendulkar, confident, ahead, of, special]
265261               [russia, to, build, reactors, in]
17271           [hotel, for, former, academy, cinemas]
7032             [voss, out, fletcher, faces, nervous]
52567       [abbott, backs, morrisons, asylum, seeker]
Name: 0, Length: 2669, dtype: object

In [20]:
gpab.inputword.iloc[0]

['teary', 'adler', 'tells', 'of', 'family']

In [39]:
str(one_tree)

'protected_div(square(c), b)'

In [25]:
one_tree = gpab.pop[0]
func = deap_gp.compile(one_tree, gpab.pset)
func

<function <lambda>(a, b, c, d, e)>

In [49]:
one_sentence = gpab.inputword.iloc[0]
one_sentence

['teary', 'adler', 'tells', 'of', 'family']

In [118]:
X = np.array([gpab.embeddings[char] for char in one_sentence])
X.shape

(5, 10)

In [69]:
for words in gpab.inputword:
    X = np.array([gpab.embeddings[char] for char in words])
    y_pred = func(*X)
y_pred

array([-4.3991321e-01, -1.2687312e-02,  3.8116419e-01,  4.7464702e-02,
       -6.3511804e-02, -3.0491473e-03,  6.4744306e-04,  1.7864481e+00,
       -4.4006062e-01,  2.3497958e-03], dtype=float32)

In [73]:
y_pred_list = np.array([func(*np.array([gpab.embeddings[char] for char in words])) for words in gpab.inputword])
y_pred_list.shape

(2669, 10)

In [79]:
len([gpab.embeddings[char] for char in gpab.realword])

2669

In [81]:
y_true_list = np.array(np.array([gpab.embeddings[char] for char in gpab.realword]))
y_true_list.shape

(2669, 10)

In [100]:
len(gpab.inputword)

2669

In [115]:
def update_weight(trees):
    differences = np.zeros((len(trees.inputword)))
    num_data = 0
    for idx, tree in enumerate(trees.pop):
        func = deap_gp.compile(tree, trees.pset)
        # print(f"tree: {tree}")

        y_preds_for_one_tree = np.array([func(*np.array([trees.embeddings[char] for char in words])) for words in trees.inputword])

        y_trues_for_one_tree = np.array(np.array([trees.embeddings[char] for char in trees.realword]))

        # Calculate the difference
        differences_for_one_tree = np.linalg.norm((y_preds_for_one_tree - y_trues_for_one_tree), axis=1)
        # print(f"differences_for_one_tree: {differences_for_one_tree.shape}")

        differences[num_data] = differences_for_one_tree[0]

        num_data += 1

    print(f"differences: {differences.shape}")

    # Find the supremum (maximum) of these differences
    D = np.max(differences)
    print(f"D: {D}")

    L_1 = differences / D
    L_2 = np.square(differences) / np.square(D)
    L_3 = 1 - np.exp(-differences / D)
    print(f"L_1: {L_1.shape}, L_2: {L_2.shape}, L_3: {L_3.shape}")
    L = np.mean(np.stack((L_1, L_2, L_3), axis=0), axis=0)
    print(f"L: {L.shape}")

    return L

result_L = update_weight(gpab)
result_L

differences: (2669,)
D: 11136956.0
L_1: (2669,), L_2: (2669,), L_3: (2669,)
L: (2669,)


array([1.04124756e-05, 1.67119513e-05, 1.08809853e-06, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [117]:
beta = result_L / (1 - result_L)
beta.shape

(2669,)

In [None]:
data["weights_update"] = data["weights"] * np.exp(beta * )