In [52]:
import random
import operator
import copy
import pprint
import numpy as np
import pandas as pd
# import gp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
import deap.gp as gp
from deap import creator, base, tools, algorithms
from deap.gp import cxOnePoint as cx_simple
from deap.gp import PrimitiveSet
from data import get_embeddings

#### Set parameters

In [53]:
WORD2VEC = "word2vec"
GLOVE = "glove"
FASTTEXT = "fasttext"
CX_RANDOM = 0
CX_SIMPLE = 1
CX_UNIFORM = 2
CX_FAIR = 3
CX_ONEPOINT = 4

#### Fuctions

In [54]:
# Arithmetic operators
def protected_div(x, y):
    mask = y == 0
    safe_y = np.where(mask, 1, y)
    return np.where(mask, 1, x / safe_y)


def protected_sqrt(x):
    sign = np.sign(x)
    x = np.abs(x)
    return np.sqrt(x) * sign

In [55]:
# Crossover method
def subtree_height(tree, index):
        """
        Calculate the height of the subtree starting at the given index.
        """
        def height(node_index):
            node = tree[node_index]
            if node.arity == 0:  # Leaf node
                return 1
            else:
                return 1 + max(
                    height(child_index)
                    for child_index in range(
                        node_index + 1, node_index + 1 + node.arity
                    )
                )

        return height(index)

def searchSubtree_idx(tree, begin):
        end = begin + 1
        total = tree[begin].arity
        while total > 0:
            total += tree[end].arity - 1
            end += 1
        return begin, end

def cx_uniform(ind1, ind2):
        # No crossover on single node tree
        if (len(ind1) < 2 or len(ind2) < 2):
            return ind1, ind2

        child = type(ind1)([])
        parents = [ind1, ind2]
        flag0, flag1 = 0, 0
        left_0 = parents[0].searchSubtree(1)
        left_1 = parents[1].searchSubtree(1)
        b0, e0 = searchSubtree_idx(parents[0], 1)
        b1, e1 = searchSubtree_idx(parents[1], 1)

        if e0 + 1 < len(parents[0]):
            right_0 = parents[0].searchSubtree(e0 + 1)
            flag0 = 1
        if e1 + 1 < len(parents[1]):
            right_1 = parents[1].searchSubtree(e1 + 1)
            flag1 = 1
        left = [left_0, left_1]
        if flag0 == 1 and flag1 == 1:
            right = [right_0, right_1]
            r_arity = 0
            if parents[0][e0 + 1].arity == parents[1][e1 + 1].arity:
                r_arity = 1
        r = random.randint(0, 1)  # root
        m = 1 - r
        if len(parents[r]) < len(parents[m]):
            # root = parents[r].root
            if flag1 == 0 or flag0 == 0:
                return parents[r], parents[m]
            parents[m][0] = parents[r].root
            m = r
        if flag0 == 1 and flag1 == 1:
            r1 = random.randint(0, 1)  # 左邊
            if parents[r][1] == parents[r1][1]:
                parents[r][left[r]] = parents[r1][left[r1]]
            if r_arity == 1:
                r2 = random.randint(0, 1)
                parents[r][right[r]] = parents[r2][right[r2]]
        else:
            # print("只有一個子點")
            r1 = random.randint(0, 1)
            parents[r][left[r1]] = parents[r1][left[r1]]
        return parents[r], parents[r]

def cx_fair(ind1, ind2):
    """Size fair crossover for two trees.
    :param ind1: First tree participating in the crossover.
    :param ind2: Second tree participating in the crossover.
    :returns: A tuple of two trees.
    """
    # No crossover on single node tree
    if len(ind1) < 2 or len(ind2) < 2:
        return ind1, ind2

    # List all available primitive types in each individual
    types1 = gp.defaultdict(list)
    types2 = gp.defaultdict(list)
    if ind1.root.ret == gp.__type__:
        # Not STGP optimization
        types1[gp.__type__] = list(range(1, len(ind1)))
        types2[gp.__type__] = list(range(1, len(ind2)))
        common_types = [gp.__type__]
    else:
        for idx, node in enumerate(ind1[1:], 1):
            types1[node.ret].append(idx)
        for idx, node in enumerate(ind2[1:], 1):
            types2[node.ret].append(idx)
        common_types = set(types1.keys()).intersection(set(types2.keys()))

    if len(common_types) > 0:
        type_ = random.choice(list(common_types))

    index1 = random.choice(types1[type_])
    height1 = subtree_height(ind1, index1)
    # height = ind1.height

    while True:
        index2 = random.choice(types2[type_])
        height2 = subtree_height(ind2, index2)
        if height2 <= height1:
            # print(f"height1: {height1}, height2: {height2}")
            break
    slice1 = ind1.searchSubtree(index1)
    slice2 = ind2.searchSubtree(index2)
    ind1[slice1], ind2[slice2] = ind2[slice2], ind1[slice1]
    return ind1, ind2

def traverse_tree(stack, res, parent, idx):
    while res != 0:
        res -= 1
        idx += 1
        stack.append((parent[idx], [], idx))
        res += parent[idx].arity
    # print(f"stack: {stack}")
    return stack, res, idx

def cx_one_point(ind1, ind2):
    idx1 = 0
    idx2 = 0
    # To track the trees
    stack1 = []
    stack2 = []
    # Store the common region
    region1 = []
    region2 = []

    # Start traversing the trees
    while idx1 < len(ind1) and idx2 < len(ind2):
        # Push the nodes to the stack
        stack1.append((ind1[idx1], [], idx1))
        stack2.append((ind2[idx2], [], idx2))

        # Not the same region
        if stack1[-1][0].arity != stack2[-1][0].arity:
            res1 = stack1[-1][0].arity
            res2 = stack2[-1][0].arity
            stack1, res1, idx1 = traverse_tree(stack1, res1, ind1, idx1)
            stack2, res2, idx2 = traverse_tree(stack2, res2, ind2, idx2)
        else:
            region1.append([ind1[idx1], idx1])
            region2.append([ind2[idx2], idx2])

        idx1 += 1
        idx2 += 1

    # for pri, idx in region1:
    #     print(f"{idx}: {pri.name}")

    # Select crossover point
    if len(region1) > 0:
        point = random.randint(0, len(region1) - 1)
        # print(f"crossover point: {point}")
        # print(f"crossover point for trees: {region1[point]}, {region2[point]}")

    # Swap subtrees
    if len(region1) > 0:
        slice1 = ind1.searchSubtree(region1[point][1])
        slice2 = ind2.searchSubtree(region2[point][1])
        ind1[slice1], ind2[slice2] = ind2[slice2], ind1[slice1]

    return ind1, ind2

In [82]:
class GP():
    def __init__(self, embeddings, dimension, population_size, crossover_method, cross_prob, mut_prob, num_generations, dataset):
        self.embeddings = embeddings
        self.dim = dimension
        self.pop_size = population_size
        self.pop = None
        self.cx_method = crossover_method
        self.cx_pb = cross_prob
        self.mut_pb = mut_prob
        self.num_gen = 0
        self.inputword = dataset[0].str.split().apply(lambda x: x[:5])
        self.realword = dataset[0].str.split().str.get(5)
        self.eval_count = 0

    def register(self):
        # Function set
        self.pset = gp.PrimitiveSet("MAIN", 5)
        self.pset.addPrimitive(np.add, 2)
        self.pset.addPrimitive(np.subtract, 2)
        self.pset.addPrimitive(np.multiply, 2)
        self.pset.addPrimitive(protected_div, 2)
        self.pset.addPrimitive(protected_sqrt, 1)
        self.pset.addPrimitive(np.square, 1)
        # Terminal set
        count = 0
        self.pset.renameArguments(ARG0="a", ARG1="b", ARG2="c", ARG3="d", ARG4="e")
        # for line in self.inputword:
            # for word in line:
                # if count < 5:
                    # print(f"conut: {count}")
                    # print(f"embedding: {embeddings[word]}, {type(embeddings[word])}")
                    # count += 1
                # self.pset.addTerminal(word)

        # Initialize the individual
        if "Individual" not in creator.__dict__:
            creator.create("FitnessMax", base.Fitness, weights=(1,))
            creator.create(
                "Individual", gp.PrimitiveTree, fitness=creator.FitnessMax, pset=self.pset
            )

        # Initialize the toolbox
        self.toolbox = base.Toolbox()
        self.toolbox.register("expr", gp.genHalfAndHalf, pset=self.pset, min_=1, max_=5)
        self.toolbox.register(
            "individual", tools.initIterate, creator.Individual, self.toolbox.expr
        )
        self.toolbox.register(
            "population",
            tools.initRepeat,
            list,
            self.toolbox.individual,
            n=self.pop_size,
        )

        # Register the operators
        # self.toolbox.register("select_candidate", tools.selRandom, tournsize=3)
        self.toolbox.register("crossover", self.crossover)
        self.toolbox.register(
            "mutate", gp.mutUniform, expr=self.toolbox.expr, pset=self.pset
        )
        self.toolbox.decorate(
            "mutate", gp.staticLimit(operator.attrgetter("height"), max_value=5)
        )
        self.toolbox.register("evaluate", self.evaluate)

        # Register the record for analyzing
        self.stats = tools.Statistics(key=lambda ind: ind.fitness.values)
        self.stats.register("avg", np.mean)
        self.stats.register("std", np.std)
        self.stats.register("min", np.min)
        self.stats.register("max", np.max)
        self.hof = tools.HallOfFame(10)  # hall of fame size

    def clean_data(self, data):
        data = np.where(np.isinf(data), np.finfo(np.float32).max, data)
        data = np.nan_to_num(data, nan=0.0)
        return data

    def evaluate(self, individual, input_word):
        """Evalute the fitness of an individual"""
        # print(f"individual種類:{type(individual)}")
        func = gp.compile(individual, self.pset)
        total_similarity = 0.0
        for data_index in range(len(input_word)):
            words = self.inputword.iloc[data_index]
            in_vectors = [self.embeddings[word] for word in words]
            a, b, c, d, e = in_vectors[:5]

            y = self.realword.iloc[data_index]
            out_vector = self.embeddings[y]

            predict = self.clean_data(func(a, b, c, d, e))
            similarity = cosine_similarity([predict], [out_vector])[0][0]
            total_similarity += similarity

        fitness = total_similarity / len(self.inputword)
        ftiness = self.clean_data(fitness)
        self.eval_count += 1
        return (fitness,)

    def crossover(self, ind1, ind2):
        parents = [ind1, ind2]
        if random.uniform(0, 1) < self.cx_pb:
            choice = -1
            if self.cx_method == CX_RANDOM:
                choice = random.randint(1, 4)
                print(f"choice: {choice}")
            if choice == CX_SIMPLE or self.cx_method == CX_SIMPLE:
                ind1, ind2 = cx_simple(ind1, ind2)
            elif choice == CX_UNIFORM or self.cx_method == CX_UNIFORM:
                ind1, ind2 = cx_uniform(ind1, ind2)
            elif choice == CX_ONEPOINT or self.cx_method == CX_FAIR:
                ind1, ind2 = cx_fair(ind1, ind2)
            else:  # self.cx_method == 4:
                ind1, ind2 = cx_one_point(ind1, ind2)
        # return ind1, ind2

        fitness_ind1 = self.toolbox.evaluate(ind1, self.realword)
        # ?:
        # if self.cx_method == 2:
        #     parents.remove(b)
        #     return parents
        fitness_ind2 = self.toolbox.evaluate(ind2, self.realword)
        if fitness_ind1 <= fitness_ind2:
            parents.remove(ind1)
            return parents
        else:
            parents.remove(ind2)
            return parents

    def mutate(self, offspring):
        if random.uniform(0, 1) < self.mut_pb:
            self.toolbox.mutate(offspring[0])
            offspring[0].fitness.values = self.toolbox.evaluate(offspring[0], self.realword)
        return offspring

## Evolutionary Forest

### Setup

In [113]:
population_size = s = 5
m = 500
dim = 10
crossover_method = CX_SIMPLE
cross_prob = 0.5
mut_prob = 0.1
num_generations = 100
num_evaluations = 1000
embedding = WORD2VEC

# Number of archive (forest size)
num_archive = 200

# Record all individuals
population_history = []


In [None]:
dataset, embeddings, word2vec_model = get_embeddings(WORD2VEC, 10, 1)


### Functions

In [58]:
def add_trees_to_history(individual, history):
    """Add the individual to the history"""
    history.extend(tree for tree in individual.pop)
    return history

In [109]:
# Initialize the population and record trees for each individuals

if len(population) % s == 0:
    seb_population = len(population) // s
elif len(population) % s == 1:
    seb_population = len(population) // s + 1
...


def initialize_population(s, m):
    """Initialize the population"""
    population = []
    for i in range(s):
        sub_dataset = dataset.sample(int(len(dataset) / (s / 2)))
        # Set up GP tree
        trees = GP(embeddings, dim, population_size, crossover_method, cross_prob, mut_prob, num_generations, sub_dataset)
        trees.register()
        trees.pop = trees.toolbox.population(n=m)
        # Evaluate the entire population
        fitnesses = map(trees.toolbox.evaluate, trees.pop, trees.inputword)
        for ind, fit in zip(trees.pop, fitnesses):
            ind.fitness.values = fit
        population.append(trees)
        # add_trees_to_history(population[i], population_history)
    return population


In [60]:
'''# Initialize the population and record trees for each individuals
def initialize_population(s, m):
    """Initialize the population"""
    population = []
    for i in range(s):
        # Set up GP tree
        trees = GP(embeddings, dim, population_size, crossover_method, cross_prob, mut_prob, num_generations, dataset)
        trees.register()
        trees.pop = trees.toolbox.population(n=m)
        # Evaluate the entire population
        fitnesses = map(trees.toolbox.evaluate, trees.pop, trees.inputword)
        for ind, fit in zip(trees.pop, fitnesses):
            ind.fitness.values = fit
        population.append(trees)
        # add_trees_to_history(population[i], population_history)
    return population'''

'# Initialize the population and record trees for each individuals\ndef initialize_population(s, m):\n    """Initialize the population"""\n    population = []\n    for i in range(s):\n        # Set up GP tree\n        trees = GP(embeddings, dim, population_size, crossover_method, cross_prob, mut_prob, num_generations, dataset)\n        trees.register()\n        trees.pop = trees.toolbox.population(n=m)\n        # Evaluate the entire population\n        fitnesses = map(trees.toolbox.evaluate, trees.pop, trees.inputword)\n        for ind, fit in zip(trees.pop, fitnesses):\n            ind.fitness.values = fit\n        population.append(trees)\n        # add_trees_to_history(population[i], population_history)\n    return population'

In [76]:
def select_p(individual):
    """Select the parent"""
    # Parent selection
    candidates = tools.selRandom(individual.pop, 3)
    sorted_candidates = sorted(candidates, key=lambda x: x.fitness.values)  # Small to large
    # print(f"sorted_candidates: {[ind.fitness.values for ind in sorted_candidates]}")

    return sorted_candidates

In [77]:
def crossover_for_one_ind(individual, candidates, population_history):
    parent1, parent2 = copy.deepcopy(candidates[0]), copy.deepcopy(candidates[1])
    offspring = individual.toolbox.crossover(parent1, parent2)


    # End if the offspring is not repeated
    # is_new_ind = False
    # while (not is_new_ind):
    #     offspring = individual.toolbox.crossover(parent1, parent2)
    #     # print(f"offspring: {offspring[0]}")
    #     if not (offspring[0] in population_history):
    #         is_new_ind = True
    # population_history.append(offspring[0])
    return offspring

In [78]:
def mutate_for_one_ind(offspring, individual, population_history):
    # print(f"offspring[0] in mutation: {offspring[0]}")
    offspring = individual.toolbox.mutate(offspring[0])

    # End if the offspring is not repeated
    # is_new_ind = False
    # while (not is_new_ind):
    #     offspring = individual.toolbox.mutate(offspring[0])
    #     if not (offspring[0] in population_history):
    #         is_new_ind = True
    # population_history.append(offspring[0])
    return offspring

In [79]:
def select_s(individual, sorted_candidates, offspring):
    # Evaluate new fitness value
    offspring[0].fitness.values = individual.toolbox.evaluate(offspring[0], individual.inputword)

    if offspring[0].fitness.values > sorted_candidates[2].fitness.values:
        idx = individual.pop.index(sorted_candidates[2])
        individual.pop[idx] = offspring[0]

In [65]:
'''def validate(individual):

    # Concatenate these five loss values into a fitness vector
    cur_fitness = []
    for ind in individual.pop:
        cur_fitness.append(ind.fitness.values[0])
        # print(ind.fitness.values)
    return cur_fitness'''

'def validate(individual):\n\n    # Concatenate these five loss values into a fitness vector\n    cur_fitness = []\n    for ind in individual.pop:\n        cur_fitness.append(ind.fitness.values[0])\n        # print(ind.fitness.values)\n    return cur_fitness'

In [111]:

# Evolute groups of population
def evolve_group(population):
    print(f"Start evolve groups")

    for i in range(s):
        while population[i].num_gen < num_generations:
            if population[i].num_gen % 20 == 0:
                print(f"population[{i}].num_gen: {population[i].num_gen}")
            # Parent selection
            candidates = select_p(population[i])
            # Crossover the two trees
            # print(f"Start crossover")
            offspring = crossover_for_one_ind(population[i], candidates, population_history)
            # print(f"offspring after crossover: {offspring[0]}")

            # Mutation
            # print(f"Start mutation")
            offspring = mutate_for_one_ind(offspring, population[i], population_history)
            # print(f"offspring after mutation: {offspring[0]}")

            # Survivor selection
            select_s(population[i], candidates, offspring)
            # print(f"Finish survivor selection")

            if population[i].eval_count % 50 == 0:
                print(f"population[{i}].eval_count: {population[i].eval_count}")
                record = population[i].stats.compile(population[i].pop)
                population[i].hof.update(population[i].pop)
                print(f"record: {record}")
            population[i].num_gen += 1

    return population


In [67]:
'''# Fivefold cross-validation
def cross_validation(population, dataset):
    print(f"Start cross-validation")
    fold_size = 5
    kf = KFold(n_splits=fold_size, shuffle=True)

    # Fitness vector
    fit_vec = np.zeros((s, fold_size))
    theta_j = 0
    count = 0

    for train_index, valid_index in kf.split(dataset):
        print(f"Fold: {count}, TRAIN: {len(train_index)}, TEST: {len(valid_index)}")
        train, valid = dataset.iloc[train_index], dataset.iloc[valid_index]
        # For four folds: train the DT model
        for i in range(s):
            # print(f"=== Individual {i} is training ===")
            population[i].inputword = train[0].str.split().apply(lambda x: x[:5])
            population[i].realword = train[0].str.split().str.get(5)

            while population[i].eval_count < num_evaluations:
                # Parent selection
                candidates = select_p(population[i])
                # Crossover the two trees
                # print(f"Start crossover")
                offspring = crossover_for_one_ind(population[i], candidates, population_history)
                # print(f"offspring after crossover: {offspring[0]}")

                # Mutation
                # print(f"Start mutation")
                offspring = mutate_for_one_ind(offspring, population[i], population_history)
                # print(f"offspring after mutation: {offspring[0]}")

                # Survivor selection
                select_s(population[i], candidates, offspring)
                # print(f"Finish survivor selection")

                if population[i].eval_count % 20 == 0:
                    print(f"population[{i}].eval_count: {population[i].eval_count}")
                    record = population[i].stats.compile(population[i].pop)
                    population[i].hof.update(population[i].pop)
                    print(f"record: {record}")

        # For the remaining fold: validate the model
            population[i].inputword = valid[0].str.split().apply(lambda x: x[:5])
            population[i].realword = valid[0].str.split().str.get(5)

            # Concatenate these five loss values into a fitness vector
            fitnesses = map(population[0].toolbox.evaluate, population[0].pop, population[0].inputword)
            for ind, fit in zip(population[0].pop, fitnesses):
                ind.fitness.values = fit
            cur_fitness = validate(population[i])
            fit_vec[count] = np.array(cur_fitness)
        count += 1
    return fit_vec
'''

'# Fivefold cross-validation\ndef cross_validation(population, dataset):\n    print(f"Start cross-validation")\n    fold_size = 5\n    kf = KFold(n_splits=fold_size, shuffle=True)\n\n    # Fitness vector\n    fit_vec = np.zeros((s, fold_size))\n    theta_j = 0\n    count = 0\n\n    for train_index, valid_index in kf.split(dataset):\n        print(f"Fold: {count}, TRAIN: {len(train_index)}, TEST: {len(valid_index)}")\n        train, valid = dataset.iloc[train_index], dataset.iloc[valid_index]\n        # For four folds: train the DT model\n        for i in range(s):\n            # print(f"=== Individual {i} is training ===")\n            population[i].inputword = train[0].str.split().apply(lambda x: x[:5])\n            population[i].realword = train[0].str.split().str.get(5)\n\n            while population[i].eval_count < num_evaluations:\n                # Parent selection\n                candidates = select_p(population[i])\n                # Crossover the two trees\n                #

In [68]:
'''def compute_threshold(fit_vec):
    print(f"Number of computing individuals: {len(fit_vec)}")
    j = np.random.randint(0, m)  # Randomly choose an index j
    # Threshold theta = min(L_i[j]) + median(abs(L_i[j] + median(L_k[j]))
    # Calculate the median absolute deviation (MAD)
    fitness_j = [ind[j] for ind in fit_vec]
    median_j = np.median(fitness_j)
    mad_j = np.median([abs(f - median_j) for f in fitness_j])
    theta_j = np.min(fitness_j) + mad_j  # Calculate the threshold theta

    return j, theta_j'''

'def compute_threshold(fit_vec):\n    print(f"Number of computing individuals: {len(fit_vec)}")\n    j = np.random.randint(0, m)  # Randomly choose an index j\n    # Threshold theta = min(L_i[j]) + median(abs(L_i[j] + median(L_k[j]))\n    # Calculate the median absolute deviation (MAD)\n    fitness_j = [ind[j] for ind in fit_vec]\n    median_j = np.median(fitness_j)\n    mad_j = np.median([abs(f - median_j) for f in fitness_j])\n    theta_j = np.min(fitness_j) + mad_j  # Calculate the threshold theta\n\n    return j, theta_j'

In [69]:
'''# Select the best individual: Automatic lexicase selection
def lexicase_selection(fit_vec, s, m):
    indices = np.arange(len(fit_vec))
    select_ind = None

    find_best_individual_count = 0
    while (len(indices) > 1):
        print(f"find_best_individual_count: {find_best_individual_count}")
        # print(f"indices: {len(indices)}, {indices}")
        j, theta_j = compute_threshold(fit_vec[indices])

        # For each individual
        for i in range(len(indices)):
            # j-th fitness element > theta
            # print(f"fit_vec[{i}][{j}]: {fit_vec[i][j]}, theta_j: {theta_j}")
            if fit_vec[i][j] > theta_j:
                # preserve the individual
                pass
            else:
                indices = np.delete(indices, np.where(indices == i))
            # print(f"{i}: indices: {len(indices)}, {indices}")

            # Case 1: > 1, then repeat the filtering process
            if len(indices) > 1:
                continue
            # Case 2: == 1, then select the individual
            elif len(indices) == 1:
                select_ind = indices[0]
            # Case 3: == 0, then randomly select an individual
            else:
                select_ind = np.random.randint(0, s)

    return select_ind
'''

'# Select the best individual: Automatic lexicase selection\ndef lexicase_selection(fit_vec, s, m):\n    indices = np.arange(len(fit_vec))\n    select_ind = None\n\n    find_best_individual_count = 0\n    while (len(indices) > 1):\n        print(f"find_best_individual_count: {find_best_individual_count}")\n        # print(f"indices: {len(indices)}, {indices}")\n        j, theta_j = compute_threshold(fit_vec[indices])\n\n        # For each individual\n        for i in range(len(indices)):\n            # j-th fitness element > theta\n            # print(f"fit_vec[{i}][{j}]: {fit_vec[i][j]}, theta_j: {theta_j}")\n            if fit_vec[i][j] > theta_j:\n                # preserve the individual\n                pass\n            else:\n                indices = np.delete(indices, np.where(indices == i))\n            # print(f"{i}: indices: {len(indices)}, {indices}")\n\n            # Case 1: > 1, then repeat the filtering process\n            if len(indices) > 1:\n                continue

In [70]:
'''def add_to_archive(archive, num_archive, individual, avg_fitness_value):
    # If the number of selected individuals < remain num_archive
    if len(archive) < num_archive:
        archive.append((individual, avg_fitness_value))  # Add to the archive
        archive = sorted(archive, key=lambda ind: (ind[1]))
    # replace the worst individual in the archive with the best individual
    else:
        # Find the minimum fitness value; fitness value = average of the fitness vector
        archive[0] = (individual, avg_fitness_value) if avg_fitness_value > archive[0][1] else archive[0]
    return archive'''

'def add_to_archive(archive, num_archive, individual, avg_fitness_value):\n    # If the number of selected individuals < remain num_archive\n    if len(archive) < num_archive:\n        archive.append((individual, avg_fitness_value))  # Add to the archive\n        archive = sorted(archive, key=lambda ind: (ind[1]))\n    # replace the worst individual in the archive with the best individual\n    else:\n        # Find the minimum fitness value; fitness value = average of the fitness vector\n        archive[0] = (individual, avg_fitness_value) if avg_fitness_value > archive[0][1] else archive[0]\n    return archive'

### Evolution

In [112]:
archive = []

# Initialize the population
population = initialize_population(s=5, m=500)

# Evolutionary process for each group
population = evolve_group(population)

# Select the best num_archive individuals
population_merged = [ind for trees in population for ind in trees.pop]


Start evolve groups
population[0].num_gen: 0
population[0].num_gen: 20
population[0].num_gen: 40
population[0].eval_count: 650
record: {'avg': 0.011133004053323601, 'std': 0.05083545969342669, 'min': -0.0023590957548647365, 'max': 0.4440107989513805}
population[0].num_gen: 60
population[0].num_gen: 80
population[0].eval_count: 800
record: {'avg': 0.016746608969047408, 'std': 0.062145317340245795, 'min': -0.0023590957548647365, 'max': 0.4440107989513805}
population[1].num_gen: 0
population[1].num_gen: 20
population[1].num_gen: 40
population[1].eval_count: 650
record: {'avg': 0.0076799282420676514, 'std': 0.04406729436271467, 'min': -0.002025596511174053, 'max': 0.4710164433624508}
population[1].num_gen: 60
population[1].num_gen: 80
population[1].eval_count: 800
record: {'avg': 0.016983996790577507, 'std': 0.06750367542227506, 'min': -0.002025596511174053, 'max': 0.4710164433624508}
population[2].num_gen: 0
population[2].num_gen: 20
population[2].num_gen: 40
population[2].eval_count: 650



population[4].num_gen: 80
population[4].eval_count: 800
record: {'avg': 0.014124907418025336, 'std': 0.05567434458984742, 'min': -0.0020545445201043894, 'max': 0.41053181442030817}


In [None]:
population_merged[:5]

In [114]:
population_merged = sorted(population_merged, key=lambda x: x.fitness.values, reverse=True)

In [115]:
archive.extend(population_merged[:num_archive])

In [116]:
for ind in archive:
    print(f"{ind.fitness.values[0]}, {str(ind)}")

0.4710164433624508, add(e, subtract(c, subtract(multiply(d, d), add(a, square(d)))))
0.44551955836652907, add(subtract(protected_sqrt(e), subtract(e, c)), add(b, e))
0.4440107989513805, add(protected_sqrt(c), add(b, a))
0.44232580739975974, add(add(b, protected_sqrt(e)), e)
0.43918907105130156, add(add(c, a), protected_sqrt(d))
0.4256158527304334, add(e, c)
0.4253166894683575, add(add(subtract(add(c, c), protected_sqrt(c)), d), a)
0.4176096860808168, add(e, multiply(c, square(protected_div(c, c))))
0.41053181442030817, add(a, d)
0.40930214414334637, protected_sqrt(e)
0.4079763448694389, add(a, c)
0.40783983187736017, protected_div(e, protected_div(d, d))
0.4065900233623527, protected_sqrt(add(add(e, d), multiply(multiply(subtract(a, a), add(c, d)), b)))
0.4001176507548702, protected_sqrt(add(c, a))
0.39985633246673075, add(e, protected_sqrt(e))
0.3897028422621825, add(protected_sqrt(a), c)
0.38780475172427836, protected_sqrt(protected_sqrt(e))
0.3748911334118337, add(protected_sqrt(pro

In [None]:
archive = []  # Archive the best individual
total_eval_count = 0


    # For each individual (DT) (s individuals)
        # For each GP tree (m trees)
            # Randomly initialize the tree
population = initialize_population(s, m)

# Fivefold cross-validation
fit_vec = cross_validation(population, dataset)

# Select the best individual: Automatic lexicase selection
select_ind = lexicase_selection(fit_vec, s, m)

# Archive the best individual
archive = add_to_archive(archive, num_archive, population[select_ind], np.mean(fit_vec[select_ind]))

# Return the archive

total_eval_count = sum([ind.eval_count for ind in population])

# Show current prediction
# 1. Mean all GP trees in the archive
# mean_predictions = []

for population, avg_fitness in archive:
    record = population.stats.compile(population.pop)
    population.hof.update(population.pop)
    print(f"record: {record}")
#     population_predictions = []
#     for ind in population.pop:
#         func = gp.compile(ind, population.pset)
#         a, b, c, d, e = [population.embeddings[word] for word in population.inputword.iloc[1]]
#         predict_vector = func(a, b, c, d, e)
#         population.predictions.append(predict_vector)

#     mean_population_prediction = np.mean(population_predictions, axis=0)
#     mean_predictions.append(mean_population_prediction)
# final_prediction = np.mean(mean_predictions, axis=0)

# # 2. Print current inputword (five words), realword (1 word) and predict word
# print("Input words: ", dataset['inputword'].iloc[:5])
# print("Real word: ", dataset['realword'].iloc[0])
# print("Predicted word: ", final_prediction)

#### Try

In [None]:
population = initialize_population(s, m)

# Fivefold cross-validation
fit_vec = cross_validation(population, dataset)

# Select the best individual: Automatic lexicase selection
select_ind = lexicase_selection(fit_vec, s, m)

In [None]:
for ind in population:
    print(f"ind eval count: {ind.eval_count}")

In [53]:
record = population[0].stats.compile(population[0].pop)
record

{'avg': 0.000557741725970799,
 'std': 0.0009850077197091109,
 'min': -0.00039808267172886417,
 'max': 0.0024288598350371312}

In [27]:
len(population.inputword)

533

In [39]:
archive

[(<__main__.GP at 0x7fd1037d41f0>, 0.05257447638766555)]

In [38]:
predict_vectors = np.zeros((len(population.inputword), m * len(archive)))
predict_vectors.shape

(533, 5)

In [41]:
target_vectors = np.zeros((len(population.inputword), m * len(archive)))
target_vectors.shape

(533, 5)

In [None]:
# Save the prediction and target vectors for each sentence
for population, avg_fitness in archive:
    # print(f"population: {population}")
    for ind in population.pop:
        func = gp.compile(ind, population.pset)
        for data_index in range(len(population.inputword)):
            five_words = population.inputword.iloc[data_index]
            a, b, c, d, e = [population.embeddings[word] for word in five_words]
            predict_vector = func(a, b, c, d, e)
            predict_vectors[data_index] = predict_vector

            y = population.realword.iloc[0]
            target_vector[data_index] = population.embeddings[y]


In [None]:
# Print the prediction and target word

print(f"predict_vectors: {np.mean(predict_vectors, axis=0)}")
predict_word = word2vec_model.wv.similar_by_vector(np.mean(predict_vectors, axis=0), topn=1)


In [36]:
predict_word

[('ulirach', 0.915840208530426)]

In [None]:
similarity = cosine_similarity([np.mean(predict_vectors)], [target_vectors])[0][0]

### Build functions

In [None]:
# For each generation (n_gen generations)
    # For each individual (DT) (s individuals)
        # For each GP tree (m trees)
            # Randomly initialize the tree


    # Fivefold cross-validation
    # Traing dataset: divided into five folds
    # For four folds: train the DT model
    # For the remaining fold: test the model
    # Concatenate these five loss values into a fitness vector

        # Select the best individual: Automatic lexicase selection
        # Fitness vector L: i-th individual's fitness vector
        # While the number of selected individuals == s
            # Randomly choose an index j
                # j = np.random.randint(m)
            # Threshold theta = min(L_i[j]) + median(abs(L_i[j] + median(L_k[j]))
                # Calculate the median absolute deviation (MAD)
                    # fitness_j = [L[j] for L in individauls]
                    # median_j = np.median(fitness_j)
                    # mad_j = np.median([abs(f - median_j) for f in fitness_j])
                # Calculate the threshold theta
                    # theta_j = np.min(fitness_j) + mad_j
            # For each individual
                # j-th fitness element > theta -> preserve the individual
                # Number of selected individuals
                # Case 1: > 1, then repeat the filtering process
                # Case 2: == 1, then select the individual
                # Case 3: == 0, then randomly select an individual
        # Archive the best individual
        # For each selected individual
            # If the number of selected individuals < remain num_archive
                # Add to the archive
            # Else, replace the worst individual in the archive with the best individual
                # Find the minimum fitness value; fitness value = average of the fitness vector
    # Return the archive

#### Initialization

In [None]:
# Record all individuals
population_history = []

# Initialize the population and record trees for each individuals
population = []
for i in range(s):
    # Set up GP tree
    trees = GP(embeddings, dim, population_size, crossover_method, cross_prob, mut_prob, num_generations, dataset)
    trees.register()
    trees.pop = trees.toolbox.population(n=m)
    # Evaluate the entire population
    fitnesses = map(trees.toolbox.evaluate, trees.pop, trees.inputword)
    for ind, fit in zip(trees.pop, fitnesses):
        ind.fitness.values = fit
    population.append(trees)
    add_trees_to_history(population[i], population_history)
# for i in range(s):
#     if i % 3 == 0:
#         population.append([WORD2VEC] +word2vec_setup.toolbox.population(n=m))
#         population_history.extend(tree for tree in population[i])
#     elif i % 3 == 1:
#         population.append([GLOVE] + glove_setup.toolbox.population(n=m))
#         population_history.extend(tree for tree in population[i])
#     else:  # i % 3 == 2
#         population.append([FASTTEXT] + fasttext_setup.toolbox.population(n=m))
#         population_history.extend(tree for tree in population[i])


##### Test

In [None]:
tree = GP(embeddings, dim, population_size, crossover_method, cross_prob, mut_prob, num_generations, dataset)
tree.register()
# print(self.pop_size)
tree.pop = tree.toolbox.population(n=tree.pop_size)
# for ind in tree.pop:
#     print(str(ind))
# Evaluate the entire population
# fitnesses = map(tree.toolbox.evaluate, tree.pop)
# for ind, fit in zip(tree.pop, fitnesses):
#     ind.fitness.values = fit
print(f"tree.pop type: {type(tree.pop)}")


In [None]:
type(tree)

In [209]:
population

[<__main__.GP at 0x7fda7545b1c0>,
 <__main__.GP at 0x7fda742fbfd0>,
 <__main__.GP at 0x7fdaee9dce50>,
 <__main__.GP at 0x7fda7595c0a0>,
 <__main__.GP at 0x7fda73ac35b0>,
 <__main__.GP at 0x7fdaf7449a90>,
 <__main__.GP at 0x7fdabc1d1fd0>,
 <__main__.GP at 0x7fdb246ec520>,
 <__main__.GP at 0x7fdab3c81fa0>,
 <__main__.GP at 0x7fdab3c81940>,
 <__main__.GP at 0x7fda73a76fa0>,
 <__main__.GP at 0x7fdaf72ab6a0>,
 <__main__.GP at 0x7fda750efcd0>,
 <__main__.GP at 0x7fdafffc6310>,
 <__main__.GP at 0x7fdaf74df820>,
 <__main__.GP at 0x7fdb24666df0>,
 <__main__.GP at 0x7fda6e7163d0>,
 <__main__.GP at 0x7fda6e5dea00>,
 <__main__.GP at 0x7fda6e4a6f70>,
 <__main__.GP at 0x7fda6e2f1640>,
 <__main__.GP at 0x7fda6e17ac70>,
 <__main__.GP at 0x7fda6df982b0>,
 <__main__.GP at 0x7fda6df0e8b0>,
 <__main__.GP at 0x7fda6ddd8fa0>,
 <__main__.GP at 0x7fda6dca07f0>,
 <__main__.GP at 0x7fda6dae8df0>,
 <__main__.GP at 0x7fda6d9063d0>,
 <__main__.GP at 0x7fda6d87c880>,
 <__main__.GP at 0x7fda6d745df0>,
 <__main__.GP 

In [210]:
for ind in population_history:
    print(str(ind))

subtract(subtract(subtract(e, c), subtract(b, a)), subtract(protected_div(d, e), protected_sqrt(c)))
multiply(c, subtract(e, e))
protected_sqrt(c)
add(subtract(a, d), e)
add(e, b)
protected_sqrt(d)
square(a)
subtract(a, subtract(square(a), a))
add(a, c)
protected_sqrt(protected_div(square(protected_div(e, c)), square(multiply(c, b))))
multiply(subtract(a, a), multiply(d, add(a, b)))
protected_sqrt(d)
multiply(d, c)
protected_sqrt(a)
square(protected_sqrt(protected_div(square(e), protected_div(c, e))))
square(multiply(e, d))
protected_div(multiply(square(add(multiply(e, a), square(e))), protected_div(square(add(d, c)), multiply(protected_sqrt(e), protected_div(d, c)))), protected_sqrt(square(multiply(subtract(b, c), add(a, a)))))
multiply(b, subtract(a, d))
multiply(square(multiply(add(add(e, d), add(a, b)), add(subtract(e, a), protected_sqrt(e)))), protected_div(add(subtract(protected_sqrt(a), add(c, e)), multiply(subtract(e, b), square(d))), subtract(square(protected_sqrt(e)), square(

In [26]:
type(population[0])

__main__.GP

In [27]:
len(population[0].pop)

5

In [28]:
len((population[0].pop)[0])

11

In [29]:
test2 = []
test2.extend(tree for tree in population[0].pop)
len(test2)

5

#### Parent selection

In [None]:
# Fivefold cross-validation
fold_size = 5
kf = KFold(n_splits=fold_size)

# Fitness vector
fit_vec = np.zeros((s, fold_size))
count = 0

for train_index, valid_index in kf.split(dataset):
    print("TRAIN:", len(train_index), "TEST:", len(valid_index))
    train, valid = dataset.iloc[train_index], dataset.iloc[valid_index]
    for i in range(1):
        population[i].inputword = train[0].str.split().apply(lambda x: x[:5])
        population[i].realword = train[0].str.split().str.get(5)
        # Parent selection
        # parent_num = random.randint(0, s - 1)  # Randomly select another individual
        # tree_num = random.randint(0, m - 1)  # Randomly select a tree from each individual
        candidates = tools.selRandom(population[i].pop, 3)
        sorted_candidates = sorted(candidates, key=lambda x: x.fitness.values)  # Small to large
        print(f"sorted_candidates: {[ind.fitness.values for ind in sorted_candidates]}")


#### Crossover

In [213]:
for i in range(1):
    parent1, parent2 = copy.deepcopy(candidates[0]), copy.deepcopy(candidates[1])

    # End if the offspring is not repeated
    is_new_ind = False
    while (not is_new_ind):
        offspring = population[i].toolbox.crossover(parent1, parent2)
        print(f"offspring: {offspring[0]}")
        if not (offspring[0] in population_history):
            is_new_ind = True
    population_history.append(offspring[0])
    # Evaluate new fitness value
    # offspring[0].fitness.values = population[i].toolbox.evaluate(offspring[0], population[i].inputword)


offspring: add(subtract(a, d), e)
offspring: add(subtract(a, d), e)
offspring: add(subtract(subtract(e, c), subtract(b, a)), e)


##### Test

In [30]:
parent_test1 = copy.deepcopy((population[0].pop)[0])
parent_test2 = copy.deepcopy((population[3].pop)[3])

In [None]:
parent_test2.__str__()

In [None]:
type(parent_test2)

In [None]:
for ind in population_history:
    print(str(ind))

In [None]:
test3 = gp.PrimitiveTree.from_string("add(protected_div(c, c), c)", pset=population[0].pset)
test3

In [77]:
for ind in population_history:
    if str(ind) == str(test3):
        print("found")

In [None]:
test3 in population_history

In [214]:
for node in offspring[0]:
    print(node.name)

add
subtract
subtract
ARG4
ARG2
subtract
ARG1
ARG0
ARG4


In [215]:
offspring[0].__str__()

'add(subtract(subtract(e, c), subtract(b, a)), e)'

In [216]:
len(offspring[0])

9

In [181]:
type(offspring[0])

deap.creator.Individual

In [217]:
offspring[0]

[<deap.gp.Primitive at 0x7fdab3faa3b0>,
 <deap.gp.Primitive at 0x7fdab3faa540>,
 <deap.gp.Primitive at 0x7fdab3faa540>,
 <deap.gp.Terminal at 0x7fda74f95f00>,
 <deap.gp.Terminal at 0x7fda74f95a40>,
 <deap.gp.Primitive at 0x7fdab3faa540>,
 <deap.gp.Terminal at 0x7fda74f95780>,
 <deap.gp.Terminal at 0x7fda74f95d40>,
 <deap.gp.Terminal at 0x7fda74f95f00>]

#### Mutation

In [218]:
# Mutation
# count = 0
for i in range(1):
    # End if the offspring is not repeated
    is_new_ind = False
    while (not is_new_ind):
        # offspring = population[i].toolbox.mutate(offspring)
        offspring = [population[i].toolbox.mutate(individual)[0] for individual in offspring]
        # print(f"count: {count}")
        # count += 1
        print(f"offspring: {offspring[0]}")
        if not (offspring[0] in population_history):
            is_new_ind = True
    population_history.append(offspring[0])

    # Evaluate new fitness value
    # offspring[0].fitness.values = population[i].toolbox.evaluate(offspring[0], population[i].inputword)


offspring: add(subtract(subtract(e, c), subtract(b, add(c, b))), e)
offspring: add(subtract(protected_sqrt(multiply(b, d)), subtract(b, add(c, b))), e)
offspring: add(add(c, d), e)
offspring: add(protected_sqrt(square(d)), e)
offspring: add(add(protected_div(c, a), b), e)
offspring: add(add(protected_div(multiply(b, subtract(a, e)), a), b), e)
offspring: add(add(protected_div(multiply(b, subtract(a, e)), a), b), e)


##### Test

In [219]:
str(offspring[0])

'add(add(protected_div(multiply(b, subtract(a, e)), a), b), e)'

#### Survivor selection

In [232]:
for i in range(1):
    # Evaluate new fitness value
    offspring[0].fitness.values = population[i].toolbox.evaluate(offspring[0], population[i].inputword)

    if offspring[0].fitness.values > sorted_candidates[2].fitness.values:
        idx = population[i].pop.index(sorted_candidates[2])
        population[i].pop[idx] = offspring[0]

##### Test

In [None]:
offspring[0].fitness.values = population[i].toolbox.evaluate(offspring[0], population[i].inputword)

In [None]:
print(f"offspring: {offspring[0].fitness.values}")
print(f"sorted_candidates[2]: {sorted_candidates[2].fitness.values}")
offspring[0].fitness.values > sorted_candidates[2].fitness.values

offspring: (0.21631643317361815,)
sorted_candidates[2]: (0.0005105780727209425,)


True

In [223]:
str(sorted_candidates[2])

'subtract(subtract(subtract(e, c), subtract(b, a)), subtract(protected_div(d, e), protected_sqrt(c)))'

In [224]:
sorted_candidates[2].fitness.values

(0.0005105780727209425,)

In [229]:
idx = population[0].pop.index(sorted_candidates[2])
idx

0

In [230]:
str(population[0].pop[idx])

'subtract(subtract(subtract(e, c), subtract(b, a)), subtract(protected_div(d, e), protected_sqrt(c)))'

In [233]:
str(population[0].pop[idx])

'add(add(protected_div(multiply(b, subtract(a, e)), a), b), e)'

#### Validation

In [234]:
# For the remaining fold: validate the model
count = 0
for i in range(1):
    population[i].inputword = valid[0].str.split().apply(lambda x: x[:5])
    population[i].realword = valid[0].str.split().str.get(5)

    # Concatenate these five loss values into a fitness vector
    cur_fitness = []
    for ind in population[i].pop:
        cur_fitness.append(ind.fitness.values[0])
        print(ind.fitness.values)
    fit_vec[count] = np.array(cur_fitness)

(0.21631643317361815,)
(0.0,)
(0.0007900872193458331,)
(0.00032505390795039164,)
(0.00039713937229500946,)


##### Test

In [235]:
fit_vec[0]

array([0.21631643, 0.        , 0.00079009, 0.00032505, 0.00039714])

In [252]:
fit_vec.shape

(50, 5)

#### Fivefold cross-validation

In [None]:
# Fivefold cross-validation
fold_size = 5
kf = KFold(n_splits=fold_size)

# Fitness vector
fit_vec = np.zeros((s, fold_size))
theta_j = 0
count = 0

for train_index, valid_index in kf.split(dataset):
    print("TRAIN:", len(train_index), "TEST:", len(valid_index))
    train, valid = dataset.iloc[train_index], dataset.iloc[valid_index]
    # For four folds: train the DT model
    for i in range(s):
        population[i].inputword = train[0].str.split().apply(lambda x: x[:5])
        population[i].realword = train[0].str.split().str.get(5)

        # Parent selection
        candidates = select_p(population[i])
        # Crossover the two trees
        offspring = crossover_for_one_ind(population[i], candidates)

        # Mutation
        offspring = mutate_for_one_ind(population[i])

        # Survivor selection
        select_s(population[i], candidates, offspring)

    # For the remaining fold: validate the model
        population[i].inputword = valid[0].str.split().apply(lambda x: x[:5])
        population[i].realword = valid[0].str.split().str.get(5)

        # Concatenate these five loss values into a fitness vector
        cur_fitness = validate(individual)
        fit_vec[count] = np.array(cur_fitness)

        # Select the best individual: Automatic lexicase selection
        theta_j = compute_threshold(fit_vec)

    count += 1



#### Forest selection

##### Test

In [337]:
fit_vec2 = np.zeros((s, m))

In [339]:
indices2 = np.arange(len(fit_vec2))
fit_vec2[indices2].shape

(50, 5)

In [256]:
# Select the best individual: Automatic lexicase selection
    # Fitness vector L: i-th individual's fitness vector
    # While the number of selected individuals == s
        # Randomly choose an index j
            # j = np.random.randint(m)
        # Threshold theta = min(L_i[j]) + medien(abs(L_i[j] + median(L_k[j]))
            # Calculate the median absolute deviation (MAD)
                # fitness_j = [L[j] for L in individauls]
                # median_j = np.median(fitness_j)
                # mad_j = np.median([abs(f - median_j) for f in fitness_j])

                # theta_j = np.min(fitness_j) + mad_j


# Select the best individual: Automatic lexicase selection
j = np.random.randint(0, m)
# Threshold theta = min(L_i[j]) + median(abs(L_i[j] + median(L_k[j]))
# Calculate the median absolute deviation (MAD)
fitness_j = [individual[j] for individual in fit_vec]
median_j = np.median(fitness_j)
mad_j = np.median([abs(f - median_j) for f in fitness_j])
theta_j = np.min(fitness_j) + mad_j  # Calculate the threshold theta


In [292]:

indices = np.arange(s)
select_ind = None
# For each individual
for i in range(s):
    # j-th fitness element > theta
    if fit_vec[i][j] > theta_j:
        # preserve the individual
        pass
    else:
        indices = np.delete(indices, np.where(indices == i))
    # print(f"{i}: indices: {len(indices)}, {indices}")

    # Case 1: > 1, then repeat the filtering process
    if len(indices) > 1:
        continue
    # Case 2: == 1, then select the individual
    elif len(indices) == 1:
        select_ind = indices[0]
    # Case 3: == 0, then randomly select an individual
    else:
        select_ind = np.random.randint(0, s)


In [255]:
len([individual[j] for individual in fit_vec])

50

In [257]:
j

3

In [258]:
len(fitness_j)

50

In [259]:
median_j

0.0

In [260]:
mad_j

0.0

In [261]:
theta_j

0.0

In [265]:
fit_vec[0][j] > theta_j

True

In [266]:
last_ind = population[-1]
last_ind

<__main__.GP at 0x7fda6bb3d3a0>

In [267]:
del population[-1]

In [269]:
population.append(last_ind)

In [270]:
len(population)

50

In [280]:
test4 = np.arange(2)
test4

array([0, 1])

In [281]:
test4 = np.delete(test4, 0)
test4

array([1])

In [282]:
test4[0]

1

In [293]:
select_ind

0

#### Archive

In [None]:
# Archive the best individual
archive = []

#?: For each selected individual

# If the number of selected individuals < remain num_archive
if len(archive) < num_archive:
    archive.append((population[select_ind], np.average(fit_vec[select_ind])))  # Add to the archive
    archive = sorted(archive, key=lambda ind: ind[1])
# replace the worst individual in the archive with the best individual
else:
    # Find the minimum fitness value; fitness value = average of the fitness vector
    archive[0] = (population[select_ind], fit_vec[select_ind]) if fit_vec[select_ind] > archive[0][1] else archive[0]


##### Test

In [322]:
archive = []
archive.append((population[select_ind], np.average(fit_vec[select_ind])))
print(len(archive))
archive

1


[(<__main__.GP at 0x7fda7545b1c0>, 0.04356574273464188)]

In [323]:
archive.append((population[1], np.average(fit_vec[1])))
archive = sorted(archive, key=lambda ind: ind[1])
archive

[(<__main__.GP at 0x7fda742fbfd0>, 0.0),
 (<__main__.GP at 0x7fda7545b1c0>, 0.04356574273464188)]

In [315]:
if fit_vec[select_ind] > archive[0][1]:
    archive[0] = (population[select_ind], fit_vec[select_ind])

[(0.00039713937229500946,), (0.0007900872193458331,), (0.0007900872193458331,), (0.21631643317361815,), (0.21631643317361815,)]
