Raffaele Pane `<raffaele.pane@studenti.polito.it>`  
[`https://github.com/bred91/Computational_Intelligence_2022-2023/tree/main/lab2`](https://github.com/bred91/Computational_Intelligence_2022-2023/tree/main/lab2) 

# Lab 2: Set Covering with an Evolutionary Algorithm

First lab + peer review. List this activity in your final report, it will be part of your exam.

## Task

Given a number $N$ and some lists of integers $P = (L_0, L_1, L_2, ..., L_n)$, 
determine, if possible, $S = (L_{s_0}, L_{s_1}, L_{s_2}, ..., L_{s_n})$
such that each number between $0$ and $N-1$ appears in at least one list

$$\forall n \in [0, N-1] \ \exists i : n \in L_{s_i}$$

and that the total numbers of elements in all $L_{s_i}$ is minimum. 

## Solution

In [1]:
# import
import random
import numpy as np
import logging
import itertools
import matplotlib.pyplot as plt
from collections import namedtuple
import time

In [2]:
logging.getLogger().setLevel(logging.INFO)

### Preprocessing and Problem creation

Call Counter annotation for fitness functions

In [72]:
__CALLS__ = dict()


def CallCounter(fn):
    """Annotation @CallCounter"""
    assert fn.__name__ not in __CALLS__, f"Function '{fn.__name__}' already listed in __CALLS__"
    __CALLS__[fn.__name__] = 0
    logging.debug(f"CallCounter: Counting __CALLS__['{fn.__name__}'] ({fn})")

    def call_count(*args, **kwargs):
        __CALLS__[fn.__name__] += 1
        return fn(*args, **kwargs)

    return call_count

In [73]:
def problem(N, seed=None):
    """Generate a random problem instance"""
    random.seed(seed)
    return [
        list(set(random.randint(0, N - 1) for n in range(random.randint(N // 5, N // 2))))
        for n in range(random.randint(N, N * 5))
    ]

def preprocessing (N):
    '''Preprocessing to get all possible lists and sort them'''
    sorted_all_list = sorted(problem(N, seed=42), key=lambda l: len(l))
    all_lists = list(sorted_all_list for sorted_all_list, _ in itertools.groupby(sorted_all_list)) # remove duplicates
    return all_lists

### GA Functions

In [74]:
Individual = namedtuple("Individual", ["genome", "fitness"])


@CallCounter
def fitness(genome,N):
    '''Fitness function'''
    # The idea is to weight the fitness by a couple made by the coverage of the genoma and the weight of the lists
    cov = set()
    cont = 0
    logging.debug(f"Genome: {genome}")
    for l in genome:
        logging.debug(f"List: {l}")  
        cont += len(l)     
        cov = cov.union(set(l))
        logging.debug(f"cov: {cov}")
    
    logging.debug(f"Cont: {cont}")        
    return (len(cov) , - (cont-N)/N*100) # (coverage, -w)

def tournament(population, tournament_size = 2):
    '''Tournament selection'''
    return max(random.choices(population, k = tournament_size), key= lambda i: i.fitness)


def cross_over(g1, g2, N):
    '''Cross-Over function'''
    # The idea is to create a new genoma that is made by parts (lists) of the parents' genomes
    cut = random.randint(0, N - 1)
    logging.debug(f"Cut: {cut}")
    return tuple(list(g1[:cut]) + list(g2[cut:]))    


def mutation(g, N, all_lists):
    '''Mutation function'''
    # The idea is to sobstitute one gene (a list) with an other from the initial (problem) list (Allele)
    point = random.randint(0, N - 1)
    g = list(g)
    g[point] = random.choice(all_lists)
    return g  

 The idea is to create a popolation of individuals in the form of a list of tuples (genome, fitness). <br>
 Each genome is a list of N lists (genes), 1 is a list of integers and the other N-1 are empty lists.


In [67]:
def initialize_population(N):
    '''Creation of the population'''
    population = list()

    lista = preprocessing(N)

    # list of empty lists
    listo = []
    for _ in range(N-1):
        listo.append(list())
    listo

    for gene in lista:
        genome = listo.copy()
        # add the single gene
        genome.append(gene)
        logging.debug(f"Genome pre: {genome}")
        # let it shuffle
        random.shuffle(genome)
        logging.debug(f"Genome post: {genome}")
        population.append(Individual(tuple(genome), fitness(genome,N)))
    # The idea is to create a popolation of individuals in the form of a list of tuples (genome, fitness)
    # Each genome is a list of N lists (genes), 1 is a list of integers and the other N-1 are empty lists
    
    logging.info(f"init: pop_size={len(population)}; max={max(population, key=lambda i: i.fitness)[1]}")
    return population, lista

In [7]:
def GA_algh(population, all_lists, N, num_GENERATIONS, offspring_SIZE, population_SIZE,  shutdown = False ):
    '''Genetic Algorithm'''
    fitness_log = [(0, i.fitness) for i in population]

    for g in range(num_GENERATIONS):
        
        offspring = list()
        for i in range(offspring_SIZE):
            if random.random() < 0.3:
                p = tournament(population)
                o = mutation(p.genome, N, all_lists)
            else:        
                p1 = tournament(population, 20)
                p2 = tournament(population, 20)
                logging.debug(f"p1: {p1}, p2: {p2}")
                o = cross_over(p1.genome, p2.genome, N)
                logging.debug(f"o: {o}")

            f = fitness(o,N)
            fitness_log.append((g + 1, f))
            offspring.append(Individual(o, f))
            
        population += offspring                     # first sorted by coverage, then by -w
        population = sorted(population, key=lambda t: (t.fitness[0], t.fitness[1]), reverse=True)[:population_SIZE]      

    if not shutdown:    
        logging.info(f"Best solution, genoma: {population[0].genome}")
    logging.info(f"Best solution, fitness: {population[0].fitness}")
    logging.info(f"Best solution, coverage: {len(set().union(*population[0].genome))}")
    logging.info(f"W: {sum(len(_) for _ in population[0].genome)} (bloat = {(sum(len(_) for _ in population[0].genome)-N)/N*100:.0f}%)")

### Execution

In [8]:
last = 0

5 , 10 , 20

In [40]:
for n in [5, 10, 20]:    
    population_SIZE = 5
    offspring_SIZE = 20

    if n in [5, 10]:        
        num_GENERATIONS = 100
    else:
        num_GENERATIONS = 200

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])
    
    start = time.time()    
    GA_algh(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 5
INFO:root:init: pop_size=21; max=(2, 60.0)
INFO:root:Best solution, genoma: [[0, 1], [], [], [3], [2, 4]]
INFO:root:Best solution, fitness: (5, 0.0)
INFO:root:Best solution, coverage: 5
INFO:root:W: 5 (bloat = 0%)
INFO:root:Time: 0.30500292778015137 s
INFO:root:Call count: 2021


INFO:root:N = 10
INFO:root:init: pop_size=49; max=(5, 50.0)
INFO:root:Best solution, genoma: [[4, 5, 6], [], [], [8, 9, 3, 6], [2, 5, 7], [], [0, 1], [], [], []]
INFO:root:Best solution, fitness: (10, -20.0)
INFO:root:Best solution, coverage: 10
INFO:root:W: 12 (bloat = 20%)
INFO:root:Time: 0.30899906158447266 s
INFO:root:Call count: 2049


INFO:root:N = 20
INFO:root:init: pop_size=34; max=(9, 55.00000000000001)
INFO:root:Best solution, genoma: [[], [], [18, 2, 15], [], [3, 5, 6, 7, 8, 11, 12, 14], [], [0, 1, 3, 7, 9, 10, 11, 15], [], [], [], [], [], [], [], [], [], [], [], [4, 5, 8, 13, 15, 16, 17, 19], []]
INFO:root:Best solution, fitness: (20, -35.0)
INFO:root:Best solution, coverage: 20
INF

50 , 100

In [9]:
for n in [50, 100]:    
    population_SIZE = 20
    offspring_SIZE = 20
    num_GENERATIONS = 500    

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 50
INFO:root:init: pop_size=213; max=(22, 56.00000000000001)
INFO:root:Best solution, genoma: [[], [], [], [], [], [34, 2, 4, 5, 6, 39, 40, 43, 45, 14, 15, 48, 47, 18, 27, 31], [], [], [4, 38, 7, 40, 9, 10, 43, 24, 27], [], [], [], [32, 2, 34, 5, 38, 6, 40, 41, 42, 10, 44, 39, 15, 49, 22, 23, 26], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [0, 1, 4, 6, 11, 12, 13, 17, 20, 23, 24, 25, 27, 28, 34, 36, 37, 39, 45, 47, 48], [], [], [], [], [3, 6, 41, 42, 43, 12, 46, 15, 21, 22, 25, 26, 27, 29, 30], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [0, 33, 35, 6, 39, 8, 41, 42, 7, 9, 38, 47, 16, 17, 18, 19]]
INFO:root:Best solution, fitness: (50, -88.0)
INFO:root:Best solution, coverage: 50
INFO:root:W: 94 (bloat = 88%)
INFO:root:Time: 6.457998275756836 s
INFO:root:Call count: 10213


INFO:root:N = 100
INFO:root:init: pop_size=427; max=(43, 56.99999999999999)
INFO:root:Best solution, genoma: [[], [], [], [], [], [], [], [], [0, 5, 6, 10, 14, 15, 17, 

200 , 500 , 1000

In [10]:
for n in [200, 500, 1000]:
    population_SIZE = 20
    offspring_SIZE = 20
    num_GENERATIONS = 500

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE, shutdown = True)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 200
INFO:root:init: pop_size=854; max=(87, 56.49999999999999)
INFO:root:Best solution, fitness: (200, -198.5)
INFO:root:Best solution, coverage: 200
INFO:root:W: 597 (bloat = 198%)
INFO:root:Time: 63.01859164237976 s
INFO:root:Call count: 10854


INFO:root:N = 500
INFO:root:init: pop_size=1809; max=(208, 58.4)
INFO:root:Best solution, fitness: (500, -247.8)
INFO:root:Best solution, coverage: 500
INFO:root:W: 1739 (bloat = 248%)
INFO:root:Time: 271.87761878967285 s
INFO:root:Call count: 11809


INFO:root:N = 1000
INFO:root:init: pop_size=3619; max=(409, 59.099999999999994)
INFO:root:Best solution, fitness: (1000, -310.0)
INFO:root:Best solution, coverage: 1000
INFO:root:W: 4100 (bloat = 310%)
INFO:root:Time: 923.0733354091644 s
INFO:root:Call count: 13619




## Updated Version (2022/11/18)

In [75]:
def mutation_rate(g, N):
    '''Mutation rate function'''
    cov = set()
    for l in g:
        logging.debug(f"List: {l}")   
        cov = cov.union(set(l))
        logging.debug(f"cov: {cov}")

    return len(cov)/N # more exploration in early generations, more exploitation in later ones

def clone_control(offspring, population):
    '''Control of clones'''
    # The idea is to check if the offspring is already present in the population
    # If it is, it is not added to the population
    for o in offspring:
        if o not in population:
            population.append(o)
        else:
            logging.debug(f"Clone: {o}")
    return population  

def tournament_with_a_hole(population, tournament_size = 2):
    '''Tournament selection with fitness hole'''
    if random.random() < 0.3:
        return max(random.choices(population, k = tournament_size), key= lambda i: [len(_) for _ in i.genome])
    else:
        return max(random.choices(population, k = tournament_size), key= lambda i: i.fitness)  


In [76]:
def GA_algh_v2(population, all_lists, N, num_GENERATIONS, offspring_SIZE, population_SIZE,  shutdown = False ):
    '''Genetic Algorithm v2'''
    fitness_log = [(0, i.fitness) for i in population]
    last_best_genome = []
    cont = 0

    while cont < 10:
        for g in range(num_GENERATIONS):
            
            offspring = list()
            for i in range(offspring_SIZE):
                # added a variable mutation rate (more exploration in early generations, more exploitation in later ones)
                if random.random() < mutation_rate(population[0].genome, N):
                    p = tournament_with_a_hole(population)
                    o = mutation(p.genome, N, all_lists)
                else:        
                    p1 = tournament_with_a_hole(population, 20)
                    p2 = tournament_with_a_hole(population, 20)
                    logging.debug(f"p1: {p1}, p2: {p2}")
                    o = cross_over(p1.genome, p2.genome, N)
                    logging.debug(f"o: {o}")

                f = fitness(o,N)
                fitness_log.append((g + 1, f))
                offspring.append(Individual(o, f))

                
            population = clone_control(offspring, population)         # first sorted by coverage, then by -w
            population = sorted(population, key=lambda t: (t.fitness[0], t.fitness[1]), reverse=True)[:population_SIZE] 

            # added an early stopping condition (if the best solution doesn't change for cont generations, it stops)
            if (population[0].fitness[0] == N and population[0].genome == last_best_genome):
                cont += 1
            else:
                last_best_genome = population[0].genome
                cont = 0

    if not shutdown:    
        logging.info(f"Best solution, genoma: {population[0].genome}")
    logging.info(f"Best solution, fitness: {population[0].fitness}")
    logging.info(f"Best solution, coverage: {len(set().union(*population[0].genome))}")
    logging.info(f"W: {sum(len(_) for _ in population[0].genome)} (bloat = {(sum(len(_) for _ in population[0].genome)-N)/N*100:.0f}%)")

In [77]:
last = 0

In [78]:
for n in [5, 10, 20]:    
    population_SIZE = 5
    offspring_SIZE = 20

    if n in [5, 10]:        
        num_GENERATIONS = 100
    else:
        num_GENERATIONS = 200

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])
    
    start = time.time()    
    GA_algh_v2(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 5
INFO:root:init: pop_size=21; max=(2, 60.0)
INFO:root:Best solution, genoma: [[1, 3], [], [], [0], [2, 4]]
INFO:root:Best solution, fitness: (5, 0.0)
INFO:root:Best solution, coverage: 5
INFO:root:W: 5 (bloat = 0%)
INFO:root:Time: 0.1929948329925537 s
INFO:root:Call count: 2021


INFO:root:N = 10
INFO:root:init: pop_size=49; max=(5, 50.0)
INFO:root:Best solution, genoma: [[], [], [], [], [8, 1, 3, 7], [9, 6], [0, 4], [], [], [2, 5]]
INFO:root:Best solution, fitness: (10, 0.0)
INFO:root:Best solution, coverage: 10
INFO:root:W: 10 (bloat = 0%)
INFO:root:Time: 0.24799776077270508 s
INFO:root:Call count: 2049


INFO:root:N = 20
INFO:root:init: pop_size=34; max=(9, 55.00000000000001)
INFO:root:Best solution, genoma: [[], [], [], [], [], [16, 9, 19, 6], [], [], [0, 1, 2, 7], [], [], [], [], [4, 7, 11, 12, 15, 16, 18], [], [0, 3, 5, 8, 9, 10, 13, 14, 17], [18, 2, 15], [], [], []]
INFO:root:Best solution, fitness: (20, -35.0)
INFO:root:Best solution, coverage: 20
INFO:root:W: 27

In [79]:
for n in [50, 100]:    
    population_SIZE = 20
    offspring_SIZE = 20
    num_GENERATIONS = 500 

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh_v2(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 50
INFO:root:init: pop_size=213; max=(22, 56.00000000000001)
INFO:root:Best solution, genoma: [[], [], [], [], [], [], [3, 6, 7, 9, 10, 12, 15, 17, 18, 20, 24, 26, 28, 29, 31, 32, 36, 39, 46, 48], [], [], [], [], [], [0, 33, 34, 3, 2, 4, 41, 42, 11, 46, 48, 31], [], [34, 38, 47, 48, 19, 22, 24, 26], [], [32, 7, 40, 39, 8, 43, 44, 45, 14, 49, 18, 19, 23, 25], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [1, 2, 40, 47, 16, 22, 23, 24, 30], [], [], [], [], [], [35, 41, 13, 46, 14, 49, 21, 25, 26, 27, 28, 29], [], [], [], [], [], [32, 33, 3, 4, 37, 5, 38, 10, 11, 30], [], [], [], [], []]
INFO:root:Best solution, fitness: (50, -70.0)
INFO:root:Best solution, coverage: 50
INFO:root:W: 85 (bloat = 70%)
INFO:root:Time: 9.127003908157349 s
INFO:root:Call count: 10213


INFO:root:N = 100
INFO:root:init: pop_size=427; max=(43, 56.99999999999999)
INFO:root:Best solution, genoma: [[], [], [], [], [], [], [9, 13, 18, 19, 22, 24, 25, 30, 32, 44, 65, 67, 77, 82, 83, 93, 97

In [80]:
for n in [200, 500, 1000]:
    population_SIZE = 20
    offspring_SIZE = 20
    num_GENERATIONS = 500

    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh_v2(population, all_lists, n, num_GENERATIONS, offspring_SIZE, population_SIZE, shutdown = True)
    end = time.time()
    logging.info(f"Time: {end - start} s")
    logging.info(f"Call count: {__CALLS__['fitness'] - last}\n\n")
    last = __CALLS__['fitness']

INFO:root:N = 200
INFO:root:init: pop_size=854; max=(87, 56.49999999999999)
INFO:root:Best solution, fitness: (200, -196.0)
INFO:root:Best solution, coverage: 200
INFO:root:W: 592 (bloat = 196%)
INFO:root:Time: 101.25542783737183 s
INFO:root:Call count: 10854


INFO:root:N = 500
INFO:root:init: pop_size=1809; max=(208, 58.4)
INFO:root:Best solution, fitness: (500, -231.4)
INFO:root:Best solution, coverage: 500
INFO:root:W: 1657 (bloat = 231%)
INFO:root:Time: 536.6628746986389 s
INFO:root:Call count: 11809


INFO:root:N = 1000
INFO:root:init: pop_size=3619; max=(409, 59.099999999999994)
INFO:root:Best solution, fitness: (1000, -352.59999999999997)
INFO:root:Best solution, coverage: 1000
INFO:root:W: 4526 (bloat = 353%)
INFO:root:Time: 1963.1160719394684 s
INFO:root:Call count: 13619


