Raffaele Pane `<raffaele.pane@studenti.polito.it>`  
[`https://github.com/bred91/Computational_Intelligence_2022-2023/tree/main/lab2`](https://github.com/bred91/Computational_Intelligence_2022-2023/tree/main/lab2) 

# Lab 2: Set Covering with an Evolutionary Algorithm

First lab + peer review. List this activity in your final report, it will be part of your exam.

## Task

Given a number $N$ and some lists of integers $P = (L_0, L_1, L_2, ..., L_n)$, 
determine, if possible, $S = (L_{s_0}, L_{s_1}, L_{s_2}, ..., L_{s_n})$
such that each number between $0$ and $N-1$ appears in at least one list

$$\forall n \in [0, N-1] \ \exists i : n \in L_{s_i}$$

and that the total numbers of elements in all $L_{s_i}$ is minimum. 

## Solution

In [1]:
# import
import random
import numpy as np
import logging
import itertools
import matplotlib.pyplot as plt
from collections import namedtuple
import time

In [2]:
logging.getLogger().setLevel(logging.INFO)

### Preprocessing and Problem creation

In [3]:
def problem(N, seed=None):
    """Generate a random problem instance"""
    random.seed(seed)
    return [
        list(set(random.randint(0, N - 1) for n in range(random.randint(N // 5, N // 2))))
        for n in range(random.randint(N, N * 5))
    ]

def preprocessing (N):
    '''Preprocessing to get all possible lists and sort them'''
    sorted_all_list = sorted(problem(N, seed=42), key=lambda l: len(l))
    all_lists = list(sorted_all_list for sorted_all_list, _ in itertools.groupby(sorted_all_list)) # remove duplicates
    return all_lists

### GA Functions

In [4]:
Individual = namedtuple("Individual", ["genome", "fitness"])


def fitness(genome):
    '''Fitness function'''
    # The idea is to weight the fitness by the coverage of the genoma minus the length/4 of the genoma
    cov = set()
    cont = 0
    logging.debug(f"Genome: {genome}")
    for l in genome:
        logging.debug(f"List: {l}")        
        cont += len(l)/4        
        cov = cov.union(set(l))
        logging.debug(f"cov: {cov}")
    
    logging.debug(f"Cont: {cont}")        
    return len(cov) - cont   

def tournament(population, tournament_size = 2):
    '''Tournament selection'''
    return max(random.choices(population, k = tournament_size), key= lambda i: i.fitness)


def cross_over(g1, g2, N):
    '''Cross-Over function'''
    # The idea is to create a new genoma that is made by parts (lists) of the parents' genomes
    cut = random.randint(0, N - 1)
    logging.debug(f"Cut: {cut}")
    return tuple(list(g1[:cut]) + list(g2[cut:]))    


def mutation(g, N, all_lists):
    '''Mutation function'''
    # The idea is to sobstitute one gene (a list) with an other from the initial (problem) list (Allele)
    point = random.randint(0, N - 1)
    g = list(g)
    g[point] = random.choice(all_lists)
    return g    

 The idea is to create a popolation of individuals in the form of a list of tuples (genome, fitness). <br>
 Each genome is a list of N lists (genes), 1 is a list of integers and the other N-1 are empty lists.


In [5]:
def initialize_population(N):
    '''Creation of the population'''
    population = list()

    lista = preprocessing(N)

    # list of empty lists
    listo = []
    for _ in range(N-1):
        listo.append(list())
    listo

    for gene in lista:
        genome = listo.copy()
        # add the single gene
        genome.append(gene)
        logging.debug(f"Genome pre: {genome}")
        # let it shuffle
        random.shuffle(genome)
        logging.debug(f"Genome post: {genome}")
        population.append(Individual(tuple(genome), fitness(genome)))
    # The idea is to create a popolation of individuals in the form of a list of tuples (genome, fitness)
    # Each genome is a list of N lists (genes), 1 is a list of integers and the other N-1 are empty lists
    
    logging.info(f"init: pop_size={len(population)}; max={max(population, key=lambda i: i.fitness)[1]}")
    return population, lista

In [6]:
def GA_algh(population, all_lists, N, shutdown = False ):
    '''Genetic Algorithm'''
    fitness_log = [(0, i.fitness) for i in population]

    for g in range(NUM_GENERATIONS):
        offspring = list()
        for i in range(OFFSPRING_SIZE):
            if random.random() < 0.3:
                p = tournament(population)
                o = mutation(p.genome, N, all_lists)
            else:        
                p1 = tournament(population, 20)
                p2 = tournament(population, 20)
                logging.debug(f"p1: {p1}, p2: {p2}")
                o = cross_over(p1.genome, p2.genome, N)
                logging.debug(f"o: {o}")

            f = fitness(o)
            fitness_log.append((g + 1, f))
            offspring.append(Individual(o, f))
            
        population += offspring
        population = sorted(population, key=lambda i: i.fitness, reverse=True)[:POPULATION_SIZE]

    if not shutdown:    
        logging.info(f"Best solution: {population[0].genome}")
    logging.info(f"W: {sum(len(_) for _ in population[0].genome)} (bloat = {(sum(len(_) for _ in population[0].genome)-N)/N*100:.0f}%)")

In [7]:
#PROBLEM_SIZE = 500
POPULATION_SIZE = 5
OFFSPRING_SIZE = 20
# N = PROBLEM_SIZE
NUM_GENERATIONS = 100

### Execution

In [10]:
for n in [5, 10, 50, 100]:
    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh(population, all_lists, n)
    end = time.time()
    logging.info(f"Time: {end - start} s\n\n")

INFO:root:N = 5
INFO:root:init: pop_size=21; max=1.5
INFO:root:Best solution: [[0, 1], [], [], [3], [2, 4]]
INFO:root:W: 5 (bloat = 0%)
INFO:root:Time: 0.13698577880859375 s


INFO:root:N = 10
INFO:root:init: pop_size=49; max=3.75
INFO:root:Best solution: [[], [], [], [8, 9, 3, 6], [2, 5, 7], [], [8, 0, 4, 1], [6], [], []]
INFO:root:W: 12 (bloat = 20%)
INFO:root:Time: 0.19199585914611816 s


INFO:root:N = 50
INFO:root:init: pop_size=213; max=16.5
INFO:root:Best solution: [[], [], [], [], [], [], [], [], [], [], [], [], [32, 2, 34, 5, 38, 6, 40, 41, 42, 10, 44, 39, 15, 49, 22, 23, 26], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [0, 1, 4, 6, 11, 12, 13, 17, 20, 23, 24, 25, 27, 28, 34, 36, 37, 39, 45, 47, 48], [], [], [], [], [3, 8, 11, 13, 20, 21, 22, 24, 29, 30, 33, 34, 35, 41, 42, 43, 45, 46, 47, 49], [], [], [], [], [36, 38, 7, 10, 44, 47, 18, 19, 28, 29], [], [], [], [], [], [], [], [], [], [], []]
INFO:root:W: 68 (bloat = 36%)
INFO:root:Time: 1.062014102935791 s


I

In [11]:
for n in [200, 500, 1000]:
    logging.info(f"N = {n}")
    population, all_lists = initialize_population(n)
    logging.debug(population[0])

    start = time.time()
    GA_algh(population, all_lists, n, shutdown = True)
    end = time.time()
    logging.info(f"Time: {end - start} s\n\n")

INFO:root:N = 200
INFO:root:init: pop_size=854; max=65.25
INFO:root:W: 318 (bloat = 59%)
INFO:root:Time: 8.127999782562256 s


INFO:root:N = 500
INFO:root:init: pop_size=1809; max=156.0
INFO:root:W: 679 (bloat = 36%)
INFO:root:Time: 46.81200408935547 s


INFO:root:N = 1000
INFO:root:init: pop_size=3619; max=306.75
INFO:root:W: 1401 (bloat = 40%)
INFO:root:Time: 173.43124175071716 s


