# Part 3 NSGA II

> Code is inspired from:
> 
> https://medium.com/@rossleecooloh/optimization-algorithm-nsga-ii-and-python-package-deap-fca0be6b2ffc
>
> https://github.com/DEAP/deap/blob/master/examples/ga/nsga2.py
>
>  https://github.com/DEAP/deap/blob/master/deap/tools/emo.py

In [45]:
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from distutils.command.build_scripts import first_line_re
# Import deque for the stack structure, copy for deep copy nodes
from collections import deque
from sklearn.metrics import accuracy_score
import sklearn 

import seaborn as sns
# Encoding categorical features with preserving the missing values in incomplete features
from sklearn.preprocessing import (KBinsDiscretizer, LabelEncoder,
                                   OneHotEncoder, OrdinalEncoder,
                                   StandardScaler)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
from sklearn.tree import DecisionTreeClassifier
import array
import random
import json
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from math import sqrt

from deap import algorithms
from deap import base
from deap import benchmarks
from deap.benchmarks.tools import diversity, convergence, hypervolume
from deap import creator
from deap import tools
from sklearn.pipeline import Pipeline

In [46]:

# define some constants for the genetic algorithm



CONSTANTS_DICT = {
    "POPULATION_SIZE": 80, # number of individuals in each population
    "MAX_GENERATIONS": 100, # number of generations to run the algorithm
    "CROSSOVER_RATE": 0.9, # crossover rate should always be 100%, based on slides
    "MUTATION_RATE": 0.15, # mutation rate
    "CLASSIFIER":KNeighborsClassifier() , # classifier to use
    "BOUND_LOW": 0.0, # lower bound for the features
    "BOUND_UP": 1.0, # upper bound for the features
    "ETA": 20.0, # crowding degree for mutation  and crossover
}


In [47]:
from queue import Empty


class DatasetPart3:
    def __init__(self, df) :
        self.df=df
        self.df.columns = self.df.columns.str.strip()
        self.x = self.df.iloc[:,:-1]
        self.y = self.df.iloc[:,-1]
        # self.M = self.df.shape[0]  # number of rows
    
    # @classmethod
    # def constructFromFile(cls, filePath):
    #     """Depends on different ds"""
    #     pass

    def getDfWithSelectedFeatures(self, selectedFeatures:list):
        """No need to avoid FS bias, just based on df"""
        returnedDf = pd.DataFrame()
        selectedCount = 0
        for i in range(len(selectedFeatures)):
            isSelected = True if selectedFeatures[i] > 0.5 else False
            if isSelected:
                selectedCount += 1
                # concat this feature to the returned dataframe
                returnedDf = pd.concat([returnedDf,self.df.iloc[:,i]],axis=1)
        # concat the class column
        returnedDf = pd.concat([returnedDf, self.y],axis=1)
        assert returnedDf.empty == False

        return returnedDf, selectedCount
    
    @staticmethod
    def run_model(df:pd.DataFrame, classifier):
        assert df.empty == False
        # pipe = Pipeline([
        #     ('scaler', StandardScaler()),
        #     ('classifier', classifier)
        #                  ])
        # X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.2, random_state=42)
        
        # pipe.fit(X_train, y_train)
        # return pipe.score(X_test, y_test)
        
        x = df.iloc[:,:-1]
        y = df.iloc[:,-1]

        # # # evaluate the model
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
        n_scores = cross_val_score(classifier, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
        return np.mean(n_scores)
        
        

class Vehicle(DatasetPart3):
    def __init__(self, df):
        super().__init__(df)
    
    @classmethod
    def constructFromFile(cls, filePath):
        df = pd.read_csv(filePath, header=None, delim_whitespace=True)
        df.columns = [f"f_{i}" for i in range(len(df.columns))]
        df.rename(columns = {f'f_{len(df.columns)-1}':'class'}, inplace = True)
        return cls(df)
    
class MuskClean(DatasetPart3):
    def __init__(self, df):
        super().__init__(df)

    @classmethod
    def constructFromFile(cls, filePath):
        df = pd.read_csv(filePath, header=None)
        # ignore the first 2 columns since they are NOT numerical, so it would be betteer to ignore them 
        df.drop([0,1], axis=1, inplace=True)
        df.columns = [f"f_{i}" for i in range(len(df.columns))]
        df.rename(columns = {f'f_{len(df.columns)-1}':'class'}, inplace = True)
        return cls(df)
    


In [48]:
ds_vehicle = Vehicle.constructFromFile("./vehicle/vehicle.dat")
print(len(ds_vehicle.x.columns))
ds_vehicle.df.info()


18
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   f_0     846 non-null    int64 
 1   f_1     846 non-null    int64 
 2   f_2     846 non-null    int64 
 3   f_3     846 non-null    int64 
 4   f_4     846 non-null    int64 
 5   f_5     846 non-null    int64 
 6   f_6     846 non-null    int64 
 7   f_7     846 non-null    int64 
 8   f_8     846 non-null    int64 
 9   f_9     846 non-null    int64 
 10  f_10    846 non-null    int64 
 11  f_11    846 non-null    int64 
 12  f_12    846 non-null    int64 
 13  f_13    846 non-null    int64 
 14  f_14    846 non-null    int64 
 15  f_15    846 non-null    int64 
 16  f_16    846 non-null    int64 
 17  f_17    846 non-null    int64 
 18  class   846 non-null    object
dtypes: int64(18), object(1)
memory usage: 125.7+ KB


In [49]:
ds_mushclean = MuskClean.constructFromFile("./musk/clean1.data")
ds_mushclean.df.info()
# # len(ds_mushclean.x.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Columns: 167 entries, f_0 to class
dtypes: float64(1), int64(166)
memory usage: 621.2 KB


set up creator

In [50]:
# 2 minimum objectives, so -1,-1
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) 
# Individual should be a list of binary values, i.e. a list of 0s and 1s
creator.create("Individual", list, fitness=creator.FitnessMin)



define wrapper based fitness evaluate function

In [51]:
# def getTransformedDf(df2Transform:pd.DataFrame):
#     """transform the continous features to discontinous. In other words, due to all features are continous, this functions are used to discretise all continous features.

#     KBins is used to discretise the continous features. The number of bins is set to 10. The strategy is set to uniform.
    
#     Tutorial: https://machinelearningmastery.com/discretization-transforms-for-machine-learning/
    
#     Args:
#         df2Transform (pd.DataFrame): df to transform, all features should be continous
        
#     """ 
#     tempDf = deepcopy(df2Transform)
#     tempDf_x = tempDf.iloc[:,:-1]
#     tempDf_y = tempDf.iloc[:,-1]
#     # tempDf_y = LabelEncoder().fit_transform(tempDf_y)
#     # only transform the continous features, ignore Y
#     kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#     tempDf_x = kbins.fit_transform(tempDf_x)
#     tempDf = pd.concat([pd.DataFrame(tempDf_x),tempDf_y],axis=1)
#     tempDf.columns = [f"f_{i}" for i in range(len(tempDf.columns))]
#     tempDf.rename(columns = {f'f_{len(tempDf.columns)-1}':'class'}, inplace = True)
#     return tempDf

def wrapperFitnessEvaluation(ds:DatasetPart3, individual:creator.Individual, 
                             classifier=CONSTANTS_DICT["CLASSIFIER"]): #KNN by default
    df_selected,selected_count = ds.getDfWithSelectedFeatures(individual)
    # df_selected = getTransformedDf(df_selected)
        
    acc_score = DatasetPart3.run_model(df_selected, classifier)
    obj1 = 1.0-acc_score # classification error
    obj2 = selected_count/len(individual) #ratio of selected features
    assert 0<=obj1<=1
    assert 0<=obj2<=1
    return obj1, obj2

tool box

In [52]:
# toolbox is a class contains the operators that we will use in our genetic programming algorithm
# it can be also be used as the container of methods which enables us to add new methods to the toolbox 
def setup_toolbox(ds:DatasetPart3, randSeed:int) -> base.Toolbox:
    toolbox = base.Toolbox()
    # for population size, we use the random.randint function to generate a random integer in the range [min, max]
    random.seed(randSeed)
    # register a method to generate random boolean values
    toolbox.register("attr_bool", random.randint, 0, 1)
    # register a method to generate random individuals
    toolbox.register("IndividualCreator", 
                     tools.initRepeat, 
                     creator.Individual, 
                     toolbox.attr_bool, 
                     n=len(ds.x.columns) # feature number, exclude the class column
                    )
    
    # N is not specificied, so need to specify number of individuals to generate within each population when we call it later
    toolbox.register("PopulationCreator", tools.initRepeat, list, toolbox.IndividualCreator) 

    # toolbox.register("select", tools.emo.selTournamentDCD)
    toolbox.register("select", tools.emo.selNSGA2)
    toolbox.register('selectGen1', tools.selTournament, tournsize=2)
    
    
    
    # toolbox.register("mate", tools.cxSimulatedBinaryBounded, low=CONSTANTS_DICT["BOUND_LOW"], up=CONSTANTS_DICT["BOUND_UP"], eta=CONSTANTS_DICT["ETA"])
    # toolbox.register("mutate", tools.mutPolynomialBounded, low=CONSTANTS_DICT["BOUND_LOW"], up=CONSTANTS_DICT["BOUND_UP"], eta=CONSTANTS_DICT["ETA"], indpb=1.0/len(ds.x.columns))
    

    
    toolbox.register("mate", tools.cxTwoPoint) # TODO: might need to change this to cxOnePoint
    # indpb refer to the probability of mutate happening on each gene, it is NOT the same as mutation rate
    toolbox.register("mutate", tools.mutFlipBit, indpb=1.0/len(ds.x.columns)) 
    
    toolbox.register("evaluate", wrapperFitnessEvaluation, ds) # need to pass individual:list
    return toolbox

run NSGA once 

> https://github.dev/DEAP/deap/blob/master/deap/tools/emo.py
> https://github.dev/DEAP/deap/blob/master/examples/ga/nsga2.py

In [53]:
import copy
from select import select
import time

def run_NSGAII(ds:DatasetPart3, randSeed:int, 
                ngen:int=CONSTANTS_DICT["MAX_GENERATIONS"], 
                popSize:int=CONSTANTS_DICT["POPULATION_SIZE"]):
    # stats
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    stats.register("mean", np.mean, axis = 0)
    stats.register("std", np.std, axis=0)
    # for record keeping
    logbook = tools.Logbook()    
    logbook.header = "gen", "mean", "std", "min",  "max"
    
    # create toolbox
    random.seed(randSeed)
    toolbox = setup_toolbox(ds, randSeed)
    # create the initial population
    pop = toolbox.PopulationCreator(n=popSize)
    
    # calculate objectives
    def evaluate_fitness_values(pop) :
        """Update the fitness values of each individual for the given the population"""
        # invalid_ind = [ind for ind in pop if not ind.fitness.valid]
        # fitnesses =toolbox.map(toolbox.evaluate, invalid_ind)
        fitnesses = toolbox.map(toolbox.evaluate, pop)
        # print(f"fitnesses: {fitnesses}")
        # print(pop)
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit
    evaluate_fitness_values(pop)
    
    # fronts = tools.emo.sortNondominated(pop,k=popSize,first_front_only=False)
    # for idx,front in enumerate(fronts):
    #     #print(idx,front)
    #     for ind in front:
    #         ind.fitness.values = (idx+1),# change fitness to the order of pareto front
    # offspring = toolbox.selectGen1(pop, len(pop))
    # # apply mate and mutate only once
    # offspring = algorithms.varAnd(offspring,toolbox,
    #                               CONSTANTS_DICT["CROSSOVER_RATE"],
    #                               CONSTANTS_DICT["MUTATION_RATE"]) 

    # for gen_counter in range(1,ngen):
    #     combined_population = pop + offspring
        
    #     # fitnesses = toolbox.map(toolbox.evaluate, combined_population)
    #     # # print(f"fitnesses: {fitnesses}")

    #     # for ind, fit in zip(combined_population, fitnesses):
    #     #     ind.fitness.values = fit
    #     evaluate_fitness_values(combined_population)
        
    #     # stats
    #     record = stats.compile(combined_population)
    #     logbook.record(gen=gen_counter,  **record)
    #     print(logbook.stream)

        
    #     fronts = tools.emo.sortNondominated(combined_population,
    #                                         k=popSize,
    #                                         first_front_only=False)
        
    #     for front in fronts:
    #         tools.emo.assignCrowdingDist(front) # for computing crowding distance
            
    #     pop = []
    #     for front in fronts:
    #         pop += front
    #     pop = toolbox.clone(pop)
    #     pop = tools.selNSGA2(pop,k=popSize,nd='standard') # elitism strategy basd on crowded distance

    #     offspring = toolbox.select(pop,popSize) 
    #     offspring = toolbox.clone(offspring)
    #     offspring = algorithms.varAnd(offspring,toolbox,
    #                               CONSTANTS_DICT["CROSSOVER_RATE"],
    #                               CONSTANTS_DICT["MUTATION_RATE"]) 
    # bestInd = tools.selBest(pop,1)[0]
    # bestFit = bestInd.fitness.values
    # print("Final population hypervolume is %f" % hypervolume(pop, [11.0, 11.0]))
    # return pop, logbook, hypervolume(pop, [11.0, 11.0]) # set of non-dominated individuals solutions

    # This is just to assign the crowding distance to the individuals
    # no actual selection is done
    pop = toolbox.select(pop, len(pop))
    record = stats.compile(pop)
    logbook.record(gen=0, **record)
    print(logbook.stream)

    # Begin the generational process
    for gen_counter in range(1,ngen):
        
        # Vary the pop
        offspring = tools.selTournamentDCD(pop, len(pop))
        offspring = [toolbox.clone(ind) for ind in offspring]
        
        offspring = algorithms.varAnd(offspring, toolbox, 
                                      cxpb=CONSTANTS_DICT["CROSSOVER_RATE"], 
                                      mutpb=CONSTANTS_DICT["MUTATION_RATE"])
           
        # Evaluate all  offsprings individuals 
        combined_pop = offspring + pop
        evaluate_fitness_values(combined_pop)
      
        # elitism strategy
        # Select the next generation pop
        pop = toolbox.select(combined_pop, popSize)
        # stats
        record = stats.compile(pop)
        logbook.record(gen=gen_counter,  **record)
        print(logbook.stream)
        
    print("Final pop hypervolume is %f" % hypervolume(pop, [11.0, 11.0]))
    return pop, logbook, hypervolume(pop, [11.0, 11.0]) # set of non-dominated individuals solutions


In [54]:
import matplotlib.pyplot as plt
def run_3_times_with_different_seed(ds:DatasetPart3,
                                     title:str, 
                                     max_gen=CONSTANTS_DICT["MAX_GENERATIONS"],
                                     classifier = CONSTANTS_DICT["CLASSIFIER"],
                                     randSeed = [i+1 for i in range(3)],
                                     run_times=3):
    # run 3 times with different seed
    population_list = []
    logbook_list = []
    hypervolume_list = []
    
    for i in range(run_times):
        print('-'*80)
        print('-'*80)
        print(title,"\nRunning GA with seed: ", randSeed[i])
        population, logbook, hypervolume = run_NSGAII(ds, randSeed=randSeed[i], ngen=max_gen, popSize=CONSTANTS_DICT["POPULATION_SIZE"])
        population_list.append(population)
        logbook_list.append(logbook)
        hypervolume_list.append(hypervolume)    
        print('-'*80)
        print('-'*80)
        
        front = tools.emo.sortNondominated(population,len(population))[0]
        for ind in front:
            plt.plot(ind.fitness.values[0],ind.fitness.values[1],'r.',ms=2)
        plt.xlabel('f1')
        plt.ylabel('f2')
        plt.tight_layout()
        plt.show()
        
        
        # plot the result
        fitTuple = [ind.fitness.values for ind in population]
            
        plt.plot(fitTuple[0], fitTuple[1], label=f"seed {randSeed[i]}\n hypervolume: {hypervolume}")
        plt.legend(bbox_to_anchor =(1.3,-0.1), loc='lower center')
        plt.ylabel("ratio of selected features")
        plt.xlabel("classification error rate")
        plt.title(f"dataset: {title} \n Objective space")
    plt.show()
        
        
    # compare error rates of the obtained solutions with that of using the entire feature set.
    subset_mean_err_rate = [logbook.select("mean")[0] for logbook in logbook_list]
    entire_mean_err_rate = DatasetPart3.run_model(ds.df)
    
    print(f"{title}:\n error rates of the obtained solution: {subset_mean_err_rate}\n error rate of using the entire feature set: {entire_mean_err_rate}")
        

    return population_list, logbook_list

In [55]:
ds_vehicle = Vehicle.constructFromFile("./vehicle/vehicle.dat")
run_3_times_with_different_seed(ds_vehicle, "vehicle",
                                max_gen=100,
                                run_times=3)


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
vehicle 
Running GA with seed:  1
gen	mean                   	std                    	min                    	max                    
0  	[0.37366422 0.50763889]	[0.03711397 0.11410675]	[0.30005602 0.22222222]	[0.46881793 0.72222222]
1  	[0.36070347 0.45069444]	[0.04533964 0.11784908]	[0.28653221 0.22222222]	[0.46881793 0.72222222]
2  	[0.35572955 0.40902778]	[0.05979356 0.13978703]	[0.28653221 0.11111111]	[0.5732605  0.72222222]
3  	[0.35284734 0.36388889]	[0.06260502 0.13548609]	[0.28347619 0.11111111]	[0.5732605  0.66666667]
4  	[0.35221415 0.35208333]	[0.07286624 0.14853808]	[0.28347619 0.11111111]	[0.5732605  0.61111111]
5  	[0.35980963 0.32430556]	[0.08567356 0.16653786]	[0.28347619 0.05555556]	[0.5690112  0.61111111]
6  	[0.37851106 0.28333333]	[0.10838957 0.16377114]	[0.28157983 0.05555556]	[0.73518768 0.61111111]


ValueError: at least one array or dtype is required

In [None]:
ds_mushclean = MuskClean.constructFromFile("./musk/clean1.data")
run_3_times_with_different_seed(ds_vehicle, "mushclean",
                                max_gen=100,
                                run_times=3)