# Part 3 NSGA II

> Code is inspired from:
> 
> https://medium.com/@rossleecooloh/optimization-algorithm-nsga-ii-and-python-package-deap-fca0be6b2ffc
>
> https://github.com/DEAP/deap/blob/master/examples/ga/nsga2.py
>
>  https://github.com/DEAP/deap/blob/master/deap/tools/emo.py

In [40]:
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from distutils.command.build_scripts import first_line_re
from tkinter.tix import COLUMN
# Import deque for the stack structure, copy for deep copy nodes
from collections import deque
from sklearn.metrics import accuracy_score
import sklearn 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import (DecisionTreeClassifier, DecisionTreeRegressor,
                          ExtraTreeClassifier)
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
# Encoding categorical features with preserving the missing values in incomplete features
from sklearn.preprocessing import (KBinsDiscretizer, LabelEncoder,
                                   OneHotEncoder, OrdinalEncoder,
                                   StandardScaler)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot

import array
import random
import json

from sklearn.model_selection import train_test_split

from math import sqrt

from deap import algorithms
from deap import base
from deap import benchmarks
from deap.benchmarks.tools import diversity, convergence, hypervolume
from deap import creator
from deap import tools
from sklearn.pipeline import Pipeline

In [41]:

# define some constants for the genetic algorithm
CONSTANTS_DICT = {
    "POPULATION_SIZE": 100, # number of individuals in each population
    "MAX_GENERATIONS": 200, # number of generations to run the algorithm
    "CROSSOVER_RATE": 1.0, # crossover rate should always be 100%, based on slides
    "MUTATION_RATE": 0.2, # mutation rate
    "CLASSIFIER": KNeighborsClassifier(), # classifier to use
    "BOUND_LOW": 0.0, # lower bound for the features
    "BOUND_UP": 1.0, # upper bound for the features
    "ETA": 20.0, # crowding degree for mutation  and crossover
}


In [42]:
from queue import Empty


class DatasetPart3:
    def __init__(self, df) :
        self.df=df
        self.df.columns = self.df.columns.str.strip()
        self.x = self.df.iloc[:,:-1]
        self.y = self.df.iloc[:,-1]
        self.M = self.df.shape[0]  # number of rows
    
    @classmethod
    def constructFromFile(cls, filePath):
        """Depends on different ds"""
        pass

    def getDfWithSelectedFeatures(self, selectedFeatures:list):
        """No need to avoid FS bias, just based on df"""
        returnedDf = pd.DataFrame()
        selectedCount = 0
        for i in range(len(selectedFeatures)):
            isSelected = True if selectedFeatures[i] == 1 else False
            if isSelected:
                selectedCount += 1
                # concat this feature to the returned dataframe
                returnedDf = pd.concat([returnedDf,self.df.iloc[:,i]],axis=1)
        # concat the class column
        returnedDf = pd.concat([returnedDf, self.df.iloc[:,-1]],axis=1)
        return returnedDf, selectedCount
    
    @staticmethod
    def run_model(df:pd.DataFrame, classifier = CONSTANTS_DICT["CLASSIFIER"]):
        # pipe = Pipeline([
        #     ('scaler', StandardScaler()),
        #     ('classifier', classifier)
        #                  ])
        # X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.2, random_state=42)
        
        # pipe.fit(X_train, y_train)
        # return pipe.score(X_test, y_test)
        
        x = df.iloc[:,:-1]
        y = df.iloc[:,-1]
        # y = LabelEncoder().fit_transform(y)
        
        # X_train, X_test, y_train, y_test = train_test_split(
        #                                                         x,
        #                                                         y,
        #                                                         test_size=1/3,
        #                                                         random_state=0)
        # classifier.fit(X_train, y_train)
                                                                
        # return classifier.score(X_test, y_test)
        

        # # # evaluate the model
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        n_scores = cross_val_score(classifier, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
        return np.mean(n_scores)
        
        

class Vehicle(DatasetPart3):
    def __init__(self, df):
        super().__init__(df)
    
    @classmethod
    def constructFromFile(cls, filePath):
        df = pd.read_csv(filePath, header=None, delim_whitespace=True)
        df.columns = [f"f_{i}" for i in range(len(df.columns))]
        df.rename(columns = {f'f_{len(df.columns)-1}':'class'}, inplace = True)
        return cls(df)
    
class MuskClean(DatasetPart3):
    def __init__(self, df):
        super().__init__(df)

    @classmethod
    def constructFromFile(cls, filePath):
        df = pd.read_csv(filePath, header=None)
        # ignore the first 2 columns since they are NOT numerical, so it would be betteer to ignore them 
        df.drop([0,1], axis=1, inplace=True)
        df.columns = [f"f_{i}" for i in range(len(df.columns))]
        df.rename(columns = {f'f_{len(df.columns)-1}':'class'}, inplace = True)
        return cls(df)
    


In [43]:
# ds_vehicle = Vehicle.constructFromFile("./vehicle/vehicle.dat")

# ds_vehicle.df


In [44]:
# ds_mushclean = MuskClean.constructFromFile("./musk/clean1.data")
# ds_mushclean.df
# # len(ds_mushclean.x.columns)

set up creator

In [45]:
# 2 minimum objectives, so -1,-1
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) 
# Individual should be a list of binary values, i.e. a list of 0s and 1s
creator.create("Individual", list, fitness=creator.FitnessMin)



define wrapper based fitness evaluate function

In [46]:
# def getTransformedDf(df2Transform:pd.DataFrame):
#     """transform the continous features to discontinous. In other words, due to all features are continous, this functions are used to discretise all continous features.

#     KBins is used to discretise the continous features. The number of bins is set to 10. The strategy is set to uniform.
    
#     Tutorial: https://machinelearningmastery.com/discretization-transforms-for-machine-learning/
    
#     Args:
#         df2Transform (pd.DataFrame): df to transform, all features should be continous
        
#     """ 
#     tempDf = deepcopy(df2Transform)
#     tempDf_x = tempDf.iloc[:,:-1]
#     tempDf_y = tempDf.iloc[:,-1]
#     # tempDf_y = LabelEncoder().fit_transform(tempDf_y)
#     # only transform the continous features, ignore Y
#     kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#     tempDf_x = kbins.fit_transform(tempDf_x)
#     tempDf = pd.concat([pd.DataFrame(tempDf_x),tempDf_y],axis=1)
#     tempDf.columns = [f"f_{i}" for i in range(len(tempDf.columns))]
#     tempDf.rename(columns = {f'f_{len(tempDf.columns)-1}':'class'}, inplace = True)
#     return tempDf

def wrapperFitnessEvaluation(ds:DatasetPart3, individual:creator.Individual, 
                             classifier=CONSTANTS_DICT["CLASSIFIER"]): #KNN by default
    df_selected,selected_count = ds.getDfWithSelectedFeatures(individual)
    # df_selected = getTransformedDf(df_selected)
        
    acc_score = DatasetPart3.run_model(df_selected, classifier)
    obj1 = 1.0-acc_score # classification error
    obj2 = selected_count/len(individual) #ratio of selected features
    return obj1, obj2

tool box

In [47]:
# toolbox is a class contains the operators that we will use in our genetic programming algorithm
# it can be also be used as the container of methods which enables us to add new methods to the toolbox 
def setup_toolbox(ds:DatasetPart3, randSeed:int, evaluateFunction=wrapperFitnessEvaluation) -> base.Toolbox:
    toolbox = base.Toolbox()
    # for population size, we use the random.randint function to generate a random integer in the range [min, max]
    random.seed(randSeed)
    # register a method to generate random boolean values
    toolbox.register("attr_bool", random.randint, 0, 1)
    # register a method to generate random individuals
    toolbox.register("IndividualCreator", 
                     tools.initRepeat, 
                     creator.Individual, 
                     toolbox.attr_bool, 
                     n=len(ds.x.columns) # feature number, exclude the class column
                    )
    
    # N is not specificied, so need to specify number of individuals to generate within each population when we call it later
    toolbox.register("PopulationCreator", tools.initRepeat, list, toolbox.IndividualCreator) 
    toolbox.register("select", tools.selNSGA2)
    # toolbox.register("mate", tools.cxSimulatedBinaryBounded, low=CONSTANTS_DICT["BOUND_LOW"], up=CONSTANTS_DICT["BOUND_UP"], eta=20.0)
    # toolbox.register("mutate", tools.mutPolynomialBounded, low=CONSTANTS_DICT["BOUND_LOW"], up=CONSTANTS_DICT["BOUND_UP"], eta=20.0, indpb=1.0/len(ds.x.columns))
    
    # toolbox.register("elitism", tools.
    
    # toolbox.register("elitism", tools.selBest, k=int(CONSTANTS_DICT["ELITIST_PERCENTAGE"]*ds.M))
    # # toolbox.register("select", tools.selTournament, k=2, tournsize=3)
    
    toolbox.register("mate", tools.cxTwoPoint) # TODO: might need to change this to cxOnePoint
    # indpb refer to the probability of mutate happening on each gene, it is NOT the same as mutation rate
    toolbox.register("mutate", tools.mutFlipBit, indpb=1.0/ds.M) 
    
    toolbox.register("evaluate", evaluateFunction, ds) # need to pass individual:list
    return toolbox

run NSGA once 

> https://github.dev/DEAP/deap/blob/master/deap/tools/emo.py
> https://github.dev/DEAP/deap/blob/master/examples/ga/nsga2.py

In [48]:
import copy
from select import select
import time

def run_NSGAII(ds:DatasetPart3, randSeed:int, 
                ngen:int=CONSTANTS_DICT["MAX_GENERATIONS"], 
                popSize:int=CONSTANTS_DICT["POPULATION_SIZE"]):
    # stats
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    stats.register("mean", np.mean, axis = 0)
    stats.register("std", np.std, axis=0)
    # for record keeping
    logbook = tools.Logbook()    
    logbook.header = "gen", "mean", "std", "min",  "max"
    
    # create toolbox
    random.seed(randSeed)
    toolbox = setup_toolbox(ds, randSeed)
    # create the initial population
    population = toolbox.PopulationCreator(n=popSize)
    
    # calculate objectives
    def evaluate_fitness_values(pop) :
        """Update the fitness values of each individual for the given the population"""
        # invalid_ind = [ind for ind in pop if not ind.fitness.valid]
        # fitnesses =toolbox.map(toolbox.evaluate, invalid_ind)
        fitnesses = toolbox.map(toolbox.evaluate, pop)
        # print(f"fitnesses: {fitnesses}")

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit
    evaluate_fitness_values(population)
    
    # This is just to assign the crowding distance to the individuals
    # no actual selection is done
    population = toolbox.select(population, len(population))

    # Begin the generational process
    for gen_counter in range(ngen):
        
        # Vary the population
        offspring = tools.selTournamentDCD(population, len(population))
        offspring = [toolbox.clone(ind) for ind in offspring]
        
        # Apply crossover and mutation on the offspring
        for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
            if random.random() <= CONSTANTS_DICT["CROSSOVER_RATE"]: # always crossover
                toolbox.mate(ind1, ind2)
            if random.random() <= CONSTANTS_DICT["MUTATION_RATE"]:
                toolbox.mutate(ind1)
                toolbox.mutate(ind2)
            # # print(ind1,ind2)

            # del ind1.fitness.values
            # del ind2.fitness.values
                
                # del ind1.fitness.values
                # del ind2.fitness.values
        # for mutant in offspring:
        #     if random.random() <= CONSTANTS_DICT["MUTATION_RATE"]:
        #         toolbox.mutate(mutant)
                # del mutant.fitness.values
                
        # Evaluate all  offsprings individuals 
        evaluate_fitness_values(offspring)

      
        # elitism strategy
        # Select the next generation population
        population = toolbox.select(population + offspring, popSize)
        
        
        
        # stats
        record = stats.compile(population)
        logbook.record(gen=gen_counter,  **record)
        print(logbook.stream)
        
    print("Final population hypervolume is %f" % hypervolume(population, [11.0, 11.0]))
    return population, logbook, hypervolume(population, [11.0, 11.0]) # set of non-dominated individuals solutions


In [49]:
import matplotlib.pyplot as plt
def run_3_times_with_different_seed(ds:DatasetPart3,
                                     title:str, 
                                     max_gen=CONSTANTS_DICT["MAX_GENERATIONS"],
                                     classifier = CONSTANTS_DICT["CLASSIFIER"],
                                     randSeed = [i for i in range(3)],
                                     run_times=3):
    # run 3 times with different seed
    population_list = []
    logbook_list = []
    hypervolume_list = []
    
    for i in range(run_times):
        print('-'*80)
        print('-'*80)
        print(title,"\nRunning GA with seed: ", randSeed[i])
        population, logbook, hypervolume = run_NSGAII(ds, randSeed=randSeed[i], ngen=max_gen, popSize=CONSTANTS_DICT["POPULATION_SIZE"])
        population_list.append(population)
        logbook_list.append(logbook)
        hypervolume_list.append(hypervolume)    
        print('-'*80)
        print('-'*80)
        
        # plot the result
        fitTuple = [ind.fitness.values for ind in population]
            
        plt.plot(fitTuple[0], fitTuple[1], label=f"seed {randSeed[i]}\n hypervolume: {hypervolume}")
        plt.legend(bbox_to_anchor =(1.3,-0.1), loc='lower center')
        plt.ylabel("ratio of selected features")
        plt.xlabel("classification error rate")
        plt.title(f"dataset: {title} \n Objective space")
    plt.show()
        
        
    # compare error rates of the obtained solutions with that of using the entire feature set.
    subset_mean_err_rate = [logbook.select("mean")[0] for logbook in logbook_list]
    entire_mean_err_rate = DatasetPart3.run_model(ds.df)
    
    print(f"{title}:\n error rates of the obtained solution: {subset_mean_err_rate}\n error rate of using the entire feature set: {entire_mean_err_rate}")
        

    return population_list, logbook_list

In [50]:
ds_vehicle = Vehicle.constructFromFile("./vehicle/vehicle.dat")
run_3_times_with_different_seed(ds_vehicle, "vehicle",
                                max_gen=10,
                                run_times=3)


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
vehicle 
Running GA with seed:  0
gen	mean                   	std                    	min                    	max                    
0  	[0.35807311 0.43388889]	[0.05246388 0.1153885 ]	[0.29148926 0.11111111]	[0.54964052 0.72222222]
1  	[0.35039991 0.39777778]	[0.06231751 0.13140269]	[0.2844211  0.11111111]	[0.58433707 0.72222222]
2  	[0.34692923 0.36833333]	[0.07012796 0.15548709]	[0.2844211  0.11111111]	[0.58433707 0.66666667]
3  	[0.34070677 0.37      ]	[0.07372455 0.17395544]	[0.27934641 0.05555556]	[0.64927638 0.66666667]
4  	[0.35860756 0.32944444]	[0.09616896 0.17972115]	[0.27888422 0.05555556]	[0.73873483 0.66666667]


ValueError: at least one array or dtype is required

In [None]:
ds_mushclean = MuskClean.constructFromFile("./musk/clean1.data")
run_3_times_with_different_seed(ds_vehicle, "mushclean",
                                max_gen=10,
                                run_times=3)