# Demo of binary classification with genetic programming using DEAP

## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gplearn
import os
import enum
import random

import DeapWrapperComponent as de



## Define constant and variable settings

In [2]:
# Newline kept in a named constant so it can be referenced inside f-string
# replacement fields (see the messages built further down).
NEW_LINE = '\n'
# Relative, Windows-style path of the directory that holds the problem data files.
DATA_PATH = r'.\Data'
def getDataPaths():
    """Return the path of every entry found in ``DATA_PATH``.

    Returns:
        tuple of str: ``DATA_PATH`` joined with each directory entry name,
        ordered by ascending entry name.
    """
    entry_names = sorted(os.listdir(DATA_PATH))
    return tuple(os.path.join(DATA_PATH, name) for name in entry_names)

# Unpack the sorted directory listing into one constant per data file.
# This relies on DATA_PATH holding exactly seven entries whose sorted order
# matches the names below; any other count makes the unpacking raise ValueError.
BREAST_PATH, HORSE_PATH, IONOSPHERE_PATH, PIMA_PATH, SONAR_PATH, SPAMBASE_TEST, SPAMBASE_TRAIN = getDataPaths()

class ProblemType(enum.Enum):
    """Identify which benchmark data set a demo run targets.

    The first ``_``-separated token of a member's name is used as the prefix
    that data file names are matched against, so each name has to be spelled
    the way its data file name starts.
    """
    breast_cancer_problem = 0
    horse_problem = 1
    ionosphere_problem = 2
    # Alias kept for backward compatibility with the original misspelling.
    # Because it shares the value 2 it resolves to ``ionosphere_problem``, so
    # its ``name`` now yields the prefix "ionosphere" instead of "ionospere".
    ionospere_problem = 2
    sonar_problem = 3
    pima_problem = 4
    spambase_problem = 5
#getDataPaths()

## Specify ProblemDataProvider component for sourcing problem data

In [3]:
class ProblemDataProvider(object):
    """Locate, load and pre-process the data files of a single problem.

    Construction resolves the data file paths for ``problem_type``, reads each
    file with ``pandas.read_csv``, applies the per-problem pre-processing and
    finally converts every pre-processed frame to its ``.values`` array.
    All resulting dictionaries share the same keys: ``'train'`` and ``'test'``
    when several files match the problem, otherwise the single key ``'all'``.
    """

    def __init__(self, problem_type):
        """Load everything for ``problem_type`` (a ``ProblemType`` member)."""
        self.__problem_type = problem_type
        self.__problem_path_map = ProblemDataProvider.getProblemDataPaths(problem_type)
        self.__problem_data_map = self.readData()
        self.__le = preprocessing.LabelEncoder()
        self.__pre_processed_data_map = self.preProcessData()
        self.__data_matrix_map = self.dataAsMatrix()

    def __str__(self):
        """Describe the problem type, its file paths and the raw data shapes."""
        parts = [
            f'problem_type: {self.__problem_type}',
            f'{NEW_LINE}problem_path_map: {self.__problem_path_map}{NEW_LINE}',
        ]
        for key, frame in self.__problem_data_map.items():
            parts.append(
                f'{self.__problem_path_map[key]} data has the shape: {frame.shape}{NEW_LINE}')
        parts.append(f'{NEW_LINE}')
        return ''.join(parts)

    def readData(self):
        """Read every resolved path into a DataFrame, keyed like the path map."""
        return {key: pd.read_csv(path) for key, path in self.__problem_path_map.items()}

    def preProcessData(self):
        """Return a map of pre-processed copies of the raw frames."""
        return {
            key: self.applyPreProcessorPerDataset(key, frame)
            for key, frame in self.__problem_data_map.items()
        }

    def applyPreProcessorPerDataset(self, key, old_data_df):
        """Return a pre-processed copy of ``old_data_df``.

        For the spambase problem the ``Id`` column is dropped and the two
        ``capital_run_length_*`` columns are cast to float; every other column
        is left as read. For all other problem types an unmodified copy is
        returned (previously this path failed with ``UnboundLocalError``).

        Args:
            key: Data set key (``'train'``, ``'test'`` or ``'all'``); currently unused.
            old_data_df: Raw DataFrame; it is never modified in place.

        Returns:
            pandas.DataFrame: a new frame, independent of ``old_data_df``.
        """
        new_data_df = old_data_df.copy()
        if self.__problem_type == ProblemType.spambase_problem:
            # Keyword form: the positional axis argument is rejected by pandas 2.x.
            new_data_df = new_data_df.drop(columns='Id')
            for column in ('capital_run_length_longest', 'capital_run_length_total'):
                new_data_df[column] = new_data_df[column].astype(float)
        return new_data_df

    def dataAsMatrix(self):
        """Return the ``.values`` array of every pre-processed frame."""
        return {key: frame.values for key, frame in self.__pre_processed_data_map.items()}

    @property
    def problem_type(self):
        """The problem type this provider was built for."""
        return self.__problem_type

    @property
    def problem_path_map(self):
        """Map of data set key to data file path."""
        return self.__problem_path_map

    @property
    def problem_data_map(self):
        """Map of data set key to the raw DataFrame."""
        return self.__problem_data_map

    @property
    def pre_processed_data_map(self):
        """Map of data set key to the pre-processed DataFrame."""
        return self.__pre_processed_data_map

    @property
    def data_matrix_map(self):
        """Map of data set key to the pre-processed data as an array."""
        return self.__data_matrix_map

    @staticmethod
    def isTestDataPath(path):
        """Tell whether ``path`` names a test file.

        A file counts as a test file when the second ``_``-separated token of
        its file name starts with ``test`` (case-insensitive). File names
        without an underscore are not test files.
        """
        tokens = os.path.basename(path).split('_')
        return len(tokens) > 1 and tokens[1].lower().startswith('test')

    @staticmethod
    def getProblemDataPaths(problem_type):
        """Find the data files whose names start with the problem's key.

        The key is the lower-cased first ``_``-separated token of
        ``problem_type.name``. Progress is printed while scanning.

        Returns:
            dict: ``{'test': path, 'train': path}`` when several files match,
            otherwise ``{'all': path}``.

        Raises:
            IndexError: when no file matches the key (same exception type as
                before, now with an explanatory message).
        """
        key = problem_type.name.split('_')[0].lower()
        print('key: {}'.format(key))
        paths = getDataPaths()
        print(f'There are {len(paths)} Problem data files found, they include:')
        found_paths = []
        for candidate in paths:
            print(f'{candidate}')
            if os.path.basename(candidate).startswith(key):
                found_paths.append(candidate)
        if not found_paths:
            raise IndexError(
                f'No data file whose name starts with {key!r} was found for {problem_type.name}')
        paths_map = {}
        if len(found_paths) > 1:
            for candidate in found_paths:
                split_name = 'test' if ProblemDataProvider.isTestDataPath(candidate) else 'train'
                paths_map[split_name] = candidate
        else:
            paths_map['all'] = found_paths[0]
        print(f'{problem_type.name} paths_map is: {paths_map}')
        return paths_map

         
    
        
# spambase_data = pd.read_csv(SPAMBASE_TEST)
# spambase_data.head()

## Test ProblemDataProvider

In [4]:
# Smoke-test ProblemDataProvider on the spambase problem: build the provider,
# print its summary, then inspect the first rows of the pre-processed data.
problem_type = ProblemType.spambase_problem
data_provider = ProblemDataProvider(problem_type)
print(f'\ndata_provider: {data_provider}')
# Raw frames (as read from disk) and their pre-processed counterparts.
problem_raw_data_map = data_provider.problem_data_map
problem_clean_data_map = data_provider.pre_processed_data_map
problem_map_keys = problem_raw_data_map.keys()
print(f'Problem map keys: {problem_map_keys}')
# Only the first five rows of each data set are kept for display.
train_data = problem_clean_data_map['train'].head(5)
# NOTE: test_data is computed but not printed below.
test_data = problem_clean_data_map['test'].head(5)
print(f'train data: {train_data}')
print('\n\n')
print(f'train data types: {train_data.dtypes}')
#print(f'test data: {problem_clean_data_map['test'].head(5)}')

key: spambase
There are 7 Problem data files found, they include:
.\Data\breast-cancer-wisconsin.csv
.\Data\horse-colic.csv
.\Data\ionosphere.csv
.\Data\pima-indians-diabetes.csv
.\Data\sonar.csv
.\Data\spambase_test_data.csv
.\Data\spambase_train_data.csv
spambase_problem paths_map is: {'test': '.\\Data\\spambase_test_data.csv', 'train': '.\\Data\\spambase_train_data.csv'}

data_provider: problem_type: ProblemType.spambase_problem
problem_path_map: {'test': '.\\Data\\spambase_test_data.csv', 'train': '.\\Data\\spambase_train_data.csv'}
.\Data\spambase_test_data.csv data has the shape: (921, 58)
.\Data\spambase_train_data.csv data has the shape: (3680, 59)


Problem map keys: dict_keys(['test', 'train'])
train data:    word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0             0.0              14.28            0.0           0.0   
1             0.0               0.00            1.0           0.0   
2             0.0               0.00            0.0           0.0  

## Specify Problem Evaluation Functions

In [5]:
  
def breastCancerEvalFunc(data, individual):
    """Placeholder evaluation function for the breast cancer problem.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None

def horseEvalFunc(data, individual):
    """Placeholder evaluation function for the horse problem.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None

def ionosphereEvalFunc(data, individual):
    """Placeholder evaluation function for the ionosphere problem.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None

def pimaEvalFunc(data, individual):
    """Placeholder evaluation function for the pima problem.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None

def sonarEvalFunc(data, individual):
    """Placeholder evaluation function for the sonar problem.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    return None

def spambaseEvalFunc(spam, toolbox, individual):
    """Count how many mails the compiled individual classifies correctly.

    Args:
        spam: Sequence of rows; items 0..56 of a row are passed to the
            compiled individual and item 57 is the expected label.
        toolbox: Object whose ``compile(expr=...)`` turns ``individual`` into
            a callable.
        individual: The expression to evaluate.

    Returns:
        tuple: one-element tuple holding the number of rows for which the
        truthiness of the prediction equals the truthiness of the label.
    """
    total_rows = len(spam)
    # Turn the tree expression into a callable classifier.
    classifier = toolbox.compile(expr=individual)
    # The sample size equals the data set size, so every mail is scored
    # (in random order) rather than a subset.
    shuffled_mails = random.sample(spam, total_rows)
    hits = 0
    for mail in shuffled_mails:
        predicted = bool(classifier(*mail[:57]))
        expected = bool(mail[57])
        if predicted is expected:
            hits += 1
    return (hits,)

class ProblemEvaluationFuncProvider(object):
    """Select the evaluation function that belongs to a problem type."""

    def __init__(self, problem_type):
        """Store ``problem_type`` and build the type-to-function table."""
        self.__problem_type = problem_type
        table = {}
        table[ProblemType.breast_cancer_problem] = breastCancerEvalFunc
        table[ProblemType.horse_problem] = horseEvalFunc
        table[ProblemType.ionospere_problem] = ionosphereEvalFunc
        table[ProblemType.pima_problem] = pimaEvalFunc
        table[ProblemType.sonar_problem] = sonarEvalFunc
        table[ProblemType.spambase_problem] = spambaseEvalFunc
        self.__eval_func_map = table

    @property
    def eval_func(self):
        """The evaluation function registered for the stored problem type."""
        registered = self.__eval_func_map
        return registered[self.__problem_type]
    
        
        
    

In [6]:
def trainGPModel(eval_func, train_data):
    """Run both stages of the DEAP wrapper and hand back their results.

    Args:
        eval_func: Evaluation function forwarded to ``de.runPostCompileState``.
        train_data: Training rows forwarded to ``de.runPostCompileState``.

    Returns:
        tuple: ``(pop, stats, hof)`` exactly as produced by
        ``de.runPostCompileState``. Previously these values were computed and
        then discarded, so the outcome of training was unreachable.
        NOTE(review): presumably population, statistics and hall of fame —
        confirm against DeapWrapperComponent.
    """
    primitive_set, toolbox = de.runPreCompileState()
    pop, stats, hof = de.runPostCompileState(eval_func, train_data, toolbox, primitive_set)
    return pop, stats, hof


def runDemoForSpamProblem():
    """Run the spambase demo end to end.

    Loads the spambase matrices, prints their shapes and the first two
    training rows, looks up the matching evaluation function and passes it,
    together with the training rows as a list, to ``trainGPModel``.
    """
    problem_type = ProblemType.spambase_problem
    matrices = ProblemDataProvider(problem_type).data_matrix_map
    train_data, test_data = matrices['train'], matrices['test']
    print(f'Training data size is: {train_data.shape}')
    print(f'Testing data size is: {test_data.shape}')
    print(f"Sample (2) rows of train_data:{NEW_LINE}{train_data[:2]}")
    eval_func = ProblemEvaluationFuncProvider(problem_type).eval_func
    print(f"Evaluation function for {problem_type.name} is {eval_func}")
    # The matrix is converted to a plain list of rows before training.
    trainGPModel(eval_func, train_data.tolist())
    
# Execute the spambase demo when this cell runs.
runDemoForSpamProblem()

key: spambase
There are 7 Problem data files found, they include:
.\Data\breast-cancer-wisconsin.csv
.\Data\horse-colic.csv
.\Data\ionosphere.csv
.\Data\pima-indians-diabetes.csv
.\Data\sonar.csv
.\Data\spambase_test_data.csv
.\Data\spambase_train_data.csv
spambase_problem paths_map is: {'test': '.\\Data\\spambase_test_data.csv', 'train': '.\\Data\\spambase_train_data.csv'}
Training data size is: (3680, 58)
Testing data size is: (921, 57)
Sample (2) rows of train_data:
[[0.0 14.28 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 1.8 5.0 9.0 True]
 [0.0 0.0 1.0 0.0 0.5 0.0 0.0 0.0 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.5 0.0 0.0
  2.5 0.0 1.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.0 0.0 0.0 0.0 0.35700000000000004
  0.0 0.892 0.0 0.0 2.0 19.0 172.0 False]]
Evaluation functi