# Genetic Programming for Feature Engineering
- TODO: build procedure to turn a tree into an executable function
- TODO: figure out how to implement that in a dataframe
- TODO: build mutation and crossover in trees

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time

## Class and Function Definitions

In [2]:
# a node encapsulates an operation / fatures / constant; it remembers it's parent, and knows it's children
class Node(object):
    def __init__(self, typ, val, par):
        self.type = typ
        self.value = val
        self.parent = par
        # init the string and left & right to nothing
        self.Str = '%s(%s)'%(self.type, self.value)
        self.left = None
        self.right = None
        self.leftStr = '_'
        self.rightStr = '_'
        
    def __str__(self):
        return '(%s) -> [%s, %s]'%(self.Str, self.leftStr, self.rightStr)
    
    def setLeft(self, L):
        self.left = L
        self.leftStr = '%s(%s)'%(self.left.type, self.left.value)
        
    def setRight(self, R):
        self.right = R
        self.rightStr = '%s(%s)'%(self.right.type, self.right.value)

In [129]:
# a tree represents the entire function; it knows the root, level-ordered structure, and depth
class Tree(object):
    def __init__(self, root, maxDepth):
        self.root = root
        self.depth = maxDepth # init with just the max depth allowed for now
        self.struct = None
        # build the structure dict
        self.GenStruct()
        
    def __str__(self):
        # first build the struct dict, if necessary
        if self.struct is None:
            self.GenStruct()
        # now print
        return '\n'.join(['%d: %s'%(key, '|'.join([str(node.value) for node in val])) for (key, val) in self.struct.items()])

    @staticmethod
    def __RecTreeStruct(currNode, tree, currKey):
        '''
        Recursive tree structuring function; only to be called by TreeStruct
        '''
        # save the node
        this = tree[currKey].copy()
        this.append(currNode)
        tree[currKey] = this
        if (currNode.left is None) & (currNode.right is None):
            return tree

        if currNode.left is not None:
            tree = Tree.__RecTreeStruct(currNode.left, tree, currKey+1)
        if currNode.right is not None:
            tree = Tree.__RecTreeStruct(currNode.right, tree, currKey+1)
        return tree

    def GenStruct(self):
        '''
        Return the function tree structure as a dictionary.
        :return tree: level number-keyed ordered dict of the tree
        '''
        # populate the tree view dict
        self.struct = OrderedDict.fromkeys(range(self.depth), [])
        self.struct = Tree.__RecTreeStruct(self.root, self.struct, 0)
        # prune it now (remove unused rows)
        for key in list(self.struct.keys()):
            if self.struct[key] == []:
                self.struct.pop(key)
        
        # set the depth
        self.depth = max(self.struct.keys())
        return self.struct

    def GenFunction(self):
        '''
        Returns a string representation of the function tree as
        a function.
        :return function: the string of the function
        '''
        
        funcStrings = {}

        # special handling of const or feat root nodes
        if self.root.type != 'ops':
            funcStrings[self.root] = str(self.root.value)
        else:
            # start at the top & climb down the tree
            for currLev in range(self.depth, 0, -1):
                nodes = self.struct[currLev]
                # parse the nodes at this level and iterate in pairs
                for indx in range(0, len(nodes), 2):
                    # if there's a func string already defined, use them
                    lVal = funcStrings.get(nodes[indx], str(nodes[indx].value))
                    rVal = funcStrings.get(nodes[indx+1], str(nodes[indx+1].value))
                    # build and store the function string
                    funcStrings[nodes[indx].parent] = nodes[indx].parent.value + '(' + lVal + ',' + rVal + ')'
                    
        return funcStrings[self.root]    

In [102]:
def BuildTreeRec(currNode, currDepth, maxDepth, nodeMeta):
    '''
    Recursive tree building function; only to be called by BuildTree
    '''

    # exit if too deep or at a leaf
    if (currDepth > maxDepth) or (currNode.type != 'ops'):
        return currNode    
    # hit max depth, so ensure only consts or feats selected
    if currDepth == maxDepth:
        noOpsK = [k for k in nodeMeta.keys() if k != 'ops']
        noOpsW = [nodeMeta[t][2] for t in noOpsK]
        nodeTypeL, _ = randomWeightedSelect(noOpsK, noOpsW, 0)
        nodeTypeR, _ = randomWeightedSelect(noOpsK, noOpsW, 0)
    else:
        nodeTypeL, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
        nodeTypeR, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
        
    # randomly generate the left node
    nodeValuL = nodeMeta[nodeTypeL][0][np.random.randint(nodeMeta[nodeTypeL][1])]
    nodeL = BuildTreeRec(Node(nodeTypeL, nodeValuL, currNode),
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setLeft(nodeL)

    # randomly generate the right node
    nodeValuR = nodeMeta[nodeTypeR][0][np.random.randint(nodeMeta[nodeTypeR][1])]
    nodeR = BuildTreeRec(Node(nodeTypeR, nodeValuR, currNode),
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setRight(nodeR)
    
    return currNode

def BuildTree(maxDepth, nodeMeta, randSeed=None):
    '''
    Using a set of types of nodes, build a genetic programming functional tree.
    :param maxDepth: integer maximum depth allowed for the tree
    :param nodeMeta: dictionary holding the a tuple of a list of the node values
        allowed, the number of node values allowed, and node weight for random
        selection; keys are node types of 'ops, 'feats', and 'consts'
    :param randSeed: optional random seed for np.random
    :return tree: the complete GP tree
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed is None:
        randSeed = int(str(time.time()).split('.')[1])
        print('Random Seed = %d'%randSeed)
    np.random.seed(randSeed)
    
    # randomly generate the root node type & value
    nodeType, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
    nodeValu = nodeMeta[nodeType][0][np.random.randint(nodeMeta[nodeType][1])]
    
    # build the tree
    rootNode = BuildTreeRec(Node(nodeType, nodeValu, None), 0, 3, nodeMeta)

    return Tree(rootNode, maxDepth), randSeed

In [5]:
def randomWeightedSelect(keys, wats, randSeed=None):
    '''
    Randomly select an item from a list, according to a set of
    specified weights.
    :param keys: array-like of items from which to select
    :param wats: array-like of weights associated with the input
        keys; must be sorted in descending weight
    :param randSeed: optional random seed for np.random; if no
        randomization is desired, pass 0
    :return selection: selected item
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed != 0:
        if randSeed is None:
            randSeed = int(str(time.time()).split('.')[1])
        np.random.seed(randSeed)
    
    # ensure weights sum to 1
    totWats = sum(wats)
    if totWats != 1:
        wats = [v/totWats for v in wats]
    
    # get the cumulative weights
    cumWats = np.cumsum(wats)
    # get the indices of where the random [0,1] is < the cum weight
    rnd = np.random.rand()
    seld = rnd < cumWats
    
    return [k for (k,s) in zip(keys, seld) if s][0], randSeed

In [140]:
def sum(a, b):
    return a+b
def sub(a, b):
    return a-b
def mul(a, b):
    return a*b
def div(a, b):
    if b != 0:
        res = a/b
    else:
        res = np.nan
    return res
def pow(a, b):
    return a**b

## The Works

In [130]:
# set the possible node values
ops = ['sum', 'sub', 'mul', 'div', 'pow', 'max', 'min']
feats = ['x%d'%i for i in range(5)]
consts = list(range(1, 11))

# must be orderd by descending weight - [values, length, weight] 
nodeMeta = OrderedDict()
nodeMeta['ops'] = [ops, len(ops), 1/3]
nodeMeta['feats'] = [feats, len(feats), 1/3]
nodeMeta['consts'] = [consts, len(consts), 1/3]

In [134]:
''' randomly generate some trees '''
# set the depth
maxDepth = 10

# build the tree, starting from the top node
treeCnt = 10
trees = [None]*treeCnt
for t in range(treeCnt):
    time.sleep(np.random.rand()) # setting a random wait time to allow seed differentiation
    trees[t], prngs = BuildTree(maxDepth, nodeMeta, None)
    print(trees[t])

Random Seed = 7440412
0: 9
Random Seed = 6926198
0: mul
1: sum|x4
2: x3|9
Random Seed = 6146874
0: x1
Random Seed = 96236
0: min
1: x2|max
2: 9|pow
3: 8|10
Random Seed = 6972764
0: 2
Random Seed = 1203244
0: x4
Random Seed = 6443644
0: 5
Random Seed = 378418
0: sub
1: pow|1
2: min|max
3: x2|max|mul|7
4: 10|3|10|x2
Random Seed = 882455
0: max
1: x0|x1
Random Seed = 5340269
0: sub
1: 7|sub
2: div|9
3: min|max
4: x3|x2|6|5
