# Genetic Programming for Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time

## Class and Function Definitions

In [2]:
# a node encapsulates an operation / fatures / constant; it remembers it's parent, and knows it's children
class Node(object):
    def __init__(self, typ, val, par):
        self.type = typ
        self.value = val
        self.parent = par
        # init the string and left & right to nothing
        self.Str = '%s(%s)'%(self.type, self.value)
        self.left = None
        self.right = None
        self.leftStr = '_'
        self.rightStr = '_'
        
    def __str__(self):
        return '(%s) -> [%s, %s]'%(self.Str, self.leftStr, self.rightStr)
    
    def setLeft(self, L):
        self.left = L
        self.leftStr = '%s(%s)'%(self.left.type, self.left.value)
        
    def setRight(self, R):
        self.right = R
        self.rightStr = '%s(%s)'%(self.right.type, self.right.value)

In [3]:
# a tree represents the entire function; it knows the depth, root, final leaves, and level-ordered structure
class Tree(object):
    def __init__(self, root, maxDepth, leaves):
        self.root = root
        self.leaves = leaves
        self.depth = maxDepth # init with just the max depth allowed for now
        self.struct = None
        # build the structure dict
        self.TreeStruct()
        
    def __str__(self):
        # first build the struct dict, if necessary
        if self.struct is None:
            self.TreeStruct()
        # now print
        return str(self.struct).replace('), ', '\n')[13:-2]

    @staticmethod
    def __RecTreeStruct(currNode, tree, currKey):
        '''
        Recursive tree structuring function; only to be called by TreeStruct
        '''
        tree[currKey] += currNode.Str+'|'
        if (currNode.left is None) & (currNode.right is None):
            return tree

        if currNode.left is not None:
            tree = Tree.__RecTreeStruct(currNode.left, tree, currKey+1)
        if currNode.right is not None:
            tree = Tree.__RecTreeStruct(currNode.right, tree, currKey+1)
        return tree

    def TreeStruct(self):
        '''
        Return a genetic programming function tree structure as a dictionary.
        :return tree: level number-keyed ordered dict of the tree
        '''
        # populate the tree view dict
        self.struct = OrderedDict.fromkeys(range(self.depth), '')
        self.struct = Tree.__RecTreeStruct(self.root, self.struct, 0)
        # unpopulate it now, and remove the trailing pipes
        for key in list(self.struct.keys()):
            if self.struct[key] == '':
                self.struct.pop(key)
            else:
                self.struct[key] = self.struct[key][:-1]
        
        # set the depth
        self.depth = max(self.struct.keys())
        return self.struct

In [99]:
def BuildTreeRec(currNode, leaves, currDepth, maxDepth, nodeMeta):
    '''
    Recursive tree building function; only to be called by BuildTree
    '''
    # exit if too deep or at a leaf
    if (currDepth > maxDepth) or (currNode.type != 'ops'):
        # but first add current node to leaves if it is
        if currNode.type != 'ops':
            this = leaves[currDepth].copy()
            this.append(currNode)
            leaves[currDepth] = this
        return currNode, leaves    
    # hit max depth, so ensure only consts or feats selected
    if currDepth == maxDepth:
        noOpsK = [k for k in nodeMeta.keys() if k != 'ops']
        noOpsW = [nodeMeta[t][2] for t in noOpsK]
        
        nodeTypeL, _ = randomWeightedSelect(noOpsK, noOpsW)
        nodeTypeR, _ = randomWeightedSelect(noOpsK, noOpsW)
    else:
        nodeTypeL, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()])
        nodeTypeR, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()])
        
    # randomly generate the left node
    nodeValuL = nodeMeta[nodeTypeL][0][np.random.randint(nodeMeta[nodeTypeL][1])]
    nodeL, leaves = BuildTreeRec(Node(nodeTypeL, nodeValuL, currNode), leaves,
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setLeft(nodeL)

    # randomly generate the right node
    nodeValuR = nodeMeta[nodeTypeR][0][np.random.randint(nodeMeta[nodeTypeR][1])]
    nodeR, leaves = BuildTreeRec(Node(nodeTypeR, nodeValuR, currNode), leaves,
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setRight(nodeR)
    
    return currNode, leaves

def BuildTree(maxDepth, nodeMeta, randSeed=None):
    '''
    Using a set of types of nodes, build a genetic programming functional tree.
    :param maxDepth: integer maximum depth allowed for the tree
    :param nodeMeta: dictionary holding the a tuple of a list of the node values
        allowed, the number of node values allowed, and node weight for random
        selection; keys are node types of 'ops, 'feats', and 'consts'
    :param randSeed: optional random seed for np.random
    :return tree: the complete GP tree
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed is None:
        randSeed = int(str(time.time()).split('.')[1])
        print('Random Seed = %d'%randSeed)
    np.random.seed(randSeed)
    
    # randomly generate the root node type & value
    nodeType, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()])
    nodeValu = nodeMeta[nodeType][0][np.random.randint(nodeMeta[nodeType][1])]
    
    # build the tree
    leaves = OrderedDict.fromkeys(range(maxDepth), [])
    rootNode, leaves = BuildTreeRec(Node(nodeType, nodeValu, None), leaves, 0, 3, nodeMeta)
    
    return Tree(rootNode, maxDepth, leaves), randSeed

In [104]:
def randomWeightedSelect(keys, wats, randSeed=None):
    '''
    Randomly select an item from a list, according to a set of
    specified weights.
    :param keys: array-like of items from which to select
    :param wats: array-like of weights associated with the input
        keys; must be sorted in descending weight
    :param randSeed: optional random seed for np.random
    :return selection: selected item
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed is None:
        randSeed = int(str(time.time()).split('.')[1])
    np.random.seed(randSeed)
    
    # ensure weights sum to 1
    totWats = sum(wats)
    if totWats != 1:
        wats = [v/totWats for v in wats]
    
    # get the cumulative weights
    cumWats = np.cumsum(wats)
    # get the indices of where the random [0,1] is < the cum weight
    rnd = np.random.rand()
    seld = rnd < cumWats
    
    return [k for (k,s) in zip(keys, seld) if s][0], randSeed

## The Works

In [23]:
# set the possible node values
ops = ['+', '-', '*', '\\', '^', 'max', 'min']
feats = ['x%d'%i for i in range(5)]
consts = list(range(1, 11))

# must be orderd by descending weight - [values, length, weight] 
nodeMeta = OrderedDict()
nodeMeta['ops'] = [ops, len(ops), 1/3]
nodeMeta['feats'] = [feats, len(feats), 1/3]
nodeMeta['consts'] = [consts, len(consts), 1/3]

In [107]:
''' randomly generate some trees '''
# set the depth
maxDepth = 10

# build the tree, starting from the top node
treeCnt = 10
trees = [None]*treeCnt
for t in range(treeCnt):
    time.sleep(np.random.rand())
    trees[t], prngs = BuildTree(maxDepth, nodeMeta, None)
    print(trees[t])

Random Seed = 9640234
(0, 'feats(x3)')
Random Seed = 7129908
(0, 'ops(max)'
(1, 'feats(x0)|feats(x3)')
Random Seed = 881054
(0, 'consts(10)')
Random Seed = 182565
(0, 'feats(x4)')
Random Seed = 4121256
(0, 'feats(x3)')
Random Seed = 70445
(0, 'consts(1)')
Random Seed = 4214015
(0, 'feats(x4)')
Random Seed = 3213494
(0, 'consts(6)')
Random Seed = 1445422
(0, 'ops(-)'
(1, 'ops(-)|ops(^)'
(2, 'ops(-)|ops(^)|ops(-)|ops(^)'
(3, 'ops(-)|ops(^)|ops(-)|ops(^)|ops(-)|ops(^)|ops(-)|ops(^)'
(4, 'feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)|feats(x1)|feats(x4)')
Random Seed = 4355378
(0, 'feats(x2)')


# SOMETHING HAS CLEARLY GOTTEN F'D UP HERE; NEED TO FIX