# Genetic Programming for Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict

## Class and Function Definitions

In [34]:
# a node encapsulates an operation / fatures / constant; it remembers it's parent, and knows it's children
class Node(object):
    def __init__(self, typ, val, par):
        self.type = typ
        self.value = val
        self.parent = par
        # init the string and left & right to nothing
        self.Str = '%s(%s)'%(self.type, self.value)
        self.left = None
        self.right = None
        self.leftStr = '_'
        self.rightStr = '_'
        
    def __str__(self):
        return '(%s) -> [%s, %s]'%(self.Str, self.leftStr, self.rightStr)
    
    def setLeft(self, L):
        self.left = L
        self.leftStr = '%s(%s)'%(self.left.type, self.left.value)
        
    def setRight(self, R):
        self.right = R
        self.rightStr = '%s(%s)'%(self.right.type, self.right.value)

In [None]:
# a tree represents the entire function; it knows the root and final leaves
class Tree(object):
    sef __init__(self, root, leaves):
        

In [106]:
def BuildTree(currNode, leaves, currDepth, maxDepth, nodes, nodeLens):
    '''
    Using a set of types of nodes, build a genetic programming functional tree.
    :param currNode: Node object representing the root of the tree
    :param leaves: level number-keyed ordered dict of the tree's leaves
    :param currDepth: integer number (default=0) of the current node depth
    :param maxDepth: integer maximum depth allowed for the tree
    :param nodes: dictionary holding the node values allowed, with node type
        keys of 'ops, 'feats', and 'consts'
    :param nodeLens: dictionary holding the number of node values by type,
        with node type keys of 'ops, 'feats', and 'consts'
    :return currNode: root node of finished tree
    '''
    # exit if too deep or at a leaf
    if (currDepth > maxDepth) or (currNode.type != 'ops'):
        if currNode.type != 'ops':
            this = leaves[currDepth]
            this.append(currNode)
            leaves[currDepth] = this
        return currNode, leaves    
    # hit max depth, so ensure only consts or feats selected
    if currDepth == maxDepth:
        nt = [t for t in nodes.keys() if t != 'ops']
        nodeTypeL = nt[np.random.randint(len(nt))]
        nodeTypeR = nt[np.random.randint(len(nt))]
    else:
        nodeTypeL = nodeTypes[np.random.randint(len(nodes.keys()))]
        nodeTypeR = nodeTypes[np.random.randint(len(nodes.keys()))]
        
    # randomly generate the left node
    nodeValuL = nodes[nodeTypeL][np.random.randint(nodeLens[nodeTypeL])]
    nodeL, leaves = BuildTree(Node(nodeTypeL, nodeValuL, currNode), leaves, currDepth+1, maxDepth, nodes, nodeLens)
    currNode.setLeft(nodeL)

    # randomly generate the right node
    nodeValuR = nodes[nodeTypeR][np.random.randint(nodeLens[nodeTypeR])]
    nodeR, leaves = BuildTree(Node(nodeTypeR, nodeValuR, currNode), leaves, currDepth+1, maxDepth, nodes, nodeLens)
    currNode.setRight(nodeR)
    
    return currNode, leaves

In [107]:
nw = dt.datetime.now()
randSeed = nw.second*10000 + nw.minute*100 + nw.hour
print('Random seed = %d'%randSeed)
nodeType = nodeTypes[np.random.randint(3)]
nodeValu = nodes[nodeType][np.random.randint(nodeLens[nodeType])]

d = OrderedDict.fromkeys(range(maxDepth), [])
t, d = BuildTree(Node(nodeType, nodeValu, None), d, 0, 3, nodes, nodeLens)
print(PrintTree(t, maxDepth))
print(d)

Random seed = 572323
OrderedDict([(0, 'ops(\\)'), (1, 'feats(x1)|consts(5)')])
OrderedDict([(0, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (1, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (2, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (3, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (4, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (5, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (6, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (7, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (8, [<__main__.Node object at 0x00000215FA97F208>, <__main__.Node object at 0x00000215FA8A99C8>]), (9, [<__main__.No

# WORKING ON TRACKING LEAVES WHILE CREATING THE TREE; THEN NEXT NEED TO FINISH DEFINING TREE OBJECT

In [108]:
d

OrderedDict([(0,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (1,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (2,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (3,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (4,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (5,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (6,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (7,
              [<__main__.Node at 0x215fa97f208>,
               <__main__.Node at 0x215fa8a99c8>]),
             (8,
              [<__main__.Node at 0x215fa97f208>

In [42]:
def RecPrintTree(currNode, tree, currKey):
    '''
    Recursive tree printing function; only to be called by PrintTree
    '''
    tree[currKey] += currNode.Str+'|'
    if (currNode.left is None) & (currNode.right is None):
        return tree
    
    if currNode.left is not None:
        tree = RecPrintTree(currNode.left, tree, currKey+1)
    if currNode.right is not None:
        tree = RecPrintTree(currNode.right, tree, currKey+1)
    return tree

def PrintTree(rootNode, maxDepth):
    '''
    Return a printable genetic programming function tree.
    :param rootNode: root of the tree
    :param maxDepth: the maximum possible depth of the tree
    :return tree: level number-keyed ordered dict of the tree
    '''
    # populate the tree view dict
    tree = OrderedDict.fromkeys(range(maxDepth), '')
    tree = RecPrintTree(rootNode, tree, 0)
    # unpopulate it now
    for key in list(tree.keys()):
        if tree[key] == '':
            tree.pop(key)
        else:
            tree[key] = tree[key][:-1]
    
    return tree

## The Works

In [5]:
# set the possible node values
ops = ['+', '-', '*', '\\', '^', 'max', 'min']
feats = ['x%d'%i for i in range(5)]
consts = list(range(1, 11))

nodes = {'ops':ops, 'feats':feats, 'consts':consts}
nodeLens = {'ops':len(ops), 'feats':len(feats), 'consts':len(consts)}
nodeTypes = list(nodes.keys())

In [18]:
''' randomly generate some trees '''
# set seed
nw = dt.datetime.now()
randSeed = 42#nw.hour*10000 + nw.minute*100 + nw.second
print('Random seed = %d'%randSeed)
np.random.seed(randSeed)

# set the depth
maxDepth = 10
depth = np.random.randint(low=1, high=maxDepth+1)

# build the tree, starting from the top node
treeCnt = 10
trees = [None]*treeCnt
for t in range(treeCnt):
    nodeType = nodeTypes[np.random.randint(3)]
    nodeValu = nodes[nodeType][np.random.randint(nodeLens[nodeType])]
    trees[t] = BuildTree(Node(nodeType, nodeValu, None), 0, 3, nodes, nodeLens)
    print(PrintTree(trees[t], maxDepth))

Random seed = 204835
OrderedDict([(0, 'consts(7)')])
OrderedDict([(0, 'consts(5)')])
OrderedDict([(0, 'consts(2)')])
OrderedDict([(0, 'ops(*)'), (1, 'ops(^)|consts(5)'), (2, 'consts(3)|feats(x0)')])
OrderedDict([(0, 'feats(x2)')])
OrderedDict([(0, 'feats(x4)')])
OrderedDict([(0, 'consts(7)')])
OrderedDict([(0, 'feats(x2)')])
OrderedDict([(0, 'feats(x4)')])
OrderedDict([(0, 'feats(x2)')])
