# Genetic Programming for Feature Engineering
- TODO: build mutation and crossover in trees

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time

## Class and Function Definitions

In [2]:
# a node encapsulates an operation / fatures / constant; it remembers it's parent, and knows it's children
class Node(object):
    def __init__(self, typ, val, par):
        self.type = typ
        self.value = val
        self.parent = par
        # init the string and left & right to nothing
        self.Str = '%s(%s)'%(self.type, self.value)
        self.left = None
        self.right = None
        self.leftStr = '_'
        self.rightStr = '_'
        
    def __str__(self):
        return '(%s) -> [%s, %s]'%(self.Str, self.leftStr, self.rightStr)
    
    def setLeft(self, L):
        self.left = L
        self.leftStr = '%s(%s)'%(self.left.type, self.left.value)
        
    def setRight(self, R):
        self.right = R
        self.rightStr = '%s(%s)'%(self.right.type, self.right.value)

In [3]:
# a tree represents the entire function; it knows the root, level-ordered structure, and depth
class Tree(object):
    def __init__(self, root, maxDepth):
        self.root = root
        self.depth = maxDepth # init with just the max depth allowed for now
        self.struct = None
        # build the structure dict
        self.GenStruct()
        
    def __str__(self):
        # first build the struct dict, if necessary
        if self.struct is None:
            self.GenStruct()
        # now print
        return '\n'.join(['%d: %s'%(key, '|'.join([str(node.value) for node in val])) for (key, val) in self.struct.items()])

    @staticmethod
    def __RecTreeStruct(currNode, tree, currKey):
        '''
        Recursive tree structuring function; only to be called by TreeStruct
        '''
        # save the node
        this = tree[currKey].copy()
        this.append(currNode)
        tree[currKey] = this
        if (currNode.left is None) & (currNode.right is None):
            return tree

        if currNode.left is not None:
            tree = Tree.__RecTreeStruct(currNode.left, tree, currKey+1)
        if currNode.right is not None:
            tree = Tree.__RecTreeStruct(currNode.right, tree, currKey+1)
        return tree

    def GenStruct(self):
        '''
        Return the function tree structure as a dictionary.
        :return tree: level number-keyed ordered dict of the tree
        '''
        # populate the tree view dict
        self.struct = dict.fromkeys(range(self.depth), [])
        self.struct = Tree.__RecTreeStruct(self.root, self.struct, 0)
        # prune it now (remove unused rows)
        for key in list(self.struct.keys()):
            if self.struct[key] == []:
                self.struct.pop(key)
        
        # set the depth
        self.depth = max(self.struct.keys())
        return self.struct

    def GenFunction(self):
        '''
        Returns a string representation of the function tree as
        a function.
        :return function: the string of the function
        '''
        
        funcStrings = {}

        # special handling of const or feat root nodes
        if self.root.type != 'ops':
            funcStrings[self.root] = str(self.root.value)
        else:
            # start at the top & climb down the tree
            for currLev in range(self.depth, 0, -1):
                nodes = self.struct[currLev]
                # parse the nodes at this level and iterate in pairs
                for indx in range(0, len(nodes), 2):
                    # if there's a func string already defined, use them
                    lVal = funcStrings.get(nodes[indx], str(nodes[indx].value))
                    rVal = funcStrings.get(nodes[indx+1], str(nodes[indx+1].value))
                    # build and store the function string
                    funcStrings[nodes[indx].parent] = nodes[indx].parent.value + '(' + lVal + ',' + rVal + ')'
                    
        return funcStrings[self.root]    

In [4]:
def BuildTreeRec(currNode, currDepth, maxDepth, nodeMeta):
    '''
    Recursive tree building function; only to be called by BuildTree
    '''

    # exit if too deep or at a leaf
    if (currDepth > maxDepth) or (currNode.type != 'ops'):
        return currNode    
    # hit max depth, so ensure only consts or feats selected
    if currDepth == maxDepth:
        noOpsK = [k for k in nodeMeta.keys() if k != 'ops']
        noOpsW = [nodeMeta[t][2] for t in noOpsK]
        nodeTypeL, _ = randomWeightedSelect(noOpsK, noOpsW, 0)
        nodeTypeR, _ = randomWeightedSelect(noOpsK, noOpsW, 0)
    else:
        nodeTypeL, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
        nodeTypeR, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
        
    # randomly generate the left node
    nodeValuL = nodeMeta[nodeTypeL][0][np.random.randint(nodeMeta[nodeTypeL][1])]
    nodeL = BuildTreeRec(Node(nodeTypeL, nodeValuL, currNode),
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setLeft(nodeL)

    # randomly generate the right node
    nodeValuR = nodeMeta[nodeTypeR][0][np.random.randint(nodeMeta[nodeTypeR][1])]
    nodeR = BuildTreeRec(Node(nodeTypeR, nodeValuR, currNode),
                                 currDepth+1, maxDepth, nodeMeta)
    currNode.setRight(nodeR)
    
    return currNode

def BuildTree(maxDepth, nodeMeta, randSeed=None):
    '''
    Using a set of types of nodes, build a genetic programming functional tree.
    :param maxDepth: integer maximum depth allowed for the tree
    :param nodeMeta: dictionary holding the a tuple of a list of the node values
        allowed, the number of node values allowed, and node weight for random
        selection; keys are node types of 'ops, 'feats', and 'consts'
    :param randSeed: optional random seed for np.random
    :return tree: the complete GP tree
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed is None:
        randSeed = int(str(time.time()).split('.')[1])
        print('Random Seed = %d'%randSeed)
    np.random.seed(randSeed)
    
    # randomly generate the root node type & value
    nodeType, _ = randomWeightedSelect(nodeMeta.keys(), [v[2] for v in nodeMeta.values()], 0)
    nodeValu = nodeMeta[nodeType][0][np.random.randint(nodeMeta[nodeType][1])]
    
    # build the tree
    rootNode = BuildTreeRec(Node(nodeType, nodeValu, None), 0, 3, nodeMeta)

    return Tree(rootNode, maxDepth), randSeed

In [5]:
def randomWeightedSelect(keys, wats, randSeed=None):
    '''
    Randomly select an item from a list, according to a set of
    specified weights.
    :param keys: array-like of items from which to select
    :param wats: array-like of weights associated with the input
        keys; must be sorted in descending weight
    :param randSeed: optional random seed for np.random; if no
        randomization is desired, pass 0
    :return selection: selected item
    :return randSeed: random seed used
    '''
    
    # ranodmize, perhaps
    if randSeed != 0:
        if randSeed is None:
            randSeed = int(str(time.time()).split('.')[1])
        np.random.seed(randSeed)
    
    # ensure weights sum to 1
    totWats = sum(wats)
    if totWats != 1:
        wats = [v/totWats for v in wats]
    
    # get the cumulative weights
    cumWats = np.cumsum(wats)
    # get the indices of where the random [0,1] is < the cum weight
    rnd = np.random.rand()
    seld = rnd < cumWats
    
    return [k for (k,s) in zip(keys, seld) if s][0], randSeed

In [51]:
''' binary arithmetic operations that can be called as functions '''
# summation
def sm(a, b):
    return a+b

# subtraction
def sb(a, b):
    return a-b

# multiplication
def ml(a, b):
    return np.nan_to_num(a * b, posinf=np.nan)

# division
def dv(a, b):
    # check for longest dimensional match
    try:
        lna = len(a)
    except TypeError:
        lna = 1
    try:
        lnb = len(b)
    except TypeError:
        lnb = 1
    # compute
    if (lna == lnb) & (lna == 1):
        # both scalars
        if b != 0:
            res = a/b
        else:
            res = np.nan
    else:
        # at least 1 iterable
        if lna < lnb:
            # a is scalar, b is not
            a = [a]*lnb
        elif lnb < lna:
            # b is scalar, a is not
            b = [b]*lna
        res = np.nan_to_num(a / b, posinf=np.nan)
    return res

# power
def pw(a, b):
    return np.nan_to_num(a ** b, posinf=np.nan)

# minimum
def mn(a, b):
    # check for longest dimensional match
    try:
        lna = len(a)
    except TypeError:
        lna = 1
    try:
        lnb = len(b)
    except TypeError:
        lnb = 1
    # compute
    if (lna == lnb) & (lna == 1):
        # both scalars
        res = min(a, b)
    elif lna == lnb:
        # both iterables
        res = np.where(a < b, a, b)
    elif lna < lnb:
        # a is scalar, b is not
        tmp = [a]*lnb
        res = np.where(tmp < b, tmp, b)
    elif lnb < lna:
        # b is scalar, a is not
        tmp = [b]*lna
        res = np.where(a < tmp, a, tmp)
    return res

# maximum
def mx(a, b):
    # check for longest dimensional match
    try:
        lna = len(a)
    except TypeError:
        lna = 1
    try:
        lnb = len(b)
    except TypeError:
        lnb = 1
    # compute
    if (lna == lnb) & (lna == 1):
        # both scalars
        res = max(a, b)
    elif lna == lnb:
        # both iterables
        res = np.where(a > b, a, b)
    elif lna < lnb:
        # a is scalar, b is not
        tmp = [a]*lnb
        res = np.where(tmp > b, tmp, b)
    elif lnb < lna:
        # b is scalar, a is not
        tmp = [b]*lna
        res = np.where(a > tmp, a, tmp)
    return res

## Build Trees

In [52]:
# set the possible node values
ops = ['sm', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
feats = ['X%d'%i for i in range(5)]
consts = list(range(1, 11))

# must be orderd by descending weight - [values, length, weight] 
nodeMeta = OrderedDict()
nodeMeta['ops'] = [ops, len(ops), 0.5]
nodeMeta['feats'] = [feats, len(feats), 0.25]
nodeMeta['consts'] = [consts, len(consts), 0.25]

In [59]:
''' randomly generate some trees '''
# set the depth
maxDepth = 10

# build the tree, starting from the top node
treeCnt = 20
trees = [None]*treeCnt
for indx in range(treeCnt):
    print('Creating tree %0d'%indx)
    time.sleep(np.random.rand()) # setting a random wait time to allow seed differentiation
    trees[indx], prngs = BuildTree(maxDepth, nodeMeta, None)
    print(trees[indx])

Creating tree 0
Random Seed = 141022
0: mx
1: X0|X2
Creating tree 1
Random Seed = 861056
0: 10
Creating tree 2
Random Seed = 7261512
0: dv
1: ml|6
2: X4|9
Creating tree 3
Random Seed = 1621847
0: X4
Creating tree 4
Random Seed = 1162584
0: 5
Creating tree 5
Random Seed = 7052982
0: X2
Creating tree 6
Random Seed = 2193546
0: mn
1: X3|pw
2: mn|5
3: sb|X4
4: 3|X1
Creating tree 7
Random Seed = 75417
0: 9
Creating tree 8
Random Seed = 7564678
0: 2
Creating tree 9
Random Seed = 3770256
0: 6
Creating tree 10
Random Seed = 520756
0: sm
1: 2|mx
2: X3|dv
3: ml|6
4: X4|X4
Creating tree 11
Random Seed = 6591203
0: 10
Creating tree 12
Random Seed = 473688
0: mx
1: mx|pw
2: dv|2|8|dv
3: pw|5|dv|mn
4: X0|8|X4|X0|X1|X3
Creating tree 13
Random Seed = 942489
0: dv
1: 5|10
Creating tree 14
Random Seed = 94499
0: sb
1: X2|dv
2: sb|sm
3: 6|X2|X1|X1
Creating tree 15
Random Seed = 6505418
0: 6
Creating tree 16
Random Seed = 858554
0: 6
Creating tree 17
Random Seed = 6686153
0: sb
1: ml|X1
2: pw|3
3: 9|X1
Cr

## Apply to a Dataframe

In [60]:
# generate some data
p = len(feats)
n = 1000
data = pd.DataFrame(data=np.random.rand(n, p), columns=feats)
display(data.head())

Unnamed: 0,X0,X1,X2,X3,X4
0,0.406056,0.645021,0.470934,0.59068,0.38181
1,0.859213,0.813436,0.0563,0.208929,0.508588
2,0.838423,0.062791,0.25153,0.164746,0.187578
3,0.417126,0.500608,0.739036,0.925935,0.915562
4,0.194754,0.733608,0.046353,0.217484,0.30477


In [61]:
# now apply all trees
for indx in range(len(trees)):
    print('Processing tree %0d'%indx)
    func = trees[indx].GenFunction()
    data['tree%0d'%indx] = eval(func.replace('X', 'data.X'))
# talk
display(data.head())

Processing tree 0
Processing tree 1
Processing tree 2
Processing tree 3
Processing tree 4
Processing tree 5
Processing tree 6
Processing tree 7
Processing tree 8
Processing tree 9
Processing tree 10
Processing tree 11
Processing tree 12
Processing tree 13
Processing tree 14
Processing tree 15
Processing tree 16
Processing tree 17
Processing tree 18
Processing tree 19




Unnamed: 0,X0,X1,X2,X3,X4,tree0,tree1,tree2,tree3,tree4,...,tree10,tree11,tree12,tree13,tree14,tree15,tree16,tree17,tree18,tree19
0,0.406056,0.645021,0.470934,0.59068,0.38181,0.470934,10,0.572716,0.38181,5,...,2.59068,10,27.39104,0.5,-3.815022,6,6,11.73234,10,7
1,0.859213,0.813436,0.0563,0.208929,0.508588,0.859213,10,0.762882,0.508588,5,...,2.208929,10,361.8868,0.5,-3.597151,6,6,17.106515,10,7
2,0.838423,0.062791,0.25153,0.164746,0.187578,0.838423,10,0.281368,0.187578,5,...,2.164746,10,1651.022,0.5,-45.523048,6,6,3.381019,10,7
3,0.417126,0.500608,0.739036,0.925935,0.915562,0.739036,10,1.373344,0.915562,5,...,2.925935,10,9112.197,0.5,-4.515539,6,6,8.511423,10,7
4,0.194754,0.733608,0.046353,0.217484,0.30477,0.194754,10,0.457156,0.30477,5,...,2.217484,10,3148842.0,0.5,-4.011433,6,6,14.303388,10,7


In [None]:
+