# Genetic Programming for Feature Engineering
- TODO: build mutation and crossover in trees

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time
from itertools import chain
import copy

## Class and Function Definitions

In [2]:
# a node encapsulates an operation / feature / constant; it remembers its parent and knows its children
class Node(object):
    '''
    One vertex of a function tree: an operation, a feature or a constant.

    Attributes set here: type, value, parent, left, right, plus the display
    strings Str, leftStr and rightStr that __str__ prints.
    '''

    @staticmethod
    def _label(kind, payload):
        # display form shared by the node itself and by its children
        return str(kind) + '(' + str(payload) + ')'

    def __init__(self, typ, val, par):
        self.type, self.value, self.parent = typ, val, par
        self.Str = Node._label(typ, val)
        # a new node has no children; '_' is the placeholder shown for each
        self.left = self.right = None
        self.leftStr = self.rightStr = '_'

    def __str__(self):
        return '({}) -> [{}, {}]'.format(self.Str, self.leftStr, self.rightStr)

    def setLeft(self, L):
        '''Attach L as the left child and refresh the left display string.'''
        self.left = L
        self.leftStr = Node._label(L.type, L.value)

    def setRight(self, R):
        '''Attach R as the right child and refresh the right display string.'''
        self.right = R
        self.rightStr = Node._label(R.type, R.value)

In [3]:
# a tree represents the entire function; it knows the root, level-ordered structure, and depth
class Tree(object):
    '''
    The whole function expressed as a binary tree of nodes.

    Attributes:
        root: the top node of the tree
        struct: dict mapping level number (0 = root) to the list of nodes on
            that level, in left-to-right order
        depth: number of levels in the tree, recomputed by GenStruct
    '''
    def __init__(self, root, maxDepth):
        self.root = root
        # placeholder only: GenStruct below overwrites this with the actual depth
        self.depth = maxDepth
        self.struct = None
        # build the structure dict
        self.GenStruct()

    def __str__(self):
        # first build the struct dict, if necessary
        if self.struct is None:
            self.GenStruct()
        # one line per level: "<level>: <value>|<value>|..."
        return '\n'.join(['%d: %s'%(key, '|'.join([str(node.value) for node in val])) for (key, val) in self.struct.items()])

    @staticmethod
    def __RecTreeStruct(currNode, tree, currKey):
        '''
        Recursive tree structuring function; only to be called by GenStruct.
        :param currNode: node being visited
        :param tree: level-keyed dict being filled in place
        :param currKey: level number of currNode
        :return tree: the same dict, for convenience
        '''
        # each level gets its own list, created the first time the level is reached;
        # the pre-order walk reaches level k before level k+1, so keys stay ascending
        tree.setdefault(currKey, []).append(currNode)
        if currNode.left is not None:
            Tree.__RecTreeStruct(currNode.left, tree, currKey+1)
        if currNode.right is not None:
            Tree.__RecTreeStruct(currNode.right, tree, currKey+1)
        return tree

    def GenStruct(self):
        '''
        Rebuild the level-ordered view of the tree and refresh the depth.
        :return tree: level number-keyed dict of the tree
        '''
        # start from an empty dict so there is no fixed cap on the number of
        # levels and nothing to prune afterwards
        self.struct = Tree.__RecTreeStruct(self.root, {}, 0)

        # set the depth (levels are numbered from 0)
        self.depth = max(self.struct.keys())+1
        return self.struct

    def GenFunction(self):
        '''
        Returns a string representation of the function tree as
        a function, e.g. 'sm(ml(X1,2),X0)'.
        :return function: the string of the function
        :raises ValueError: if the root is an operator without operands
        '''

        # special handling of const or feat root nodes
        if self.root.type != 'ops':
            return str(self.root.value)

        funcStrings = {}
        # work from the deepest parents up to the root, so every child's
        # string already exists when its parent is assembled; the deepest
        # level (depth-1) holds only leaves, so start one level above it
        for currLev in range(self.depth-2, -1, -1):
            for node in self.struct[currLev]:
                if (node.left is None) and (node.right is None):
                    # leaf: rendered by its parent from its value
                    continue
                # if there's a func string already defined, use it
                lVal = funcStrings.get(node.left, str(node.left.value))
                rVal = funcStrings.get(node.right, str(node.right.value))
                # build and store the function string
                funcStrings[node] = node.value + '(' + lVal + ',' + rVal + ')'

        if self.root not in funcStrings:
            raise ValueError('operator root node has no operands')
        return funcStrings[self.root]

In [4]:
def BuildTreeRec(currNode, currDepth, maxDepth, nodeMeta):
    '''
    Recursive tree building function; only to be called by BuildTree.
    Grows two random children under currNode and recurses into each.
    :param currNode: node to grow children under
    :param currDepth: depth of currNode
    :param maxDepth: depth at which growth stops
    :param nodeMeta: node type -> [values, number of values, weight]
    :return currNode: the same node, with its subtree attached
    '''

    # leaves never get children, and neither does anything at the depth limit
    if (currNode.type != 'ops') or (currDepth == maxDepth):
        return currNode

    if currDepth == (maxDepth-1):
        # the children land on the last level, so operators are not allowed
        candKeys = [k for k in nodeMeta.keys() if k != 'ops']
        candWats = [nodeMeta[k][2] for k in candKeys]
    else:
        candKeys = nodeMeta.keys()
        candWats = [meta[2] for meta in nodeMeta.values()]

    # both child types are drawn before any value, which fixes the order in
    # which random numbers are consumed
    typL = randomWeightedSelect(candKeys, candWats, 0)[0]
    typR = randomWeightedSelect(candKeys, candWats, 0)[0]

    # left child: pick a value, grow its subtree, attach it
    valL = nodeMeta[typL][0][np.random.randint(nodeMeta[typL][1])]
    currNode.setLeft(BuildTreeRec(Node(typL, valL, currNode),
                                  currDepth+1, maxDepth, nodeMeta))

    # right child, same procedure
    valR = nodeMeta[typR][0][np.random.randint(nodeMeta[typR][1])]
    currNode.setRight(BuildTreeRec(Node(typR, valR, currNode),
                                   currDepth+1, maxDepth, nodeMeta))

    return currNode

def BuildTree(maxDepth, nodeMeta, randSeed=None):
    '''
    Using a set of types of nodes, build a functional tree.
    :param maxDepth: integer maximum depth allowed for the tree (including the root)
    :param nodeMeta: dictionary holding a list of the node values allowed,
        the number of node values allowed, and the node weight for random
        selection; keys are node types of 'ops', 'feats', and 'consts'
    :param randSeed: optional random seed for np.random; when omitted, a seed
        is derived from the clock and printed
    :return tree: the complete functional tree
    :return randSeed: random seed used
    '''

    # randomize, perhaps
    if randSeed is None:
        # the fractional digits of the current time serve as the seed
        randSeed = int(str(time.time()).split('.')[1])
        print('Random Seed = %d'%randSeed)
    np.random.seed(randSeed)

    # randomly generate the root node type & value
    if maxDepth <= 1:
        # the root is also the last level, so it must be a leaf: an operator
        # here would have no room for operands
        rootKeys = [k for k in nodeMeta.keys() if k != 'ops']
        rootWats = [nodeMeta[k][2] for k in rootKeys]
    else:
        rootKeys = nodeMeta.keys()
        rootWats = [v[2] for v in nodeMeta.values()]
    nodeType, _ = randomWeightedSelect(rootKeys, rootWats, 0)
    nodeValu = nodeMeta[nodeType][0][np.random.randint(nodeMeta[nodeType][1])]

    # build the tree; the root sits at depth 0, so the deepest allowed level is maxDepth-1
    rootNode = BuildTreeRec(Node(nodeType, nodeValu, None), 0, maxDepth-1, nodeMeta)

    return Tree(rootNode, maxDepth), randSeed

In [5]:
def randomWeightedSelect(keys, wats, randSeed=None):
    '''
    Randomly select an item from a list, according to a set of
    specified weights.
    :param keys: iterable of items from which to select
    :param wats: array-like of non-negative weights, one per key, in the
        same order as keys; they are normalized if they do not sum to 1
    :param randSeed: optional random seed for np.random; pass 0 to leave
        the generator state untouched; when None, a seed is derived from
        the clock
    :return selection: selected item
    :return randSeed: random seed used
    :raises IndexError: if there is nothing to select from
    '''

    # randomize, perhaps
    if randSeed != 0:
        if randSeed is None:
            randSeed = int(str(time.time()).split('.')[1])
        np.random.seed(randSeed)

    # keys may be a dict view, which cannot be indexed
    keys = list(keys)

    # ensure weights sum to 1
    totWats = sum(wats)
    if totWats != 1:
        wats = [v/totWats for v in wats]

    # get the cumulative weights
    cumWats = np.cumsum(wats)
    if len(cumWats) == 0:
        raise IndexError('cannot select from an empty set of weights')

    # the selection is the first key whose cumulative weight exceeds the draw
    rnd = np.random.rand()
    seld = rnd < cumWats
    if seld.any():
        indx = int(np.argmax(seld))
    else:
        # rounding can leave the final cumulative weight a hair under 1; a
        # draw above it belongs to the last key rather than to no key
        indx = len(cumWats) - 1

    return keys[indx], randSeed

In [43]:
def TreeCrossOver(this, that, verbose=False):
    '''
    Cross two trees at a node selected at random in each; the subtrees
    rooted at those nodes trade places. The inputs are not modified.
    :param this: first tree to cross
    :param that: second tree to cross
    :param verbose: optional (default=False) flag to print the crossover nodes
    :return this_new: new crossed-over tree
    :return that_new: new crossed-over tree
    '''

    # first create copies of the input trees
    thisC = copy.deepcopy(this)
    thatC = copy.deepcopy(that)

    # get the random crossover points
    thisXoverNode = np.random.permutation(list(chain.from_iterable(thisC.struct.values())))[0]
    if verbose:
        print('This crossover point: %s '%thisXoverNode)
    thatXoverNode = np.random.permutation(list(chain.from_iterable(thatC.struct.values())))[0]
    if verbose:
        print('That crossover point: %s '%thatXoverNode)

    # remember both parents before anything is rewired; a root has no parent
    thisParent = thisXoverNode.parent
    thatParent = thatXoverNode.parent

    # reassign the children; going through setLeft / setRight keeps each
    # parent's printed child labels in step with its actual children
    if thisParent is not None:
        if thisParent.right is thisXoverNode:
            thisParent.setRight(thatXoverNode)
        elif thisParent.left is thisXoverNode:
            thisParent.setLeft(thatXoverNode)
    if thatParent is not None:
        if thatParent.right is thatXoverNode:
            thatParent.setRight(thisXoverNode)
        elif thatParent.left is thatXoverNode:
            thatParent.setLeft(thisXoverNode)

    # reassign the parents
    thisXoverNode.parent, thatXoverNode.parent = thatParent, thisParent

    # if either is an orphan from having been crossed with a root, make it a new tree;
    # otherwise, just rebuild the structure (the depth passed to Tree is only a
    # placeholder, the constructor recomputes it)
    if thisXoverNode.parent is None:
        thatC = Tree(thisXoverNode, thisC.depth + thatC.depth)
    else:
        thatC.GenStruct()
    if thatXoverNode.parent is None:
        thisC = Tree(thatXoverNode, thisC.depth + thatC.depth)
    else:
        thisC.GenStruct()

    return thisC, thatC

In [6]:
''' binary arithmetic operations that can be called as functions '''
# summation
def sm(a, b):
    '''Return a + b.'''
    total = a + b
    return total

# subtraction
def sb(a, b):
    '''Return a - b.'''
    difference = a - b
    return difference

# multiplication
def ml(a, b):
    '''
    Return a * b passed through np.nan_to_num with posinf=np.nan, so a
    product of +inf comes back as NaN (nan_to_num's defaults apply to NaN
    and -inf).
    '''
    product = a * b
    return np.nan_to_num(product, posinf=np.nan)

# division
def dv(a, b):
    '''
    Protected division a / b: dividing by zero gives NaN instead of an
    error, an infinity or a huge finite number.
    :param a: numerator; scalar or array-like
    :param b: denominator; scalar or array-like
    :return res: a scalar when both inputs have length 1 (or no length),
        otherwise an ndarray of the broadcast shape
    '''
    # anything without a len() counts as a scalar of length 1
    try:
        lna = len(a)
    except TypeError:
        lna = 1
    try:
        lnb = len(b)
    except TypeError:
        lnb = 1

    if (lna == 1) and (lnb == 1):
        # both scalars
        if b != 0:
            res = a/b
        else:
            res = np.nan
        return res

    # at least 1 iterable; broadcasting pairs a scalar with every element
    num = np.asarray(a)
    den = np.asarray(b)
    # zero divisors are handled explicitly below, so silence their warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        quot = np.true_divide(num, den)
    res = np.nan_to_num(quot, posinf=np.nan)
    # agree with the scalar branch: any division by zero is NaN, whatever the
    # sign of the numerator (nan_to_num alone leaves -x/0 and 0/0 finite)
    return np.where(den == 0, np.nan, res)

# power
def pw(a, b):
    '''
    Return a ** b passed through np.nan_to_num with posinf=np.nan, so a
    result of +inf comes back as NaN (nan_to_num's defaults apply to NaN
    and -inf).
    '''
    raised = a ** b
    return np.nan_to_num(raised, posinf=np.nan)

# minimum
def mn(a, b):
    '''
    Minimum of a and b. Two scalars go through the builtin min; otherwise
    the result is an elementwise np.where, with a scalar operand repeated
    to the length of the other operand.
    '''
    def _span(obj):
        # anything without a len() counts as a scalar of length 1
        try:
            return len(obj)
        except TypeError:
            return 1

    sizeA, sizeB = _span(a), _span(b)

    # both scalars
    if sizeA == 1 and sizeB == 1:
        return min(a, b)

    # stretch the shorter operand so both sides line up
    if sizeA < sizeB:
        a = [a]*sizeB
    elif sizeB < sizeA:
        b = [b]*sizeA

    return np.where(a < b, a, b)

# maximum
def mx(a, b):
    '''
    Maximum of a and b. Two scalars go through the builtin max; otherwise
    the result is an elementwise np.where, with a scalar operand repeated
    to the length of the other operand.
    '''
    def _span(obj):
        # anything without a len() counts as a scalar of length 1
        try:
            return len(obj)
        except TypeError:
            return 1

    sizeA, sizeB = _span(a), _span(b)

    # both scalars
    if sizeA == 1 and sizeB == 1:
        return max(a, b)

    # stretch the shorter operand so both sides line up
    if sizeA < sizeB:
        a = [a]*sizeB
    elif sizeB < sizeA:
        b = [b]*sizeA

    return np.where(a > b, a, b)

## Build Trees

In [7]:
# candidate values for each kind of node
ops = ['sm', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
feats = ['X%d'%i for i in range(5)]
consts = [c for c in range(1, 11)]

# one entry per node type, inserted in descending-weight order;
# each entry is [values, length, weight]
nodeMeta = OrderedDict(
    (name, [vals, len(vals), wat])
    for (name, vals, wat) in (('ops', ops, 0.5),
                              ('feats', feats, 0.25),
                              ('consts', consts, 0.25))
)

In [None]:
''' randomly generate some trees '''
# depth budget handed to every BuildTree call
maxDepth = 10

# number of trees to grow; the slots are allocated up front and filled in order
treeCnt = 20
trees = [None for _ in range(treeCnt)]
for indx in range(treeCnt):
    print('Creating tree %0d'%indx)
    # BuildTree derives its seed from the clock when none is given, so a
    # random pause keeps consecutive seeds apart
    time.sleep(np.random.rand())
    # prngs holds the seed of the most recently built tree only
    built = BuildTree(maxDepth, nodeMeta, None)
    trees[indx], prngs = built
    print(trees[indx])

## Apply to a Dataframe

In [None]:
# synthetic data set: n rows of uniform random values, one column per feature name
n = 1000
p = len(feats)
data = pd.DataFrame(np.random.rand(n, p), columns=feats)
display(data.head())

In [None]:
# evaluate every tree against the data, one new column per tree
for indx, tree in enumerate(trees):
    print('Processing tree %0d'%indx)
    func = tree.GenFunction()
    # feature names in the function string become column lookups on the
    # frame; the operator names resolve to the functions defined above
    data['tree%0d'%indx] = eval(func.replace('X', 'data.X'))
# show the first rows, including the new columns
display(data.head())