# Genetic Programming for Feature Engineering
- <a href=#sample>Apply some Sample Trees to a Dataframe</a>
- <a href=#test>Apply the GP to a Test dataset</a>

<a id=top></a>

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time
import copy
import re
import warnings
import sys

import chart_studio.plotly as ply
import chart_studio.tools as plytool
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as plyoff
import plotly.subplots as plysub

# do not truncate the column list when a dataframe is displayed
pd.set_option('display.max_columns', None)

In [None]:
# my imports
# add the project's source directory to the module search path; the path is
# relative, so it depends on the directory the notebook is run from
sys.path.append('../src/')
# NOTE(review): the star imports below presumably supply the project names used
# later in this notebook (BuildTree, TreesCrossover, TreeMutate, Node, Tree,
# RunGP, ResultsPlots and the objective function) - confirm which module
# defines each one
from util.Utils import *
from GP.GP import *
from GP.FunctionTree import *
from GP.Objective import *

## Apply some Sample Trees to a Dataframe
<a id=sample></a>
<a href=#top>Go to Top</a>

In [None]:
# candidate values for every kind of tree node
ops = ['ad', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
feats = ['X%d'%num for num in range(5)]
consts = [0, 1, 2, 3, 10, 100]

# node metadata keyed by node type; each entry is [values, length, weight]
# and the entries must be inserted in order of descending weight
nodeMeta = OrderedDict(
    (kind, [values, len(values), weight])
    for (kind, values, weight) in (('op', ops, 0.5), ('feat', feats, 0.25), ('const', consts, 0.25))
)

In [None]:
''' randomly generate some trees '''
# seed the prng with the digits after the decimal point of the current clock time
randSeed = int(str(time.time()).split('.')[1])
print('Random Seed = %d'%randSeed)
np.random.seed(randSeed)

# depth limit handed to the tree builder
maxDepth = 10

# grow the requested number of trees, collecting each one as soon as it is built
treeCnt = 20
trees = []
for indx in range(treeCnt):
    print('Creating tree %0d'%indx)
    # random pause (under one second) - the original intent was to allow seed differentiation
    time.sleep(np.random.rand())
    # NOTE(review): the meaning of the trailing True flag is defined by BuildTree - confirm
    newTree = BuildTree(maxDepth, nodeMeta, True)
    trees.append(newTree)
    print(newTree)

In [None]:
''' try some GP operations '''
# crossover 2 pairs of parents; whatever each crossover returns is added to the end of the list
for (firstIdx, secondIdx) in ((8, 14), (15, 16)):
    trees.extend(TreesCrossover(trees[firstIdx], trees[secondIdx], True))

# mutate 2 trees; each returned result is added to the end of the list
for srcIdx in (18, 19):
    trees.append(TreeMutate(trees[srcIdx], maxDepth, nodeMeta, True))

In [None]:
# show two of the original trees followed by two trees taken from near the end of the list,
# with a dashed rule between consecutive printouts
# NOTE(review): the crossover cell pairs trees 15 & 16, yet index 13 is printed here, and the
# -4/-3 positions only line up with the second crossover if each crossover returns two trees - confirm
shownIdxs = (13, 16, -4, -3)
for position, treeIdx in enumerate(shownIdxs):
    if position > 0:
        print('--------')
    print(trees[treeIdx])

In [None]:
# build a dataframe of uniform random numbers with one column per feature name
n = 1000
p = len(feats)
data = pd.DataFrame(np.random.rand(n, p), columns=feats)
display(data.head())

In [None]:
# evaluate every tree against the dataframe, storing each result in its own column
for indx, curTree in enumerate(trees):
    print('Processing tree %0d'%indx)
    func = curTree.GenFunction()
    # the generated function is an expression string; prefixing each 'X' with 'data.' makes the
    # feature names resolve to dataframe columns before the string is evaluated
    # NOTE: this rewrites every capital 'X' in the string, so it assumes 'X' occurs only in feature names
    data['tree%0d'%indx] = eval(func.replace('X', 'data.X'))
# talk
display(data.head())

## Apply the GP to a Test dataset
<a id=test></a>
<a href=#top>Go to Top</a>

In [None]:
''' generate some data '''
# fixed seed so the simulated features are the same on every run
np.random.seed(42)
p = 5
n = 100

# generate the features & build the dataframe
# n rows by p columns of normal draws (mean 10, standard deviation 1), columns named X0..X{p-1}
X = np.random.normal(loc=10, scale=1, size=(n,p))
feats = ['X%d'%i for i in range(p)]
data = pd.DataFrame(data=X, columns=feats)

# build the actual tree
# structure assembled below:
#   root 'dv'
#     left  'sb'
#       left  feature X0
#       right 'ml'
#         left  constant '10'
#         right feature X1
#     right feature X2
# NOTE(review): presumably this encodes (X0 - 10*X1)/X2, assuming dv/sb/ml mean
# divide/subtract/multiply - confirm against the Node/Tree implementation
# NOTE(review): the third Node argument looks like the parent node - confirm
actTreeRoot = Node('op', 'dv', None)
actTreeRoot.setLeft(Node('op', 'sb', actTreeRoot))
actTreeRoot.setRight(Node('feat', 'X2', actTreeRoot))
# children are read back from the parent before being extended
lft = actTreeRoot.left
lft.setLeft(Node('feat', 'X0', lft))
lft.setRight(Node('op', 'ml', lft))
rgt = lft.right
rgt.setLeft(Node('const', '10', rgt))
rgt.setRight(Node('feat', 'X1', rgt))
# NOTE(review): the second Tree argument (4) is presumably the tree depth - confirm
actTree = Tree(actTreeRoot, 4)

# print the tree
# take the tree's function string and prefix every feature name with 'data.' so that
# it can be evaluated against the dataframe columns
actFunc = actTree.function
for feat in feats:
    actFunc = actFunc.replace(feat, 'data.'+feat)
print(actTree)
print(actFunc)
    
# generate the response
# the target is the evaluated function string; the commented-out term would add gaussian noise
data['target'] = eval(actFunc)# +  np.random.normal(loc=0, scale=0.5, size=(n,))

# talk
display(data.head())

In [None]:
''' prepare GP input parameters '''
# GP parameters (interpreted by RunGP)
parmsGP = {
    'showTopSubs':10,
    'populSize':500,
    'numGens':200,
    'noChangeTerm':180,
    'convgCrit':0.00001,
    'elitism':True,
    'mateType':1,
    'probXover':0.8,
    'probMutate':0.3,
    # the overall-best selection later multiplies the scores by this value before
    # taking the argmax, so -1 favours the lowest score
    'optimGoal':-1,
    'plotFlag':True,
    'printFreq':10,
    'maxDepth':4,
    'probPrune':0.4,
    'probSimp':0.2,
}

# data parameters: the dataframe plus a display name built from the true function
parmsData = {
    'data':data,
    'name':'Simulated: %s'%actFunc,
}

# objective parameters: the objective function's name and its keyword arguments;
# 'data' and 'tree' start out as None placeholders
parmsObj = {
    'function':'TreeRegressionMetric',
    'arguments':{
        'data':None,
        'tree':None,
        'feats':feats,
        'metric':'RMSE',
        'optimGoal':parmsGP['optimGoal'],
    },
}

# candidate values for the operator and constant nodes (features reuse feats)
ops = ['ad', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
consts = [0, 1, 2, 3, 10, 100]

# node metadata keyed by node type; each entry is [values, length, weight]
# and the entries must be inserted in order of descending weight
nodeMeta = OrderedDict(
    (kind, [values, len(values), weight])
    for (kind, values, weight) in (('op', ops, 0.5), ('feat', feats, 0.25), ('const', consts, 0.25))
)

In [None]:
# run the GP several times - hold on to your butts
MSims = 5
verb = False
randSeed = None  # 42

# one result slot per simulation for each of the seven values RunGP returns
bestTrees, bestScores, genBestss, genScoress, randSeeds, timeStamps, figGPProgresss = \
    ([None]*MSims for _ in range(7))

# winners carried over from earlier runs, with their function strings for duplicate checks
seedTrees, seedFuncs = [], []

# ignore all warnings - may be a very bad idea
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for sim in range(MSims):
        print('Executing GP %d of %d'%(sim+1, MSims))
        gpOut = RunGP(parmsGP, parmsData, parmsObj, nodeMeta, seedTrees, verb, randSeed)
        (bestTrees[sim], bestScores[sim], genBestss[sim], genScoress[sim],
         randSeeds[sim], timeStamps[sim], figGPProgresss[sim]) = gpOut
        # seed the following runs with this winner, unless its function has been kept already
        bstFunc = bestTrees[sim].function
        if bstFunc not in seedFuncs:
            seedTrees.append(bestTrees[sim])
            seedFuncs.append(bstFunc)

# get the overall best: multiplying by the optimisation goal turns the choice into a maximisation
signedScores = parmsGP['optimGoal']*np.array(bestScores)
bestIndx = np.argmax(signedScores)
bestScore = bestScores[bestIndx]
bestTree = bestTrees[bestIndx]
timeStamp = timeStamps[bestIndx]

In [None]:
''' see the actual target-generating function & tree '''
# show the function string first, then the tree it was taken from
for shownItem in (actFunc, actTree):
    print(shownItem)

In [None]:
''' evaluate the tree predictions '''
# the tree under evaluation is the overall GP winner
tree = bestTree

# score it: fill in the objective's arguments and call the objective function by name
# (objArgs is the same dict object as parmsObj['arguments'], so these assignments update it too)
objFunc = parmsObj['function']
objArgs = parmsObj['arguments']
objArgs['data'] = data
objArgs['tree'] = tree.function
objArgs['feats'] = feats

# label for the output file: objective name plus every argument other than data/tree/feats
skipKeys = ['data', 'tree', 'feats']
argTags = ['%s%r'%(key, val) for (key, val) in objArgs.items() if key not in skipKeys]
objStr = '%s_%s'%(objFunc, '_'.join(argTags).replace("'",''))

scorer = globals()[objFunc]
score, preds, linreg = scorer(**objArgs)
print(tree)
print('Score = %0.3f'%score)

# create the tree function: prefix every feature name so it refers to a dataframe column
treeFunc = tree.function
for feat in feats:
    treeFunc = treeFunc.replace(feat, 'data.'+feat)

# add the tree results & compute error (target minus prediction)
data['tree'] = eval(treeFunc)
data['error'] = data['target'] - data['tree']

# talk
display(data.head())

# plot, then write the figure to an html file named after the timestamp, true function and objective
plotTitle = 'GP Performance: %s = %0.3f'%(tree.function, score)
figGPPerformance = ResultsPlots(data, sequenceCol=None, responseCol='target', predCol='tree',
                                resdCol='error', colorCol=None, overall_title=plotTitle,
                                plot_colors=('red',)*4)
outFile = '../output/GPPerformance_%s_%s_%s.html'%(timeStamp, re.sub('[^0-9A-Za-z_]', '_', actFunc), objStr)
plyoff.plot(figGPPerformance, filename=outFile, auto_open=True, include_mathjax='cdn')

In [None]:
''' evaluate the actual tree predictions '''
# the tree under evaluation is the hand-built, target-generating tree
tree = actTree

# score it: reuse the objective's argument dict, replacing only the data and tree entries
# (objArgs is the same dict object as parmsObj['arguments'], so these assignments update it too)
objFunc = parmsObj['function']
objArgs = parmsObj['arguments']
objArgs['data'] = data
objArgs['tree'] = tree.function

scorer = globals()[objFunc]
score, preds, linreg = scorer(**objArgs)
print(tree)
print('Score = %0.3f'%score)

# create the tree function: prefix every feature name so it refers to a dataframe column
treeFunc = tree.function
for feat in feats:
    treeFunc = treeFunc.replace(feat, 'data.'+feat)

# add the tree results & compute error (target minus prediction)
data['actTree'] = eval(treeFunc)
data['actError'] = data['target'] - data['actTree']

# talk
display(data.head())

# plot, then write the figure to an html file named after the true function and the objective
plotTitle = 'Actual Tree Performance: %s = %0.3f'%(tree.function, score)
figActPerformance = ResultsPlots(data, sequenceCol=None, responseCol='target', predCol='actTree',
                                 resdCol='actError', colorCol=None, overall_title=plotTitle,
                                 plot_colors=('green',)*4)
outFile = '../output/ActPerformance_%s_%s.html'%(re.sub('[^0-9A-Za-z_]', '_', actFunc), parmsObj['function'])
plyoff.plot(figActPerformance, filename=outFile, auto_open=True, include_mathjax='cdn')

In [None]:
# see some final data results
# shows the first rows of the dataframe, including any columns added by the cells that were run above
display(data.head())

<a href=#top>Go to Top</a>