In [None]:
import datetime as dt
import re
import sys
import time

import ipdb
import numpy as np
import pandas as pd

import chart_studio.plotly as ply
import chart_studio.tools as plytool
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as plyoff
import plotly.subplots as plysub

# lift the pandas display caps so rendered frames show every column and every row
for dispOpt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(dispOpt, None)

In [None]:
# my imports
sys.path.append('../src/')
from GA.GA import *
from GA.Objective import *
from Utils.Utils import *

In [None]:
''' generate some data '''
np.random.seed(42)
p = 20
n = 100

# draw the features first and the noise second, so the seeded random stream
# is consumed in a fixed order
X = np.random.rand(n, p)
noise = np.random.normal(loc=0, scale=1, size=n)

# only the first two features contribute to the target
y = X[:, 0]*5 + X[:, 1]*5 + noise
simName = '5X0+5X1'

# (p, 1) integer column with a 1 for each feature that truly drives the target
simSubs = np.zeros(shape=(p,1), dtype=int)
simSubs[[0, 1], 0] = 1

# assemble the dataframe: target in the first column, then one column per feature
feats = ['X%d'%indx for indx in range(p)]
data = pd.DataFrame(data=X, columns=feats)
data.insert(0, 'target', y)

# talk
display(data.head())

In [None]:
# review the features correlations
featCorr = data[feats].corr()
# NOTE(review): the meaning of trcLims / tweaks is defined by correlationsPlot, which
# is not visible here - confirm against its documentation before changing them
corrPlotArgs = {'plotTitl':'Feature Correlations Plot',
                'trcLims':(0.0, 0.75, 0.9, 0.95, 1.0),
                'tweaks':(20, None, None, 1.05)}
figCorr = correlationsPlot(featCorr, **corrPlotArgs)
plyoff.plot(figCorr, auto_open=True, include_mathjax='cdn')

In [None]:
''' prepare GA input parameters '''
# GA parameters (key order kept as originally written)
parmsGA = dict(
    initPerc=0.5,
    forceVars=None,
    showTopSubs=10,
    populSize=50,
    numGens=100,
    noChangeTerm=180,
    convgCrit=0.00001,
    elitism=True,
    mateType=1,
    probXover=0.8,
    probMutate=0.3,
    probEngineer=0.2,
    optimGoal=-1,
    plotFlag=True,
    printFreq=10,
    xoverType=1,
)

# data parameters
parmsData = dict(data=data, name=simName)

# objective parameters; 'data' and 'subset' start empty and are filled in later cells,
# and the objective shares the GA's optimGoal
parmsObj = dict(
    function='RegressionMetric',
    arguments=dict(data=None, subset=None, metric='RMSE', optimGoal=parmsGA['optimGoal']),
)

In [None]:
''' run the GA - hold on to your butts '''
# run settings
randSeed = 42
verb = False
MSims = 1

# one slot per simulation for each of the seven values RunGASubset returns
bestSubss, bestScores, genBestss, genScoress = ([None]*MSims for _ in range(4))
randSeeds, timeStamps, figGPProgresss = ([None]*MSims for _ in range(3))
seedSubs = []

for sim in range(MSims):
    print('Executing GA %d of %d'%(sim+1, MSims))
    runResult = RunGASubset(parmsGA, parmsData, parmsObj, seedSubs, verb, randSeed)
    (bestSubss[sim], bestScores[sim], genBestss[sim], genScoress[sim],
     randSeeds[sim], timeStamps[sim], figGPProgresss[sim]) = runResult

    # seed the next GA run with this run's best subset, unless it is already a seed
    # NOTE(review): if the subsets are numpy arrays, the equality test inside index()
    # may itself raise ValueError - confirm the de-duplication behaves as intended
    winner = bestSubss[sim]
    try:
        seedSubs.index(winner)
    except ValueError:
        # this best is new, so add
        seedSubs.append(winner)

# get the overall best: multiplying by optimGoal lets argmax pick the winner
signedScores = parmsGA['optimGoal']*np.array(bestScores)
bestIndx = np.argmax(signedScores)
bestScore, bestSubs, timeStamp = bestScores[bestIndx], bestSubss[bestIndx], timeStamps[bestIndx]

In [None]:
# set some objective stuff for the plots
parmsObj['arguments']['data'] = data

# describe the objective as function(arg=value, ...), leaving out the bulky arguments
shownArgs = ['%s=%r'%(argName, argVal)
             for (argName, argVal) in parmsObj['arguments'].items()
             if argName not in ['data', 'subset']]
objStr = '%s(%s)'%(parmsObj['function'], ', '.join(shownArgs))

# replace everything except letters, digits and underscores so it can go in a file name
objStr = re.sub('[^0-9A-Za-z_]', '_', objStr)

In [None]:
''' evaluate the best subset '''
# the subset's label; it is also used as the prediction column name below
name = BinaryStr(bestSubs)

# show the selected columns
keep = [feat for flag, feat in zip(bestSubs, feats) if flag]
print('Best Subset Columns: %r'%keep)

# look up the objective function by name and evaluate it on the best subset
parmsObj['arguments']['subset'] = bestSubs
objFunc = globals()[parmsObj['function']]
_, preds, estim = objFunc(**parmsObj['arguments'])

# store the predictions and the residuals (target minus prediction)
data[name] = preds
data['G_error'] = data['target'] - data[name]

# talk
display(data.head())

# plot
gaTitle = 'GA Performance: %s = %0.4f'%(name, bestScore)
figGAPerformance = ResultsPlots(data, sequenceCol=None, responseCol='target',
                                predCol=name, resdCol='G_error', colorCol=None,
                                overall_title=gaTitle, plot_colors=('blue',)*4)
gaFile = '../output/GAPerformance_%s_%s_%s.html'\
         %(timeStamp, re.sub('[^0-9A-Za-z_]', '_', simName), objStr)
plyoff.plot(figGAPerformance, filename=gaFile, auto_open=True, include_mathjax='cdn')

In [None]:
''' evaluate the full subset '''
# subset with every feature switched on
fullSubs = np.ones(shape=(p,1))
name = BinaryStr(fullSubs)

# look up the objective function by name and evaluate it on the full subset
parmsObj['arguments']['subset'] = fullSubs
objFunc = globals()[parmsObj['function']]
fullScore, preds, estim = objFunc(**parmsObj['arguments'])

# store the predictions and the residuals (target minus prediction)
data['full'] = preds
data['F_error'] = data['target'] - data['full']

# talk
display(data.head())

# plot
fullTitle = 'Full Model = %0.4f'%fullScore
figFull = ResultsPlots(data, sequenceCol=None, responseCol='target', predCol='full',
                       resdCol='F_error', colorCol=None, overall_title=fullTitle,
                       plot_colors=('red',)*4)
fullFile = '../output/FullModel_%s_%s.html'\
           %(re.sub('[^0-9A-Za-z_]', '_', simName), objStr)
plyoff.plot(figFull, filename=fullFile, auto_open=True, include_mathjax='cdn')

In [None]:
''' evaluate the true subset '''
# subset
name = BinaryStr(simSubs)

# get the predictions & model
# the subset has to be set inside 'arguments': only that dict is unpacked into the
# objective call, so a top-level parmsObj['subset'] would be ignored and the subset
# left over from the previous cell would be evaluated instead
parmsObj['arguments']['subset'] = simSubs
simScore, preds, estim = globals()[parmsObj['function']](**parmsObj['arguments'])

# add the subset results & compute error against the true-subset predictions
data['True'] = preds
data['T_error'] = data['target'] - data['True']

# talk
display(data.head())

# plot, titled with the true subset's own score
figTrue = ResultsPlots(data, sequenceCol=None, responseCol='target', predCol='True',
                       resdCol='T_error', colorCol=None, overall_title='True Model = %0.4f'%simScore,
                       plot_colors=('green',)*4)
plyoff.plot(figTrue, filename='../output/TrueModel_%s_%s.html'\
            %(re.sub('[^0-9A-Za-z_]', '_', simName), objStr), auto_open=True, include_mathjax='cdn')