# Use the GP to Predict Team Season Results Using Player Statistics

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time
import copy
import re
import warnings
import sys
import sqlite3
from sqlite3 import Error
from sqlalchemy import create_engine

import chart_studio.plotly as ply
import chart_studio.tools as plytool
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as plyoff
import plotly.subplots as plysub

# show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# my imports
# make the project's source directory importable (path is relative to the working directory)
sys.path.append('../src/')
# NOTE(review): names used later in this notebook that are not defined here (RunGP,
# ResultsPlots, and the objective function looked up through globals()) presumably come
# from these wildcard imports - which module provides which is not visible from this file
from util.Utils import *
from GP.GP import *
from GP.FunctionTree import *
from GP.Objective import *

## Load and Prep Data

In [None]:
# load the data
# Reads the whole NLALRegularSeasonTeamStatsRanks table from the SQLite file into `data`.
dbFile = '../data/baseballdata.db'
sql = 'SELECT * FROM NLALRegularSeasonTeamStatsRanks;'
# open
dbConn = sqlite3.connect(dbFile)
try:
    # query
    data = pd.read_sql(sql, dbConn)
finally:
    # close - done in a finally so the connection is released even if the query fails
    dbConn.close()
display(data.head())

In [None]:
# final data prep
# drop a few columns
data.drop(['_G', '_L'], axis='columns', inplace=True)
# encode division & league win columns as binary: True where the flag is 'Y', else False
for flagCol in ('_DivWin', '_LgWin'):
    data[flagCol] = (data[flagCol] == 'Y').to_numpy()

In [None]:
# Pitcher wins & losses are removed for obvious reasons; they can be added back
# when year t stats are used to predict year t+1 performance.
data.drop(['P_Win', 'P_Loss'], axis=1, inplace=True)

In [None]:
# define column roles
# identifier columns
# NOTE(review): '_difID' looks like a typo for '_divID' - confirm against the table schema
colID = ['_yearID', '_lgID', '_difID', '_teamID', 'Team']
# all potentially useful targets; '_DivWin' must match the capitalization of the column
# that is encoded during data prep, otherwise selecting it raises a KeyError
colResp = ['_Rank', '_DivWin', '_LgWin', 'WinPerc', 'WinLosPerc']
# predictors are the columns whose names carry an 'F_', 'B_' or 'P_' prefix
colPred = [col for col in data.columns if col.startswith(('F_', 'B_', 'P_'))]

## Run the GP

In [None]:
# set the target column
# show the candidates, then read a list index (0-based) from the user
print(colResp)
response = colResp[int(input('Enter the index of the potential response column to use: '))]
print('Response = %s'%response)
# copy the chosen response into a fixed 'target' column used by the later cells
data['target'] = data[response]

In [None]:
# prepare GP input parameters

# GP parameters
parmsGP = dict(
    showTopSubs=10, populSize=500, numGens=200, noChangeTerm=180, convgCrit=0.00001,
    elitism=True, mateType=1, probXover=0.8, probMutate=0.3, optimGoal=-1,
    plotFlag=True, printFreq=10, maxDepth=4, probPrune=0.4, probSimp=0.2,
)

# data parameters: the frame itself plus a label that records the chosen response
parmsData = dict(data=data, name='NLALRegularSeasonTeamStatsRanks(%s)'%response)

# objective parameters: the objective is named by string; 'data' and 'tree' start as None
parmsObj = {
    'function': 'TreeRegressionMetric',
    'arguments': dict(data=None, tree=None, feats=colPred, metric='RMSE',
                      optimGoal=parmsGP['optimGoal']),
}

# set the possible node values
ops = ['ad', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
consts = [0, 1, 2, 3, 10, 100]
# must be ordered by descending weight - each entry is [values, length, weight]
nodeMeta = OrderedDict(
    (kind, [values, len(values), weight])
    for kind, values, weight in (('op', ops, 0.5), ('feat', colPred, 0.25), ('const', consts, 0.25))
)

In [None]:
# run the GP - hold on to your butts
randSeed = 2990386  # alternatives tried: None, 42
verb = False
MSims = 5

# init: one result slot per simulation for each of the seven values RunGP returns
bestTrees, bestScores, genBestss, genScoress, randSeeds, timeStamps, figGPProgresss = \
    ([None]*MSims for _ in range(7))
# best trees found so far (fed into the next run) and their function strings (for de-duplication)
seedTrees = []
seedFuncs = []

# ignore all warnings - may be a very bad idea
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for sim in range(MSims):
        print('Executing GP %d of %d'%(sim+1, MSims))
        results = RunGP(parmsGP, parmsData, parmsObj, nodeMeta, seedTrees, verb, randSeed)
        (bestTrees[sim], bestScores[sim], genBestss[sim], genScoress[sim],
         randSeeds[sim], timeStamps[sim], figGPProgresss[sim]) = results
        # add the best tree to seed the next GP run, but only if its function is new
        bstFunc = bestTrees[sim].function
        if bstFunc not in seedFuncs:
            seedTrees.append(bestTrees[sim])
            seedFuncs.append(bstFunc)

# get the overall best: scores are multiplied by optimGoal before taking the argmax
signedScores = np.array(bestScores)*parmsGP['optimGoal']
bestIndx = np.argmax(signedScores)
bestScore, bestTree, timeStamp = bestScores[bestIndx], bestTrees[bestIndx], timeStamps[bestIndx]

In [None]:
# evaluate the tree predictions
# choose the tree
tree = bestTree
# score it
objFunc = parmsObj['function']
# this is the same dict object stored in parmsObj, so the assignments below update parmsObj too
objArgs = parmsObj['arguments']
objArgs['data'] = data
objArgs['tree'] = tree.function
objArgs['feats'] = colPred
# label built from the remaining (scalar) objective arguments; used in the output filename
objStr = '%s_%s'%(objFunc, ('_'.join(['%s%r'%(key, val) for (key, val) in objArgs.items()
                                      if key not in ['data', 'tree', 'feats']])).replace("'",''))
# the objective function is looked up by name among the notebook's globals
# NOTE(review): preds and linreg are not used below - confirm whether the plot should use preds
score, preds, linreg = globals()[objFunc](**objArgs)
print(tree)
print('Score = %0.3f'%score)

# create the tree function: point every feature name at its column in `data`.
# This is done in ONE regex pass on whole tokens. Sequential str.replace calls corrupt the
# expression when one feature name is contained in another (e.g. 'B_H' inside 'B_HR'),
# because an already-qualified name gets qualified a second time.
treeFunc = tree.function
if colPred:
    # longest names first so the alternation prefers the most specific match
    featPattern = re.compile(r'(?<![\w.])(?:%s)(?!\w)'
                             % '|'.join(re.escape(feat) for feat in sorted(colPred, key=len, reverse=True)))
    treeFunc = featPattern.sub(lambda match: 'data[%r]'%match.group(0), treeFunc)

# add the tree results & compute error
# NOTE: eval executes the expression string produced by the GP; this is only acceptable
# because the string is generated by this notebook - never feed it externally supplied text.
# Any operator names in the expression must resolve in the notebook's namespace.
data['tree'] = eval(treeFunc)
data['error'] = data['target'] - data['tree']

# talk
display(data.head())

# plot
figGPPerformance = ResultsPlots(data, sequenceCol='_yearID', responseCol='target',
                                predCol='tree', resdCol='error', colorCol=None,
                                overall_title='GP Performance: %s = %0.3f'%(tree.function, score), plot_colors=('red',)*4)
plyoff.plot(figGPPerformance, filename='../output/GPPerformance_%s_%s_%s.html'\
            %(parmsData['name'], timeStamp, objStr), auto_open=True, include_mathjax='cdn')