# Use Genetic Programming (GP) to Predict Team Season Results Using Player Statistics

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import ipdb
from collections import OrderedDict
import time
import copy
import re
import warnings
import sys
import sqlite3
from sqlite3 import Error
from sqlalchemy import create_engine

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

import chart_studio.plotly as ply
import chart_studio.tools as plytool
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as plyoff
import plotly.subplots as plysub

pd.set_option('display.max_columns', None)

In [None]:
# my imports
#sys.path.append('../src/')
from util.Utils import *
from GP.GP import *
from GP.FunctionTree import *
from GP.Objective import *

## Load and Prep Data

In [None]:
''' load the data '''
dbFile = '../data/baseballdata.db'
sql = 'SELECT * FROM NLALRegularSeasonTeamStatsRanks;'
# open - a sqlite3 connection is not closed by its own context manager, so the
# close is done explicitly in a finally clause
dbConn = sqlite3.connect(dbFile)
try:
    # query - read the whole table into a dataframe
    data = pd.read_sql(sql, dbConn)
finally:
    # close, even if the query raised
    dbConn.close()
display(data.head())

In [None]:
''' final data prep '''
# remove a couple of columns that are not used below
data.drop(['_G', '_L'], axis=1, inplace=True)
# recode the division & league win flags as booleans: True where the value is 'Y'
for flagCol in ('_DivWin', '_LgWin'):
    data[flagCol] = np.where(data[flagCol] == 'Y', True, False)

In [None]:
# pitcher's wins & losses are removed for obvious reasons; they can be added back
# when using year t stats to predict year t+1 performance
data.drop(['P_Win', 'P_Loss'], axis=1, inplace=True)

In [1]:
# define column roles
# NOTE(review): '_difID' may be a typo for a division ID column - confirm against the
# table schema before relying on colID
colID = ['_yearID', '_lgID', '_difID', '_teamID', 'Team']
# all potentially useful targets; '_DivWin' is spelled as in the dataframe (the
# lowercase '_divWin' does not match the column recoded during data prep)
colResp = ['_Rank', '_DivWin', '_LgWin', 'WinPerc', 'WinLosPerc']
# type of each entry in colResp, by position: 'C' = discrete target, 'R' = continuous target
respTypes = ['C', 'C', 'C', 'R', 'R']
# predictors are the columns whose names start with F_, B_, or P_
colPred = [col for col in data.columns if col.startswith(('F_', 'B_', 'P_'))]

## Run the GP
### How do player stats best relate to team performance?

In [3]:
# set the target column
# show the candidates, then ask for the position of the one to use
print(colResp)
resp = int(input('Enter the index of the potential response column to use: '))
# colResp and respTypes are parallel lists, so one index selects both
response = colResp[resp]
respType = respTypes[resp]
print('Response = %s(%s)'%(response, respType))
# copy the chosen response into a fixed-name column used by the later cells
data['target'] = data[response]

['_Rank', '_divWin', '_LgWin', 'WinPerc', 'WinLosPerc']


Enter the index of the potential response column to use:  3


Reponse = WinPerc(R)


In [None]:
''' prepare GP input parameters '''
# GP parameters
parmsGP = dict(showTopSubs=10, populSize=500, numGens=200, noChangeTerm=180, convgCrit=0.00001,
               elitism=True, mateType=1, probXover=0.8, probMutate=0.3, plotFlag=True,
               printFreq=10, maxDepth=4, probPrune=0.4, probSimp=0.2)
# data parameters
parmsData = dict(data=data, name='NLALRegularSeasonTeamStatsRanks_%s'%response)

# set the possible node values
ops = ['ad', 'sb', 'ml', 'dv', 'pw', 'mx', 'mn']
consts = [0, 1, 2, 3, 10, 100]
# must be ordered by descending weight - each entry is [values, length, weight]
nodeMeta = OrderedDict((kind, [vals, len(vals), wght]) for kind, vals, wght in
                       (('op', ops, 0.5), ('feat', colPred, 0.25), ('const', consts, 0.25)))

In [None]:
''' objective parameters '''
# per response type: (estimator factory, optimization goal, objective function name, metric)
# 'R' is for a continuous target, 'C' is for a discrete target
objSetups = {'R': (lambda: LinearRegression(fit_intercept=False), -1,
                   'TreeRegressionMetric', 'RMSE'),
             'C': (lambda: DecisionTreeClassifier(max_depth=5, min_samples_leaf=20), 1,
                   'TreeClassificationMetric', 'accuracy')}
# any other response type leaves estim, parmsGP, and parmsObj untouched
if respType in objSetups:
    makeEstim, goal, objName, metricName = objSetups[respType]
    # only the estimator for the chosen type is built
    estim = makeEstim()
    parmsGP['optimGoal'] = goal
    # data and tree are placeholders here; they are filled in before scoring
    parmsObj = {'function':objName,
                'arguments':{'data':None, 'tree':None, 'estim':estim, 'feats':colPred,
                             'metric':metricName, 'optimGoal':parmsGP['optimGoal']}}

In [None]:
# run the GP - hold on to your butts
randSeed = None  # or e.g. 42
verb = False
MSims = 1

# init - one result slot per simulation, each list independent of the others
bestTrees, bestScores, genBestss, genScoress, randSeeds, timeStamps, figGPProgresss = \
    ([None]*MSims for _ in range(7))
# distinct best trees found so far, and their function strings
seedTrees = []
seedFuncs = []

# ignore all warnings - may be a very bad idea
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for sim in range(MSims):
        print('Executing GP %d of %d'%(sim+1, MSims))
        (bestTrees[sim], bestScores[sim], genBestss[sim], genScoress[sim],
         randSeeds[sim], timeStamps[sim], figGPProgresss[sim]) = RunGP(
            parmsGP, parmsData, parmsObj, nodeMeta, seedTrees, verb, randSeed)
        # add the best tree to seed the next GP run, but only if its function is new
        bstFunc = bestTrees[sim].function
        if bstFunc not in seedFuncs:
            seedTrees.append(bestTrees[sim])
            seedFuncs.append(bstFunc)

# get the overall best - scores are multiplied by optimGoal so argmax picks the
# largest score when optimGoal is 1 and the smallest when it is -1
signedScores = parmsGP['optimGoal']*np.array(bestScores)
bestIndx = np.argmax(signedScores)
bestScore = bestScores[bestIndx]
bestTree = bestTrees[bestIndx]
timeStamp = timeStamps[bestIndx]

In [None]:
''' evaluate the tree predictions '''
# choose the tree
tree = bestTree
# score it
objFunc = parmsObj['function']
objArgs = parmsObj['arguments'] # same dict object as inside parmsObj, so it is updated in place
objArgs['data'] = data
objArgs['tree'] = tree.function
objArgs['feats'] = colPred
# label used in output file names: the objective name plus its settings, leaving out
# the data, tree, and feature list
objStr = '%s_%s'%(objFunc, ('_'.join(['%s%r'%(key, val) for (key, val) in objArgs.items()
                                      if key not in ['data', 'tree', 'feats']])).replace("'",''))
# the objective function is looked up by name among the notebook globals
score, preds, objModel = globals()[objFunc](**objArgs)
print(tree)
print('Score = %0.3f'%score)

# create the tree function: point every feature name at its dataframe column.
# All names are substituted in one regex pass and only on whole-name matches, so a
# feature whose name is contained in another's (or in an already rewritten reference)
# is never replaced twice, as could happen with one str.replace call per feature.
featPatt = '|'.join(re.escape(feat) for feat in sorted(colPred, key=len, reverse=True))
if featPatt:
    treeFunc = re.sub(r'(?<!\w)(?:%s)(?!\w)'%featPatt,
                      lambda mtch: 'data[%r]'%mtch.group(0), tree.function)
else:
    # no predictor columns, so there is nothing to substitute
    treeFunc = tree.function

# add the tree results & compute error
# NOTE: eval executes the expression string as code; here it comes from the tree chosen
# above - never pass it a string from an untrusted source
data['treePred'] = eval(treeFunc)
data['treeErr'] = data['target'] - data['treePred']

# talk
display(data.head())

# plot
resPltTit = 'GP Performance: %s = %0.3f'%(tree.function, score)
figGPPerformance = ResultsPlots(data, sequenceCol='_yearID', responseCol='target',
                                predCol='treePred', resdCol='treeErr', colorCol=None,
                                overall_title=resPltTit, plot_colors=('red',)*4)
plyoff.plot(figGPPerformance, filename='../output/GPPerformance_%s_%s_%s.html'\
            %(parmsData['name'], timeStamp, objStr), auto_open=True, include_mathjax='cdn')

In [None]:
# plot correlation between target, prediction, and features
cols = [response, 'treePred'] + list(colPred)
corrMat = data[cols].corr()
figCorr = correlationsPlot(corrMat, plotTitl='Prediction & Feature Correlations Plot',
                           trcLims=(0.0, 0.5, 0.75, 0.9, 1.0), tweaks=(20, None, None, 1.1))
# write the figure to an html file and open it
corrFile = '../output/GPPredCorrMatrix_%s_%s_%s.html'%(parmsData['name'], timeStamp, objStr)
plyoff.plot(figCorr, filename=corrFile, auto_open=True, include_mathjax='cdn')

In [None]:
''' try a model '''
# setup predictor columns: the tree prediction competes with the raw features
cols = ['treePred']
cols.extend(colPred)

# use recursive feature elimination with CV to select some features
K = 5
selector = RFECV(estimator=estim, min_features_to_select=K, verbose=0, n_jobs=-1)
selector.fit(X=data[cols].values, y=data['target'].values)

# get best features - the selected features are those ranked 1
colSelPred = [c for r, c in zip(selector.ranking_, cols) if r == 1]
print('Selected Features: %s'%colSelPred)
# say "not" only when the tree prediction was eliminated
print('Tree prediction is%s in the selected features!'%('' if 'treePred' in colSelPred else ' not'))

# fit a model with best features after scaling
sdata = StandardScaler().fit_transform(data[colSelPred].values)
# NOTE(review): when estim is the LinearRegression(fit_intercept=False) set up for
# continuous targets, the zero-mean scaled features leave no term to capture the
# target's mean - confirm that fitting without an intercept is intended here
estim.fit(X=sdata, y=data['target'].values)
# predict from the same scaled features the model was fit on
data['modelPred'] = estim.predict(X=sdata)
data['modelErr'] = data['target'] - data['modelPred']

# show feature importances, using whichever attribute the fitted estimator exposes
if hasattr(estim, 'coef_'):
    featimport = pd.DataFrame(index=colSelPred, data=estim.coef_, columns=['Coefficient'])
    # order by magnitude, then drop the helper column
    featimport['absval'] = featimport['Coefficient'].abs()
    featimport = featimport.sort_values(by='absval', ascending=False, inplace=False).drop(columns='absval', inplace=False)
    display(featimport)
elif hasattr(estim, 'feature_importances_'):
    featimport = pd.DataFrame(index=colSelPred, data=estim.feature_importances_, columns=['Feature Importance'])
    featimport.sort_values(by='Feature Importance', ascending=False, inplace=True)
    display(featimport)
else:
    print('No feature importance information')


# plot
# NOTE(review): the title shows the GP tree's function and score, not a score computed
# for this model - confirm that is what should be displayed
resPltTit = 'Best Model Performance: %s = %0.3f'%(tree.function, score)
figModel = ResultsPlots(data, sequenceCol='_yearID', responseCol='target',
                                predCol='modelPred', resdCol='modelErr', colorCol=None,
                                overall_title=resPltTit, plot_colors=('red',)*4)
# write the model figure (not the earlier GP performance figure) to the model output file
plyoff.plot(figModel, filename='../output/MOdelPerformance_%s_%s_%s.html'\
            %(parmsData['name'], timeStamp, objStr), auto_open=True, include_mathjax='cdn')