In [19]:
import pandas as pd
import pymysql as mc 
import os
import sys
import time
import numpy as np
from sklearn.neural_network import MLPClassifier

In [42]:
# Open the MySQL connection that the queries in this notebook are run against.
# The password is taken from the DB_PASSWORD environment variable so it never
# appears in the notebook itself.
cnx = mc.connect(
    host='stromberg.cs.uchicago.edu',
    port=3306,
    user='akashgoyal',
    password=os.environ['DB_PASSWORD'],
    db='mlb_practicum',
)

In [3]:
# Pair every Player row with its Batter_Run_Expectancy row (matched on both
# gameID and playerID) and load the joined result into a DataFrame.
query = """SELECT p1.gameID,p1.playerID,firstName,lastName,batPosition,
            teamAbbreviation AS player_mlb_team,100_avg,300_avg,500_avg, 1000_avg FROM Player p1
            INNER JOIN Batter_Run_Expectancy b1 ON
            (p1.gameID = b1.gameID AND p1.playerID = b1.playerID)"""
df = pd.read_sql_query(sql=query, con=cnx)

In [17]:
# Build the 2011 feature set. NOTE: the WinProb class is defined in a later
# cell of this notebook, so that cell has to be run before this one.
a = WinProb(cnx=cnx, year=['2011'])

done running query
done preparing new df for regression
done adding pythagorean wins
you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)


In [43]:
# Build the 2012 feature set. NOTE: the WinProb class is defined in a later
# cell of this notebook, so that cell has to be run before this one.
b = WinProb(cnx=cnx, year=['2012'])

done running query
done preparing new df for regression
done adding pythagorean wins
you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)


In [44]:
# Score the fitted network on the 2012 data it was not trained on.
# NOTE: `nn` is created in a later cell of this notebook; run that cell first.
print ('Your out-of-sample Neural Network Score for 2012 was {}'.format(nn.score(X=b.X, y=b.Y)))

Your out-of-sample Neural Network Score for 2012 was 0.7106711123723042


In [40]:
# Fit an MLP with two hidden layers (80 and 40 units) on the `a` feature set
# and report the score on that same data.
nn = MLPClassifier(
    hidden_layer_sizes=(80,40),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=1,
)
nn.fit(X=a.X, y=a.Y)
print ('Your in sample Neural Network Score was {}'.format(nn.score(X=a.X, y=a.Y)))

Your in sample Neural Network Score was 0.8686578816727224


In [16]:
class WinProb:
    '''
    Per-pitch win-probability feature builder.

    On construction the class runs the pitch-level query, derives the
    features, and exposes them as ``X`` (feature DataFrame) and ``Y``
    (label Series: 1 when the team batting on that pitch ended up winning
    the game, otherwise 0). A classifier can then be fitted with
    ``create_neural_net`` or assigned directly to ``self.nn``.

    Parameters
    ----------
    cnx : database connection passed straight to ``pd.read_sql_query``.
    year : season(s) to load. Either a list such as ``['2011', '2012']``,
        a single year (``'2011'`` or ``2011``), or ``None``/empty for no
        year filter. Every entry must be convertible to ``int``.

    Attributes
    ----------
    sql_attrs : extra SELECT expressions appended to the base column list.
    df : working DataFrame (query result plus derived columns).
    X, Y : model inputs and labels.
    nn : fitted classifier, or ``None`` until one is created/assigned.
    '''
    def __init__(self,cnx,year=None):
        self.cnx = cnx
        self.year = year
        self.sql_attrs = ['Pitcher_Run_Expectancy.300_avg','Pitcher_Run_Expectancy.500_avg',
                          'Pitcher_Run_Expectancy.1000_avg','Pitcher_Run_Expectancy.2000_avg',
                          'timesFaced','cumulativePitches']
        self.nn = None
        self.df = self.run_sql_query()
        print ('done running query')
        self.add_wins_and_current_lead()
        self.classify_count()
        self.add_runners_on_base()
        self.X,self.Y = self.prepare_logistic_regression()
        print ('done preparing new df for regression')
        self.add_pythagorean_wins()
        print ('done adding pythagorean wins')
        print ('you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)')

    def run_sql_query(self):
        '''
        Run the pitch-level query and return the result as a DataFrame.

        Selected columns: final home/away score, current inning (a value
        ending in .5 means the home team is batting, a whole number means
        the away team is), balls, strikes, runners on 1st/2nd/3rd, current
        runs for each team, the home-minus-away team wrc averages over the
        trailing 100/300/500/1000 windows, followed by every entry of
        ``self.sql_attrs``.

        ``team_starting_wrc_stats`` is the view

            CREATE VIEW team_starting_wrc_stats AS
            SELECT gameID,retrosheet,100_avg,300_avg,500_avg,1000_avg FROM Team_wrc_stats twrc
            INNER JOIN Team_Mapping t1 ON t1.mlb = twrc.team;

        Raises
        ------
        ValueError / TypeError : if an entry of ``self.year`` is not
            convertible to ``int``.
        '''
        # Each extra attribute carries its own leading comma, so an empty
        # sql_attrs list still yields valid SQL and no doubled comma appears.
        extra_cols = ''.join(',' + attr for attr in self.sql_attrs)
        query = """SELECT Game.gameID AS gameID,homeTeamScore,awayTeamScore,curr_inn,balls,strikes,
                        firstBaseRunner, secondBaseRunner,thirdBaseRunner,home_team_runs,away_team_runs,
                        (trc1.100_avg - trc2.100_avg) AS home_less_away_100, (trc1.300_avg-trc2.300_avg) AS home_less_away_300,
                        (trc1.500_avg-trc2.500_avg) AS home_less_away_500, (trc1.1000_avg-trc2.1000_avg) AS home_less_away_1000{}
                        FROM Pitch2 INNER JOIN Game ON Game.gameID = Pitch2.gameID INNER JOIN Pitcher_Run_Expectancy ON
                        (Pitcher_Run_Expectancy.gameID = Pitch2.gameID AND
                        Pitcher_Run_Expectancy.playerID = Pitch2.pitcherID)
                        INNER JOIN team_starting_wrc_stats trc1 ON
                        (trc1.gameID = Pitch2.gameID AND trc1.retrosheet = Game.homeTeam)
                        INNER JOIN team_starting_wrc_stats trc2 ON
                        (trc2.gameID = Pitch2.gameID AND trc2.retrosheet = Game.awayTeam)""".format(extra_cols)

        # Accept a single year as well as a list; a bare string would
        # otherwise be joined character by character.
        if self.year is None or np.isscalar(self.year):
            years = [self.year] if self.year else []
        else:
            years = list(self.year)
        # int() rejects anything that is not a plain number, so no arbitrary
        # text can be spliced into the statement.
        year_list = [str(int(y)) for y in years]
        if year_list:
            query += "\n                        WHERE YEAR(gameDate) IN ({})".format(','.join(year_list))

        # Executed for both the filtered and the unfiltered query.
        return pd.read_sql_query(query,self.cnx)

    def add_wins_and_current_lead(self):
        '''
        Add the ``win`` label and ``current_lead`` feature to ``self.df`` and
        re-express the ``home_less_away_*`` columns from the batting team's
        point of view. Drops the four raw score columns afterwards.
        '''
        half = self.df['curr_inn']*2%2
        home_batting = half == 1
        away_batting = half == 0

        #define whether or not the team batting now won or lost the game
        home_won = self.df['homeTeamScore'] > self.df['awayTeamScore']
        away_won = self.df['homeTeamScore'] < self.df['awayTeamScore']
        self.df['win'] = ((away_won & away_batting) | (home_won & home_batting)).astype(int)

        #get current team's wrc: keep the sign when home bats, flip it otherwise
        trailing_abs = ['100','300','500','1000']
        for ab_num in trailing_abs:
            col = f'home_less_away_{ab_num}'
            self.df[col] = np.where(home_batting,self.df[col],-self.df[col])

        #add the current lead (batting team's runs minus fielding team's runs)
        home_minus_away = self.df['home_team_runs'] - self.df['away_team_runs']
        self.df['current_lead'] = np.where(home_batting,home_minus_away,-home_minus_away)

        #drop the homeTeamScore,awayTeamScore since we don't know the final outcome at the current pitch
        self.df = self.df.drop(columns=['homeTeamScore','awayTeamScore','home_team_runs','away_team_runs'])

    #create 12 variables for each type of count
    def classify_count(self):
        '''
        Replace ``balls``/``strikes`` with one indicator column per
        (balls, strikes) pair that occurs in the data.

        NOTE(review): the indicator columns are named by tuples and only the
        counts actually present get a column, so two instances built from
        different data could end up with different feature sets -- confirm
        this is acceptable for train/test use.
        '''
        pairs = pd.Series([(x[0],x[1]) for x in self.df[['balls','strikes']].values])
        indicators = pd.get_dummies(pairs)
        self.df = self.df.join(indicators,how='outer')
        self.df = self.df.drop(columns=['balls','strikes'])

    @staticmethod
    def _base_occupied(col):
        '''Return a 0/1 Series: 1 where ``col`` holds a truthy, non-missing value.'''
        # bool(nan) is True, so a plain truthiness test would count a missing
        # value as an occupied base; notna() excludes those rows.
        # NOTE(review): assumes a NULL runner means the base is empty -- confirm.
        return (col.notna() & col.map(bool).astype(bool)).astype(int)

    def add_runners_on_base(self):
        '''
        Add 0/1 indicators for a runner on each base, the whole-number inning
        (``mod inning``) and ``home_away`` (0 when the home team is batting,
        1 when the away team is). Drops the raw runner and inning columns.
        '''
        #indicator variables if runners on base
        self.df['1b runner'] = self._base_occupied(self.df['firstBaseRunner'])
        self.df['2b runner'] = self._base_occupied(self.df['secondBaseRunner'])
        self.df['3b runner'] = self._base_occupied(self.df['thirdBaseRunner'])

        inning = self.df['curr_inn']
        home_batting = (2*inning)%2 == 1
        # a home half-inning is stored as N.5, so strip the .5 to get the inning number
        self.df['mod inning'] = np.where(home_batting,inning-0.5,inning)
        self.df['home_away'] = np.where(home_batting,0,1)
        self.df = self.df.drop(columns=['curr_inn','firstBaseRunner','secondBaseRunner',
                                        'thirdBaseRunner'])

    def prepare_logistic_regression(self):
        '''Split ``self.df`` into ``(X, Y)``: every column except ``win``, and ``win``.'''
        Y = self.df['win']
        X = self.df.drop(columns=['win'])
        return X,Y

    def add_pythagorean_wins(self):
        '''
        Merge the ``Pythagorean_Wins`` table into ``self.X`` on ``gameID`` and
        add ``p10``/``p30``/``p50``/``p100``: the difference in pythagorean
        win expectancy, batting team minus fielding team. The raw
        home*/away* columns and ``gameID`` are dropped afterwards.

        The merge is an inner join, so pitches whose game has no
        ``Pythagorean_Wins`` row are removed; ``self.Y`` (when set) is
        re-selected with the same rows so X and Y stay aligned.
        '''
        qry = 'SELECT * FROM Pythagorean_Wins'
        wins_df = pd.read_sql_query(qry,self.cnx)

        # Tag every row with its position so the surviving rows can be
        # mapped back onto Y after the merge.
        row_tag = '_winprob_row'
        tagged = self.X.assign(**{row_tag: np.arange(len(self.X))})
        merged = tagged.merge(wins_df,how='inner',left_on='gameID',right_on='gameID')

        labels = getattr(self,'Y',None)
        if labels is not None:
            self.Y = labels.iloc[merged[row_tag].values].reset_index(drop=True)

        #this part just takes the difference of the pythagorean win expectancy
        # home_away == 1 means the away team is batting: keep away - home,
        # otherwise flip the sign.
        sign = np.where(merged['home_away'] == 1,1,-1)
        for window in ('10','30','50','100'):
            merged['p' + window] = (merged['away' + window] - merged['home' + window]) * sign

        self.X = merged.drop(columns=['home10','away10','home30','away30','home50',
                                      'away50','home100','away100','gameID',row_tag])

    #for now I'll support just one hidden layer
    def create_neural_net(self,hidden_nodes):
        '''
        Fit an MLPClassifier with a single hidden layer of ``hidden_nodes``
        units on ``(self.X, self.Y)``, store it in ``self.nn`` and print its
        in-sample score.
        '''
        nn = MLPClassifier(solver='lbfgs',alpha=0.001,hidden_layer_sizes=(hidden_nodes,),random_state=1)
        nn.fit(self.X,self.Y)
        self.nn = nn
        print ('Your in sample Neural Network Score was {}'.format(nn.score(self.X,self.Y)))

    def calculate_leverage(self,aRow):
        '''
        Return the change in predicted class probabilities when the batting
        team's ``current_lead`` grows by one run.

        Parameters
        ----------
        aRow : pandas Series holding one row of ``self.X`` (must contain
            ``current_lead``). It is not modified.

        Returns
        -------
        The ``predict_proba`` vector for ``aRow`` minus the vector for the
        same row with ``current_lead`` increased by 1.

        Raises
        ------
        RuntimeError : if no model is stored in ``self.nn``.
        '''
        # Use the instance's own model rather than whatever `nn` happens to
        # exist in the notebook's global namespace.
        if self.nn is None:
            raise RuntimeError('no fitted model: call create_neural_net(number_nodes) '
                               'or assign a fitted classifier to self.nn first')
        initial_win_prob_pitcher_perspective = self.nn.predict_proba(aRow.values.reshape(1,-1))[0]
        pitcher_team_gives_up_one_run = aRow.copy()
        pitcher_team_gives_up_one_run['current_lead']+=1 #this is correct bc current lead is from batters POV
        end_win_prob_pitcher_benefit = self.nn.predict_proba(pitcher_team_gives_up_one_run.values.reshape(1,-1))[0]
        return initial_win_prob_pitcher_perspective-end_win_prob_pitcher_benefit