In [1]:
import pandas as pd
import pymysql as mc 
import os
import sys
import time
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import getpass
from sklearn.preprocessing import PolynomialFeatures

In [2]:
pw = getpass.getpass()
cnx = mc.connect(user='akashgoyal',password=pw,
                 host='stromberg.cs.uchicago.edu',db='mlb_practicum',port=3306)

········


In [5]:
a = WinProb(cnx,year=['2016'])

done running query
done preparing new df for regression
done reading query. now onto merging
done merging now onto pwins
done with p10
done with p30
done with p50
done with p100
done adding pythagorean wins
you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)


In [6]:
X_train = a.X.iloc[:,2:]

In [7]:
nn = MLPClassifier(solver='adam',activation='relu',hidden_layer_sizes=(20,),random_state=1,max_iter=200)
nn.fit(X_train,a.Y)
print ('Your in sample Neural Network Score was {}'.format(nn.score(X_train,a.Y)))

Your in sample Neural Network Score was 0.763385209549546


In [8]:
b = WinProb(cnx,year=['2011,2012,2013,2014,2015,2017'])

done running query
done preparing new df for regression
done reading query. now onto merging
done merging now onto pwins
done with p10
done with p30
done with p50
done with p100
done adding pythagorean wins
you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)


In [9]:
X_test = b.X.iloc[:,2:]


In [35]:
predictions = nn.predict_proba(b.X.iloc[:,2:])[:,0]

In [22]:
lev = calculate_leverage(X_test,nn)

In [25]:
lev

array([0.03980725, 0.02576853, 0.04015454, ..., 0.00630492, 0.00505995,
       0.00503595])

In [30]:
b.X.shape,lev.shape

((4252289, 35), (4252289,))

In [37]:
output = np.hstack((b.X.iloc[:,:2],predictions.reshape(-1,1),lev.reshape(-1,1)))
output.shape

(4252289, 4)

In [41]:
df_output = pd.DataFrame(output,columns=['gameID','pitchID','win_prob','leverage'])

In [43]:
df_output.to_csv('leverage_calc.txt')

In [21]:
def map_expected_runs(df):
    total = 0
    if df['1b runner']==1:
        total+=1
    if df['2b runner']==1:
        total+=2
    if df['3b runner']==1:
        total+=4
    if df['outs']==1:
        total+=8
    elif df['outs']==2:
        total+=16
    
    leverage_dict = {0: 0.539, 1: 0.918, 2: 1.161, 3: 1.46, 4: 1.449, 5: 1.636, 6: 1.79,7: 2.113,
 8: 0.284, 9: 0.536, 10: 0.67, 11: 0.90, 12: 1.08, 13: 1.06, 14: 1.44, 15: 1.55,
 16: 0.13, 17: 0.254, 18: 0.365, 19: 0.51, 20: 0.38, 21: 0.503, 22: 0.55, 23: 0.764}
    return leverage_dict[total]+df['current_lead']

In [20]:
def calculate_leverage(X,nn):
    initial_win_prob_pitcher_perspective = nn.predict_proba(X)[:,0]
    X['current_lead'] = X.apply(map_expected_runs,axis=1)
    end_win_prob_pitcher_benefit = nn.predict_proba(X)[:,0]
    return initial_win_prob_pitcher_perspective-end_win_prob_pitcher_benefit

In [32]:
prediction_probabilities = None

In [4]:
class WinProb:
    '''
        cnx is the SQL connection, sql_attrs is a list of items in either Pitch2 or
        Pitcher_Run_Expectancy. Enter a year between 2010 and 2017. you must enter
        at least one item in the list or else it will mess up. 
    '''
    def __init__(self,cnx,year=None):
        self.cnx = cnx
        self.year = year
        self.sql_attrs = ['Pitcher_Run_Expectancy.300_avg','Pitcher_Run_Expectancy.500_avg',
                          'Pitcher_Run_Expectancy.1000_avg','Pitcher_Run_Expectancy.2000_avg',
                          'timesFaced','cumulativePitches']
        self.df = self.run_sql_query()
        print ('done running query')
        self.add_wins_and_current_lead()
        self.classify_count()
        self.add_runners_on_base()
        self.X,self.Y = self.prepare_logistic_regression()
        print ('done preparing new df for regression')
        self.add_pythagorean_wins()
        print ('done adding pythagorean wins')
        print ('you can manually run the neural net now if you like by calling self.create_neural_net(number_nodes)')
        self.df = None
        self.nn = None

    def run_sql_query(self):
        query = ''
        if 4==4:
            #gets the home_team final score, away_team final score, current inning (if ends in 0.5 its home, if 0 away)
            # balls, strikes, runners on 1st,2nd,3rd, current runs for the home team/awayteam, and the average 
            # wrc for the home team less that for the away team for trailing 100,300,500,1000AB's
            # in the sql_attrs there is 300,500,1000,20000 trailing wrc from pitcher perspective, as well as times faced
            # and cumulative pitches 
            #note that team_starting_wrc_stats is this view below. 
            '''
                CREATE VIEW team_starting_wrc_stats AS 
                SELECT gameID,retrosheet,100_avg,300_avg,500_avg,1000_avg FROM Team_wrc_stats twrc
                INNER JOIN Team_Mapping t1 ON t1.mlb = twrc.team;
            '''
            query = """SELECT Game.gameID AS gameID,Pitch2.pitchID AS pitchID,outs, homeTeamScore,awayTeamScore,curr_inn,balls,strikes,
                        firstBaseRunner, secondBaseRunner,thirdBaseRunner,home_team_runs,away_team_runs,
                        (trc1.100_avg - trc2.100_avg) AS home_less_away_100, (trc1.300_avg-trc2.300_avg) AS home_less_away_300,
                        (trc1.500_avg-trc2.500_avg) AS home_less_away_500, (trc1.1000_avg-trc2.1000_avg) AS home_less_away_1000,{}
                        FROM Pitch2 INNER JOIN Game ON Game.gameID = Pitch2.gameID INNER JOIN Pitcher_Run_Expectancy ON 
                        (Pitcher_Run_Expectancy.gameID = Pitch2.gameID AND
                        Pitcher_Run_Expectancy.playerID = Pitch2.pitcherID)
                        INNER JOIN team_starting_wrc_stats trc1 ON 
                        (trc1.gameID = Pitch2.gameID AND trc1.retrosheet = Game.homeTeam) 
                        INNER JOIN team_starting_wrc_stats trc2 ON 
                        (trc2.gameID = Pitch2.gameID AND trc2.retrosheet = Game.awayTeam) 
                        WHERE YEAR(gameDate) IN ({})""".format(','.join(self.sql_attrs),','.join(self.year))
            df = pd.read_sql_query(query,self.cnx)
            return df 
    
    def add_wins_and_current_lead(self):
        #define whether or not the team batting now won or lost the game 
        self.df['win'] = pd.Series([(1 if (x[0]<x[1] and x[2]*2%2==0) or (x[0]>x[1] and x[2]*2%2==1) else 0)
                        for x in self.df[['homeTeamScore','awayTeamScore','curr_inn']].values])
        
        #get current team's wrc
        trailing_abs = ['100','300','500','1000']
        for ab_num in trailing_abs:
            self.df['home_less_away_'+ab_num] = np.where(self.df['curr_inn']*2%2==1,
                                                           self.df['home_less_away_'+ab_num],
                                                           -self.df['home_less_away_'+ab_num])
        #add the current lead 
        l = []
        for x in self.df[['home_team_runs','away_team_runs','curr_inn']].values:
            #if home team
            if x[2]*2%2==1:
                l.append(x[0]-x[1])
            else:
                l.append(x[1]-x[0])
        self.df['current_lead'] = l
        #drop the homeTeamScore,awayTeamScore since we don't know the final outcome at the current pitch 
        self.df.drop(columns=['homeTeamScore','awayTeamScore','home_team_runs','away_team_runs'],inplace=True)
    
    #create 12 variables for each type of count 
    def classify_count(self):
        s = pd.Series([(x[0],x[1]) for x in self.df[['balls','strikes']].values])
        s = pd.get_dummies(s)
        self.df = self.df.join(s,how='outer')
        self.df.drop(columns=['balls','strikes'],inplace=True) 
        
    def add_runners_on_base(self):
        #indicator variables if runners on base
        self.df['1b runner'] = self.df.apply(lambda row: 1 if row['firstBaseRunner'] else 0,axis=1)
        self.df['2b runner'] = self.df.apply(lambda row: 1 if row['secondBaseRunner'] else 0,axis=1)
        self.df['3b runner'] = self.df.apply(lambda row: 1 if row['thirdBaseRunner'] else 0,axis=1)
        self.df['mod_inning'] = self.df.apply(lambda row: int(row['curr_inn']),axis=1)
        self.df['home_away'] = self.df.apply(lambda row: 0 if row['curr_inn']*2%2==1 else 1,axis=1)
        self.df.drop(columns=['curr_inn','firstBaseRunner','secondBaseRunner',
                              'thirdBaseRunner'],inplace=True)
        
    def prepare_logistic_regression(self):
        Y = self.df['win']
        X = self.df.drop(columns=['win'])
        return X,Y

    def add_pythagorean_wins(self):
        qry = """SELECT gameID, (away10-home10) AS p10, (away30-home30) AS p30,
                (away50-home30) AS p50, (away100-home100) AS p100 
                FROM Pythagorean_Wins"""
        wins_df = pd.read_sql_query(qry,self.cnx)
        print ('done reading query. now onto merging')
        self.X = self.X.merge(wins_df,how='inner',left_on='gameID',right_on='gameID')
        print ('done merging now onto pwins')
        #this part just takes the difference of the pythagorean win expectancy
        self.X['p10'] = self.X.apply(lambda row: row['p10'] if row['home_away']==1 else -row['p10'],axis=1)
        print ('done with p10')
        self.X['p30'] = self.X.apply(lambda row: row['p30'] if row['home_away']==1 else -row['p30'],axis=1)
        print ('done with p30')
        self.X['p50'] = self.X.apply(lambda row: row['p50'] if row['home_away']==1 else -row['p50'],axis=1)
        print ('done with p50')
        self.X['p100'] = self.X.apply(lambda row: row['p100'] if row['home_away']==1 else -row['p100'],axis=1)
        print ('done with p100')

    #for now I'll support just one hidden layer
    def create_neural_net(self,hidden_nodes):
        nn = MLPClassifier(solver='adam',alpha=0.001,hidden_layer_sizes=(hidden_nodes,),random_state=1)
        nn.fit(self.X,self.Y)
        self.nn = nn 
        print ('Your in sample Neural Network Score was {}'.format(nn.score(self.X,self.Y))) 

    def calculate_leverage(self,aRow):
        initial_win_prob_pitcher_perspective = nn.predict_proba(aRow.values.reshape(1,-1))[0]
        pitcher_team_gives_up_one_run = aRow.copy()
        pitcher_team_gives_up_one_run['current_lead']+=1 #this is correct bc current lead is from batters POV
        end_win_prob_pitcher_benefit = nn.predict_proba(pitcher_team_gives_up_one_run.values.reshape(1,-1))[0]
        return initial_win_prob_pitcher_perspective-end_win_prob_pitcher_benefit