In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw outcome table from CSV into `mlb`.
mlb=pd.read_csv('./data/mlb_PA_2013to2018.csv')

In [None]:
# Preview the first rows of the raw table to sanity-check the load.
mlb.head()

In [None]:
# (rows, columns) of the raw table.
mlb.shape

In [None]:
# List the raw column names.
mlb.columns

In [None]:
#Discard the two columns that are not used downstream.
#NOTE(review): 'Unnamed: 0' is presumably a row index written by an earlier to_csv - confirm.
unneeded_cols=['Unnamed: 0','gameday_link']
mlb_u=mlb.drop(unneeded_cols,axis='columns')

In [None]:
# Inspect the first 15 rows after dropping the unused columns.
mlb_u.head(15)

In [None]:
#mlb_u['batter_team']=(mlb_u['pitcher_team']==mlb_u['home_team']).astype(int)
#mlb_u['batter_team'][0]
#for i in range(len(mlb_u['batter_team'])):
#    if mlb_u['batter_team'][i]==1:
#        mlb_u.loc[i,'batter_team']=mlb_u.loc[i,'away_team']
#    else:
#        mlb_u.loc[i,'batter_team']=mlb_u.loc[i,'home_team']
#mlb_u.head(3)

In [None]:
# Count missing values per column.
mlb_u.isnull().sum()

In [None]:
#Remove rows with missing values; the model needs player names to attach stats.
#Note: this drops a row when *any* column is null, not only the name columns.
mlb_u=mlb_u.dropna(axis='index',how='any')

In [None]:
# (rows, columns) of mlb_u at this point.
mlb_u.shape

In [None]:
# Frequency of each distinct value in the 'event' column.
mlb_u['event'].value_counts()

In [None]:
#Use baseball knowledge to collapse the raw event labels into 11 outcome categories
#(K, GB, 1B, FB, BB, LD, PU, 2B, HR, HBP, 3B) plus a 'Misc' bucket.
event_map={
    'Strikeout':'K',
    'Groundout':'GB',
    'Single':'1B',
    'Flyout':'FB',
    'Walk':'BB',
    'Lineout':'LD',
    'Pop Out':'PU',
    'Double':'2B',
    'Home Run':'HR',
    'Forceout':'GB',
    'Grounded Into DP':'GB',
    'Hit By Pitch':'HBP',
    'Field Error':'Misc',
    'Sac Fly':'Misc',
    'Sac Bunt':'Misc',
    'Intent Walk':'BB',
    'Triple':'3B',
    'Double Play':'GB',
    'Runner Out':'Misc',
    'Bunt Groundout':'Misc',
    'Fielders Choice Out':'GB',
    'Bunt Pop Out':'Misc',
    'Strikeout - DP':'K',
    'Fielders Choice':'GB',
    'Fan interference':'HR',
    'Batter Interference':'K',
    'Catcher Interference':'1B',
    'Sac Fly DP':'FB',
    'Bunt Lineout':'Misc',
    'Triple Play':'Misc',
    'Sacrifice Bunt DP':'Misc'
}
#Map every target label to itself so that re-running this cell is a no-op; without this,
#a second execution would turn every already-recoded label into NaN.
event_map.update({label:label for label in set(event_map.values())})
#Raw labels that are not keys of event_map become NaN (Series.map leaves unmatched values as NaN).
mlb_u['event']=mlb_u['event'].map(event_map)

In [None]:
#Size of the subset of rows whose season is 2016 or later.
from_2016=mlb_u['season'].ge(2016)
mlb_u.loc[from_2016].shape

In [None]:
# Frequency of each value in the 'event' column at this point.
mlb_u['event'].value_counts()

In [None]:
#read in player statistics data
#FH is loaded from MergedHitters.csv and FP from MergedPitchers.csv.
FH=pd.read_csv('./data/CleanedPlayerData/MergedHitters.csv')
FP=pd.read_csv('./data/CleanedPlayerData/MergedPitchers.csv')

In [None]:
# Shapes of the hitter-stats, pitcher-stats and outcome tables, shown as one tuple.
FH.shape,FP.shape,mlb_u.shape

In [None]:
# Preview the first rows of FH.
FH.head()

In [None]:
# Preview the first rows of FP.
FP.head()

In [None]:
#Attach hitter stats to the outcome rows by matching batter name and season.
#how='outer' keeps rows that have no match on either side; their missing columns are NaN.
outcome_hitter_keys=['batter_name','season']
stats_hitter_keys=['hName','hseason']
mlbx=mlb_u.merge(FH,how='outer',left_on=outcome_hitter_keys,right_on=stats_hitter_keys)

In [None]:
#Report the merged shape and the null counts of the first 25 columns.
#A bare expression is only displayed when it is the last line of a cell, so the shape
#has to be printed explicitly or it is silently discarded.
print(mlbx.shape)
mlbx.isnull().sum().head(25)

In [None]:
#Attach pitcher stats the same way, matching pitcher name and season.
#how='outer' keeps rows that have no match on either side; their missing columns are NaN.
outcome_pitcher_keys=['pitcher_name','season']
stats_pitcher_keys=['pName','pseason']
mlbx=mlbx.merge(FP,how='outer',left_on=outcome_pitcher_keys,right_on=stats_pitcher_keys)

In [None]:
#statcast data is only available from 2015 on, so eliminate outcomes from before 2015
#Print the shape before filtering: a bare expression is only displayed when it ends the
#cell, so the original pre-filter shape was never shown.
print('before season filter:',mlbx.shape)
#Rows whose season is NaN fail the comparison and are removed as well.
mlbx=mlbx[mlbx['season']>=2015]
mlbx.shape

In [None]:
#Order the rows by date, then inning (both ascending), then inning_side (descending).
#NOTE(review): descending inning_side presumably puts 'top' ahead of 'bottom' - confirm the codes used.
sort_keys=['date','inning','inning_side']
sort_ascending=[True,True,False]
mlbx=mlbx.sort_values(by=sort_keys,ascending=sort_ascending)
mlbx.head()

In [None]:
# Count rows per away_team code.
mlbx['away_team'].value_counts()

In [None]:
#remove all star games from the data as they are not normal at bats
#('nas' and 'aas' are the away_team codes being excluded)
all_star_codes=['nas','aas']
mlbx=mlbx[~mlbx['away_team'].isin(all_star_codes)]

In [None]:
# (rows, columns) of mlbx at this point.
mlbx.shape

In [None]:
# Count rows per value of pseason (the season key merged in from FP).
mlbx['pseason'].value_counts()

In [None]:
#Keep only rows where hPA is at least 20 and pIP is at least 3, so that season stats
#built on a very small sample are excluded. Rows with NaN in either column fail the
#comparison and are dropped too.
min_hitter_pa=20
min_pitcher_ip=3
enough_pa=mlbx['hPA'].ge(min_hitter_pa)
enough_ip=mlbx['pIP'].ge(min_pitcher_ip)
mlbx=mlbx.loc[enough_pa&enough_ip]

In [None]:
# (rows, columns) of mlbx at this point.
mlbx.shape

In [None]:
# Preview the first rows of mlbx.
mlbx.head()

In [None]:
#convert percentage columns into usable numbers
def _pct_to_fraction(col):
    """Turn strings such as '12.5%' into the float 0.125.

    A column that is already numeric is returned unchanged, so re-running this cell
    neither fails on the missing .str accessor nor divides by 100 a second time.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col
    #regex=False: '%' is a literal character, not a pattern
    return col.str.replace('%','',regex=False).astype(float)/100

pctcols=[c for c in mlbx.columns if '%' in c]
#the two HR/FB columns get the same treatment although their names contain no '%'
ratio_cols=pctcols+['hHR/FB','pHR/FB']
#assign() returns a new frame, which avoids writing into a filtered slice of mlbx
mlbx=mlbx.assign(**{c:_pct_to_fraction(mlbx[c]) for c in ratio_cols})

In [None]:
#Drop the rows labelled 'Misc': those events happen because of runners on base and
#will not be predictive.
not_misc=mlbx['event'].ne('Misc')
mlbx=mlbx.loc[not_misc]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout

In [None]:
# List the column names of FH.
FH.columns

In [None]:
# List the column names of FP.
FP.columns

In [None]:
# List the column names of mlb_u.
mlb_u.columns

In [None]:
#Show the distinct handedness codes for batters (stand) and pitchers (p_throws).
#Only the last expression of a cell is displayed, so the first count is printed explicitly.
print(mlb_u['stand'].value_counts())
mlb_u['p_throws'].value_counts()

In [None]:
#turn handedness into an integer variable, asking if b/p is right handed
#(1 where the value is 'R', 0 for anything else, including missing values)
#Columns that are already numeric are skipped: re-running the comparison on the 0/1
#values would silently set every row to 0.
for hand_col in ['stand','p_throws']:
    if not pd.api.types.is_numeric_dtype(mlbx[hand_col]):
        mlbx[hand_col]=(mlbx[hand_col]=='R').astype(int)

In [None]:
#Check the encoded handedness columns.
#Only the last expression of a cell is displayed, so the first count is printed explicitly.
print(mlbx['stand'].value_counts())
mlbx['p_throws'].value_counts()

In [None]:
#Drop every row that still has a missing value in any column.
mlbx=mlbx.dropna(axis='index')

In [None]:
# Write the current mlbx table to CSV without the index column.
mlbx.to_csv('./data/mlbxdb.csv',index=False)

In [None]:
#Build the feature frame X by removing columns that are duplicates, identifiers,
#sample-size counters or otherwise unnecessary.
#game/context columns, including the target column 'event'
context_cols=['pitcher_name','batter_name','inning_side','inning','event','date','away_team','home_team','pitcher_team']
#columns merged in from the hitter table
hitter_cols=['hName','hTeam','hPA','hplayerid','hseason','hlast_name','h first_name','hplayer_id','hattempts','hG','hAB']
#columns merged in from the pitcher table
pitcher_cols=['pName','pTeam','pplayerid','pIP','pTBF','pseason','plast_name','p first_name','pplayer_id','pattempts']
dropcols=context_cols+hitter_cols+pitcher_cols
X=mlbx.drop(dropcols,axis='columns')

In [None]:
import seaborn as sns

In [None]:
#For every outcome, build a 0/1 indicator and print the ten features with the highest
#(most positive) correlation to it, to help select features.
#Note: X keeps the 'event' indicator column of the last outcome once the loop ends.
for outcome in mlbx['event'].unique():
    X['event']=(mlbx['event']==outcome).astype(int)
    indicator=X['event']
    print(f"{outcome} baseline: {indicator.value_counts(normalize=True)}")
    print(f'Most predictive variables for {outcome}: ')
    feature_corr=X.drop(columns='event').corrwith(other=indicator)
    print(feature_corr.sort_values(ascending=False).head(10))
    print('------------------------------------------')

In [None]:
import matplotlib.pyplot as plt

In [None]:
#Plot how Jose Ramirez's hitting stats moved from season to season (2015-2018).
sea=[2015,2016,2017,2018,]
player='Jose Ramirez'

def _season_means(col):
    """Mean of mlbx[col] over the player's rows, one value per season in `sea`.

    A season with no matching rows yields NaN (the mean of an empty selection).
    """
    return [mlbx.loc[(mlbx['season']==yr)&(mlbx['hName']==player),col].mean() for yr in sea]

gb=_season_means('hGB%')
hrfb=_season_means('hHR/FB')
k=_season_means('hK%')
hard=_season_means('hHard%')
bb=_season_means('hBB%')
ifh=_season_means('hIFH%')
pull=_season_means('hPull%')

plt.figure(figsize=(15,10))
#Pass the seasons as x values: plotting the lists alone puts positions 0-3 on an axis
#that is labelled 'Year'.
for values,label in [(gb,'GB%'),(hrfb,'HR/FB'),(k,'K%'),(hard,'Hard Hit Ball%'),
                     (bb,'BB%'),(ifh,'Infield Hit%'),(pull,'Pulled Ball Rate')]:
    plt.plot(sea,values,label=label)
plt.xticks(sea)  #one tick per season instead of fractional years
#NOTE(review): the plotted columns were divided by 100 in an earlier cell, so the axis
#shows fractions although the label says 'Percent' - confirm which is intended.
plt.ylabel('Percent');
plt.xlabel('Year');
plt.title(f'Y/Y Change in Stats for {player}')
plt.legend();