In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder

In [None]:
import pandas as pd
import numpy as np

In [None]:
#load the modelling table; the 'event', 'stand' and 'p_throws' columns are read from it in later cells
mlbx=pd.read_csv('./data/mlbxdb.csv')

In [None]:
#(rows, columns) of the loaded table
mlbx.shape

In [None]:
#flag rows where the batter's and the pitcher's handedness differ, since opposite-handed hitters
#have historically done better; the flag is 1000 when they differ and 0 when they match
mlbx['isoppo']=mlbx['stand'].ne(mlbx['p_throws']).astype(int).mul(1000)

In [None]:
#sanity check: number of rows for each value of the isoppo flag
mlbx['isoppo'].value_counts()

In [None]:
#model inputs, in the column order used everywhere downstream
features=[
    #descriptive attributes of the matchup
    *['stand','p_throws','season','isoppo'],
    #hitter columns (h prefix)
    *['hBB%','hK%','hSpd','hPull%','hGB%','hLD%','hFB%','hBABIP','hwOBA',
      'hSoft%','hMed%','hHard%','hHR/FB',
      'havg_hit_angle','hmax_hit_speed','havg_hit_speed','hfbld','hgb','hev95percent'],
    #pitcher columns (p prefix)
    *['pBB%','pK%','pPull%','pGB%','pLD%','pFB%','pBABIP','pSIERA',
      'pSoft%','pMed%','pHard%','pHR/FB',
      'pavg_hit_angle','pmax_hit_speed','pavg_hit_speed','pfbld','pgb','pev95percent'],
]

In [None]:
#class balance of the target: share of rows carrying each event label
mlbx['event'].value_counts(normalize=True)

In [None]:
#split the table into the model inputs (X) and the raw event label (y)
X,y=mlbx[features],mlbx['event']

In [None]:
#standardise the feature matrix only (the target is not scaled); the fitted scaler stays in ss
#so that inputs built later can be transformed with the same statistics
ss=StandardScaler().fit(X)
Xsc=ss.transform(X)

In [None]:
#one-hot encode the event label for the network: one 0/1 column per class, columns ordered by
#first appearance in mlbx['event'].
#built directly from mlbx['event'] rather than from the previous value of y, so re-running this
#cell after y has already been replaced gives the same frame instead of failing on y.unique()
y=pd.DataFrame({label:(mlbx['event']==label).astype(int) for label in mlbx['event'].unique()})

In [None]:
#preview the one-hot target; only the first rows are shown instead of dumping the full frame
y.head()

In [None]:
#(rows, classes) of the one-hot target
y.shape

In [None]:
#(rows, features) of the input matrix
X.shape

In [None]:
#number of input features; used as the width of the network's input layer.
#(the bare X.shape that preceded this line was never displayed, so it has been dropped)
sh=X.shape[1]

In [None]:
#create neural network: a funnel of fully connected ReLU layers, each followed by 50% dropout,
#ending in a softmax layer over the event classes
model=Sequential()
model.add(Dense(4_000, activation="relu", input_shape=(sh,)))
model.add(Dropout(.5))
for units in (800,300,100,20):
    model.add(Dense(units, activation="relu"))
    model.add(Dropout(.5))
#one output unit per one-hot column of y instead of a hard-coded 11, so the output layer
#always matches the target it is trained against
model.add(Dense(y.shape[1], activation="softmax"))

In [None]:
#compile model: adam optimiser, categorical cross-entropy loss, categorical accuracy reported
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['categorical_accuracy'])

In [None]:
#train on the scaled features against the one-hot target, 12 epochs in batches of 512,
#with 10% of the rows used for validation
#NOTE(review): Keras documents validation_split as taking the last fraction of the rows before
#shuffling - confirm the row order of mlbx does not make that slice unrepresentative
model.fit(Xsc,y,batch_size=512,epochs=12,validation_split=0.1)

In [None]:
#check out one input vector: the unscaled feature values of the row labelled 0
X.loc[0,:]

In [None]:
#first twelve columns of the first row of the source table
mlbx.iloc[0,0:12]

In [None]:
#read in initial player statistical data: FH is the hitter table, FP the pitcher table;
#both are filtered by player name and season when a matchup is simulated
FH=pd.read_csv('./data/CleanedPlayerData/MergedHitters.csv')
FP=pd.read_csv('./data/CleanedPlayerData/MergedPitchers.csv')

In [None]:
#convert percentage data from strings such as '12.3%' to fractions (0.123).
#each table gets the same treatment: every column with '%' in its name, plus its HR/FB column,
#which carries percent strings but has no '%' in the column name
for frame,ratio_col in ((FH,'hHR/FB'),(FP,'pHR/FB')):
    pctcols=[name for name in frame.columns if '%' in name]
    for name in pctcols+[ratio_col]:
        frame[name]=frame[name].str.replace('%','').astype(float)/100

In [None]:
def construct_input(atts,parr,harr):
    '''
    This function takes in a descriptive vector, a pitcher stats frame, and a hitter stats frame,
    and constructs them into the correct data format to be fed into a fit neural network model to make predictions

    atts: sequence whose first four entries are stand, p_throws, season and isoppo, in that order.
    It is only read; the caller's list is not modified.

    parr: one-row DataFrame of pitcher statistics holding every column named in pfeats

    harr: one-row DataFrame of hitter statistics holding every column named in hfeats

    Returns a one-row DataFrame whose columns are the four descriptive attributes,
    followed by the hitter features, followed by the pitcher features.

    Raises ValueError if parr or harr does not contain exactly one row.
    '''
    attnames=['stand','p_throws', 'season', 'isoppo']
    hfeats=['hBB%','hK%','hSpd','hPull%','hGB%','hLD%','hFB%','hBABIP','hwOBA','hSoft%', 'hMed%', 'hHard%','hHR/FB',
          'havg_hit_angle','hmax_hit_speed', 'havg_hit_speed', 'hfbld','hgb','hev95percent']
    pfeats=['pBB%','pK%','pPull%','pGB%','pLD%','pFB%','pBABIP','pSIERA','pSoft%', 'pMed%', 'pHard%','pHR/FB',
          'pavg_hit_angle','pmax_hit_speed', 'pavg_hit_speed', 'pfbld','pgb','pev95percent']
    #a one-row frame is required: zero rows (player not found) or several rows (duplicate
    #entries) cannot be placed into a single input vector
    for label,frame in (('pitcher',parr),('hitter',harr)):
        if len(frame)!=1:
            raise ValueError('expected exactly one %s row, got %d'%(label,len(frame)))
    #convert to float on the way in, without writing back into the caller's list
    df=pd.DataFrame(data=[[float(atts[k]) for k in range(4)]],columns=attnames)
    for i in hfeats:
        df[i]=harr[i].values
    for i in pfeats:
        df[i]=parr[i].values
    return df

In [None]:
def _safe_ratio(num,den,digits):
    '''Return num/den rounded to digits places, or NaN when the denominator is zero.'''
    if den==0:
        return float('nan')
    return round(num/den,digits)


def simAB(pitcher,hitter,ispRH=True,isbRH=True,pseason=2019,hseason=2019,season=2019,output=('probs',0),rng=None):
    '''
    This function takes a pitcher name and a batter name and simulates the interaction of those two players.
    The function requires specification of if either party is left-handed (ispRH / isbRH set to False).
    The function can be customized in the following ways:

    pseason: The season in which the named pitcher's data
    is pulled from so at bats from past versions of players can be simulated

    hseason: Same as above, but for the hitter

    season: The season in which the interaction takes place can be specified, as different years have
    different baseline occurrence rates for each event

    output: This tuple formats the output of the function
    Term 1 can be one of three values.
    1. 'probs' will give a probability distribution of each event of the simulated at bat
    (a one-row DataFrame, rounded to 3 decimals)
    2. 'pa' will randomly draw plate appearance results from the above distribution and return
    them as an array with one row per plate appearance
    3. 'statline' will summarize simulated plate appearances into more common baseball statistics
    (a one-row DataFrame)

    Term 2 is the number of plate appearances over which this simulation occurs.
    'probs' will not change regardless of this number

    rng: optional random source exposing a numpy-style choice(a, size, p) method, for reproducible
    draws. When omitted the global numpy random state is used.

    Raises ValueError if Term 1 of output is not one of the three values above, or if the
    pitcher or hitter does not match exactly one row for the requested season.

    Ratio statistics whose denominator is zero (for example no at bats) are reported as NaN.
    '''
    mode=output[0]
    if mode not in ('probs','pa','statline'):
        raise ValueError("output[0] must be 'probs', 'pa' or 'statline', got %r"%(mode,))
    pitcher_row=FP[(FP['pName']==pitcher)&(FP['pseason']==pseason)]
    hitter_row=FH[(FH['hName']==hitter)&(FH['hseason']==hseason)]
    #fail early with the player's name rather than with a length mismatch further down
    for role,who,yr,rows in (('pitcher',pitcher,pseason,pitcher_row),('hitter',hitter,hseason,hitter_row)):
        if len(rows)!=1:
            raise ValueError('expected exactly one %s row for %r in season %s, found %d'%(role,who,yr,len(rows)))
    #handedness is passed as 1/0 right-handed flags; isoppo uses the same 0/1000 scale as the training column
    atts=[int(isbRH),int(ispRH),season,int(ispRH!=isbRH)*1000]
    feats_arr=construct_input(atts,pitcher_row,hitter_row)
    feats_sc=ss.transform(feats_arr)
    preds=model.predict(feats_sc)
    #NOTE(review): these labels must be in the same order as the one-hot columns the model was
    #trained on - confirm against y.columns
    cols=['GB', '2B', '1B', 'K', 'BB', 'PU', 'FB', 'LD', 'HR', 'HBP', '3B']
    if mode=='probs':
        return pd.DataFrame(data=np.round(preds,3),columns=cols)
    #sample from the unrounded prediction, renormalised in float64. rounding the probabilities and
    #pushing the leftover into the first class can break the sum-to-one check in choice, can turn
    #the first probability negative, and biases that class
    p=np.asarray(preds[0],dtype=np.float64)
    p=p/p.sum()
    sampler=np.random if rng is None else rng
    pa=output[1]
    if mode=='pa':
        return sampler.choice(a=cols,size=(pa,1),p=p)
    samps=list(sampler.choice(a=cols,size=pa,p=p))
    singles,db,triples,hr=(samps.count(e) for e in ('1B','2B','3B','HR'))
    bb,hbp,k=(samps.count(e) for e in ('BB','HBP','K'))
    hits=singles+db+triples+hr
    ab=pa-bb-hbp
    outpa=ab-hits
    #innings in baseball notation: whole innings, then .1 or .2 for the leftover outs.
    #floor division is needed here; rounding outpa/3 overstates the innings when two outs are left over
    ip=float(outpa//3)+float((outpa%3)/10)
    ba=_safe_ratio(hits,ab,3)
    slg=_safe_ratio(singles+(2*db)+(3*triples)+(4*hr),ab,3)
    #hit by pitch counts as reaching base, so it belongs in the OBP numerator
    obp=_safe_ratio(hits+bb+hbp,pa,3)
    ops=round(slg+obp,3)
    babip=_safe_ratio(hits-hr,ab-k-hr,3)
    #WHIP is per true inning (outs/3), not per the .1/.2 display value held in ip
    whip=_safe_ratio(hits+bb,outpa/3,2)
    statcols=['PA','BA','HR','2B','SLG','OBP','oPS','BABIP','K','BB','pIP','pWHIP']
    data=[[pa,ba,hr,db,slg,obp,ops,babip,k,bb,ip,whip]]
    return pd.DataFrame(data=data,columns=statcols)

In [None]:
#stat line over 6,000 simulated plate appearances, with the batter flagged as not right-handed
simAB('Ryan Carpenter','Anthony Rendon',isbRH=False,output=('statline',6_000))

In [None]:
#predicted outcome distribution for a single matchup (default 2019 seasons, both right-handed)
simAB('Shane Bieber','Miguel Sano',output=('probs',0))

In [None]:
#bar chart of the predicted outcome distribution for one matchup.
#simAB is called once and supplies both the labels and the values, instead of running the
#same model prediction twice
probs=simAB('Shane Bieber','Miguel Sano',output=('probs',0))
a=list(probs.values[0])
c=probs.columns
plt.figure(figsize=(15,10))
plt.bar(c,a);
plt.title('Shane Bieber v. Miguel Sano Outcomes')
#the bars are probabilities on a 0-1 scale, not percentages
plt.ylabel('Probability');

In [None]:
#ten randomly drawn plate appearance results for the same matchup
simAB('Shane Bieber','Miguel Sano',output=('pa',10))

In [None]:
#stat line over 600 simulated plate appearances for the same matchup
simAB('Shane Bieber','Miguel Sano',output=('statline',600))

In [None]:
#field names describing a game state
#NOTE(review): not referenced by any other visible cell - presumably intended for the planned game simulation; confirm
gamestate_labels=['inning','half','outs','home_runs','away_runs','1stB','2ndB','3rdB']

In [None]:
#def gamesim(l1,l2,p1,p2,pinn1,pinn2)

In [None]:
#TODO: unfinished stub. the comparison has no left-hand operand, so as live code it is a
#SyntaxError that stops Restart & Run All; kept commented out until the logic is written
#if == "GB":