In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicHitting


In [None]:
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_batting = pd.read_csv('../datafiles/Batting.csv', encoding='latin-1')
people = import_people.copy()

#fix fucky dates
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

#create a birthdate column so we can calulate an age at start of season
people['birthdate'] = people.apply(lambda x: f"""{x['birthYear']}-{x['birthMonth']}-{x['birthDay']}""", axis=1)
people['birthdate'] = pd.to_datetime(people['birthdate'], errors='coerce')

#merge the people and batting dataframes
batting = people.copy().merge(import_batting, on='playerID', how ='inner')

#make sure the raw stats are 0 and not null
batting.loc[:, ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP']].fillna(0, inplace=True)

In [None]:
#TODO: create as function to do career and either 162 game or 600 pa stats
player_batting = batting.copy()
player_batting = player_batting.drop(columns=['stint', 'teamID', 'lgID', 'birthYear', 'birthMonth', 'birthDay', 'G_old', 'G_batting'], axis=1).groupby(['playerID', 'birthdate', 'yearID']).sum().reset_index()
#estimate the start of the season at april 1
player_batting['season_start'] = player_batting.apply(lambda x: f"""{x['yearID']}-04-01""", axis=1)
player_batting['season_start'] = pd.to_datetime(player_batting['season_start'], errors='coerce')

#calculate the age of the player at the start of the season
player_batting['age'] = player_batting.apply(lambda x: relativedelta(x['season_start'], x['birthdate']).years, axis=1)
player_batting.drop(columns=['birthdate', 'season_start'], inplace=True)
player_batting['rowType'] = 'season'
player_batting.rename(columns={'3B':'B3', '2B':'B2', 'GIDP':'GiDP'}, inplace=True)

#adding shit up
player_batting['PA'] = BasicHitting.calc_pa(player_batting)
player_batting['B1'] = BasicHitting.calc_b1(player_batting)
player_batting['TB'] = BasicHitting.calc_tb(player_batting)
player_batting['SBA'] = BasicHitting.calc_sba(player_batting)
player_batting['XBH'] = BasicHitting.calc_xbh(player_batting)
player_batting['TOB'] = BasicHitting.calc_tob(player_batting)
player_batting['RP'] = BasicHitting.calc_rp(player_batting)


Unnamed: 0,playerID,yearID,G,AB,R,H,B2,B3,HR,RBI,...,GiDP,age,PA,B1,TB,SBA,XBH,TOB,RP,rowType
0,aardsda01,9,331,4,0,0,0,0,0,0.0,...,0.0,27.0,5.0,0,0,0.0,0,0.0,0.0,career
1,aaronha01,23,3298,12364,2174,3771,624,98,755,2297.0,...,328.0,31.0,13940.0,2294,6856,313.0,1477,5205.0,3716.0,career
2,aaronto01,7,437,944,102,216,42,6,13,94.0,...,36.0,27.0,1045.0,155,309,17.0,61,302.0,183.0,career
3,aasedo01,13,448,5,0,0,0,0,0,0.0,...,0.0,29.0,5.0,0,0,0.0,0,0.0,0.0,career
4,abadan01,3,15,21,1,2,0,0,0,0.0,...,1.0,30.0,25.0,2,2,1.0,0,6.0,1.0,career
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20725,zupofr01,3,16,18,3,3,1,0,0,0.0,...,0.0,19.0,20.0,2,4,0.0,1,5.0,3.0,career
20726,zuvelpa01,9,209,491,41,109,17,2,2,20.0,...,8.0,27.0,545.0,88,136,2.0,21,145.0,59.0,career
20727,zuverge01,8,266,142,5,21,2,1,0,7.0,...,3.0,30.0,167.0,18,25,1.0,3,30.0,12.0,career
20728,zwilldu01,4,366,1280,167,364,76,15,30,202.0,...,0.0,25.0,1443.0,243,560,46.0,121,496.0,339.0,career


In [None]:
#create rows for career
career_df = player_batting.copy()
career_df = career_df.drop(['rowType'], axis=1).groupby(['playerID']).agg({
    'yearID': 'count',
    'G': 'sum',
    'AB': 'sum',
    'R': 'sum',
    'H': 'sum',
    'B2': 'sum',
    'B3': 'sum',
    'HR': 'sum',
    'RBI': 'sum',
    'SB': 'sum',
    'CS': 'sum',
    'BB': 'sum',
    'SO': 'sum',
    'IBB': 'sum',
    'HBP': 'sum',
    'SH': 'sum',
    'SF': 'sum',
    'GiDP': 'sum',
    'age': 'mean',
    'PA': 'sum',
    'B1': 'sum',
    'TB': 'sum',
    'SBA': 'sum',
    'XBH': 'sum',
    'TOB': 'sum',
    'RP': 'sum'
}).reset_index()

career_df['age'] = career_df['age'].round(0).astype(int)
career_df['rowType'] = 'career'

career_df

Unnamed: 0,playerID,yearID,G,AB,R,H,B2,B3,HR,RBI,...,GiDP,age,PA,B1,TB,SBA,XBH,TOB,RP,rowType
0,aardsda01,9,331,4,0,0,0,0,0,0.0,...,0.0,27,5.0,0,0,0.0,0,0.0,0.0,career
1,aaronha01,23,3298,12364,2174,3771,624,98,755,2297.0,...,328.0,31,13940.0,2294,6856,313.0,1477,5205.0,3716.0,career
2,aaronto01,7,437,944,102,216,42,6,13,94.0,...,36.0,27,1045.0,155,309,17.0,61,302.0,183.0,career
3,aasedo01,13,448,5,0,0,0,0,0,0.0,...,0.0,29,5.0,0,0,0.0,0,0.0,0.0,career
4,abadan01,3,15,21,1,2,0,0,0,0.0,...,1.0,30,25.0,2,2,1.0,0,6.0,1.0,career
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20725,zupofr01,3,16,18,3,3,1,0,0,0.0,...,0.0,19,20.0,2,4,0.0,1,5.0,3.0,career
20726,zuvelpa01,9,209,491,41,109,17,2,2,20.0,...,8.0,27,545.0,88,136,2.0,21,145.0,59.0,career
20727,zuverge01,8,266,142,5,21,2,1,0,7.0,...,3.0,30,167.0,18,25,1.0,3,30.0,12.0,career
20728,zwilldu01,4,366,1280,167,364,76,15,30,202.0,...,0.0,25,1443.0,243,560,46.0,121,496.0,339.0,career


In [None]:
#create rows for 162 Game Avg.
avg162_df = career_df.drop(['rowType'], axis=1 ).copy()
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'yearID'])


avg162_df[stat_cols] = avg162_df[stat_cols].div(avg162_df['G'], axis=0).mul(162).round(2)

avg162_df['rowType'] = '162Avg'

avg162_df

Unnamed: 0,playerID,yearID,G,AB,R,H,B2,B3,HR,RBI,...,GiDP,age,PA,B1,TB,SBA,XBH,TOB,RP,rowType
0,aardsda01,9,162.0,1.96,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,27,2.45,0.00,0.00,0.00,0.00,0.00,0.00,162Avg
1,aaronha01,23,162.0,607.33,106.79,185.23,30.65,4.81,37.09,112.83,...,16.11,31,684.74,112.68,336.77,15.37,72.55,255.67,182.53,162Avg
2,aaronto01,7,162.0,349.95,37.81,80.07,15.57,2.22,4.82,34.85,...,13.35,27,387.39,57.46,114.55,6.30,22.61,111.95,67.84,162Avg
3,aasedo01,13,162.0,1.81,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,29,1.81,0.00,0.00,0.00,0.00,0.00,0.00,162Avg
4,abadan01,3,162.0,226.80,10.80,21.60,0.00,0.00,0.00,0.00,...,10.80,30,270.00,21.60,21.60,10.80,0.00,64.80,10.80,162Avg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20725,zupofr01,3,162.0,182.25,30.38,30.38,10.12,0.00,0.00,0.00,...,0.00,19,202.50,20.25,40.50,0.00,10.12,50.62,30.38,162Avg
20726,zuvelpa01,9,162.0,380.58,31.78,84.49,13.18,1.55,1.55,15.50,...,6.20,27,422.44,68.21,105.42,1.55,16.28,112.39,45.73,162Avg
20727,zuverge01,8,162.0,86.48,3.05,12.79,1.22,0.61,0.00,4.26,...,1.83,30,101.71,10.96,15.23,0.61,1.83,18.27,7.31,162Avg
20728,zwilldu01,4,162.0,566.56,73.92,161.11,33.64,6.64,13.28,89.41,...,0.00,25,638.70,107.56,247.87,20.36,53.56,219.54,150.05,162Avg


In [None]:
#create rows for 600 PA Avg.
avg600_df = career_df.drop(['rowType'], axis=1 ).copy()
stat_cols = avg600_df.columns.difference(['playerID', 'age', 'yearID'])


avg600_df[stat_cols] = avg600_df[stat_cols].div(avg600_df['PA'], axis=0).mul(600).round(2)

avg600_df['rowType'] = '600Avg'

avg600_df

Unnamed: 0,playerID,yearID,G,AB,R,H,B2,B3,HR,RBI,...,GiDP,age,PA,B1,TB,SBA,XBH,TOB,RP,rowType
0,aardsda01,9,39720.00,480.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,27,600.0,0.00,0.00,0.00,0.00,0.00,0.00,600Avg
1,aaronha01,23,141.95,532.17,93.57,162.31,26.86,4.22,32.50,98.87,...,14.12,31,600.0,98.74,295.09,13.47,63.57,224.03,159.94,600Avg
2,aaronto01,7,250.91,542.01,58.56,124.02,24.11,3.44,7.46,53.97,...,20.67,27,600.0,89.00,177.42,9.76,35.02,173.40,105.07,600Avg
3,aasedo01,13,53760.00,600.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,29,600.0,0.00,0.00,0.00,0.00,0.00,0.00,600Avg
4,abadan01,3,360.00,504.00,24.00,48.00,0.00,0.00,0.00,0.00,...,24.00,30,600.0,48.00,48.00,24.00,0.00,144.00,24.00,600Avg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20725,zupofr01,3,480.00,540.00,90.00,90.00,30.00,0.00,0.00,0.00,...,0.00,19,600.0,60.00,120.00,0.00,30.00,150.00,90.00,600Avg
20726,zuvelpa01,9,230.09,540.55,45.14,120.00,18.72,2.20,2.20,22.02,...,8.81,27,600.0,96.88,149.72,2.20,23.12,159.63,64.95,600Avg
20727,zuverge01,8,955.69,510.18,17.96,75.45,7.19,3.59,0.00,25.15,...,10.78,30,600.0,64.67,89.82,3.59,10.78,107.78,43.11,600Avg
20728,zwilldu01,4,152.18,532.22,69.44,151.35,31.60,6.24,12.47,83.99,...,0.00,25,600.0,101.04,232.85,19.13,50.31,206.24,140.96,600Avg


In [None]:
#create rows for season average
season_avg_df = player_batting.copy()
season_avg_df = season_avg_df.drop(['rowType'], axis=1).groupby(['playerID']).agg({
    'yearID': 'count',
    'G': 'mean',
    'AB': 'mean',
    'R': 'mean',
    'H': 'mean',
    'B2': 'mean',
    'B3': 'mean',
    'HR': 'mean',
    'RBI': 'mean',
    'SB': 'mean',
    'CS': 'mean',
    'BB': 'mean',
    'SO': 'mean',
    'IBB': 'mean',
    'HBP': 'mean',
    'SH': 'mean',
    'SF': 'mean',
    'GiDP': 'mean',
    'age': 'mean',
    'PA': 'mean',
    'B1': 'mean',
    'TB': 'mean',
    'SBA': 'mean',
    'XBH': 'mean',
    'TOB': 'mean',
    'RP': 'mean'
}).round(2).reset_index()

season_avg_df['age'] = season_avg_df['age'].round(0).astype(int)
season_avg_df['rowType'] = 'seasAvg'

season_avg_df

Unnamed: 0,playerID,yearID,G,AB,R,H,B2,B3,HR,RBI,...,GiDP,age,PA,B1,TB,SBA,XBH,TOB,RP,rowType
0,aardsda01,9,36.78,0.44,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,27,0.56,0.00,0.00,0.00,0.00,0.00,0.00,seasAvg
1,aaronha01,23,143.39,537.57,94.52,163.96,27.13,4.26,32.83,99.87,...,14.26,31,606.09,99.74,298.09,13.61,64.22,226.30,161.57,seasAvg
2,aaronto01,7,62.43,134.86,14.57,30.86,6.00,0.86,1.86,13.43,...,5.14,27,149.29,22.14,44.14,2.43,8.71,43.14,26.14,seasAvg
3,aasedo01,13,34.46,0.38,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,29,0.38,0.00,0.00,0.00,0.00,0.00,0.00,seasAvg
4,abadan01,3,5.00,7.00,0.33,0.67,0.00,0.00,0.00,0.00,...,0.33,30,8.33,0.67,0.67,0.33,0.00,2.00,0.33,seasAvg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20725,zupofr01,3,5.33,6.00,1.00,1.00,0.33,0.00,0.00,0.00,...,0.00,19,6.67,0.67,1.33,0.00,0.33,1.67,1.00,seasAvg
20726,zuvelpa01,9,23.22,54.56,4.56,12.11,1.89,0.22,0.22,2.22,...,0.89,27,60.56,9.78,15.11,0.22,2.33,16.11,6.56,seasAvg
20727,zuverge01,8,33.25,17.75,0.62,2.62,0.25,0.12,0.00,0.88,...,0.38,30,20.88,2.25,3.12,0.12,0.38,3.75,1.50,seasAvg
20728,zwilldu01,4,91.50,320.00,41.75,91.00,19.00,3.75,7.50,50.50,...,0.00,25,360.75,60.75,140.00,11.50,30.25,124.00,84.75,seasAvg


In [None]:
all_hitting = pd.concat([player_batting, career_df, avg162_df, avg600_df, season_avg_df], ignore_index=True)

#ratios and percentages for all
all_hitting['BA'] = BasicHitting.calc_ba(all_hitting)
all_hitting['HRAB'] = BasicHitting.calc_hrab(all_hitting)
all_hitting['HRH'] = BasicHitting.calc_hrh(all_hitting)
all_hitting['OBP'] = BasicHitting.calc_obp(all_hitting)
all_hitting['SLGp'] = BasicHitting.calc_slg(all_hitting)
all_hitting['OPS'] = BasicHitting.calc_ops(all_hitting)
all_hitting['XBHp'] = BasicHitting.calc_xbh_pct(all_hitting)
all_hitting['EqA'] = BasicHitting.calc_eqa(all_hitting)
all_hitting['ISO'] = BasicHitting.calc_iso(all_hitting)
all_hitting['GPA'] = BasicHitting.calc_gpa(all_hitting)
all_hitting['PASO'] = BasicHitting.calc_paso(all_hitting)
all_hitting['BBK'] = BasicHitting.calc_bbk(all_hitting)
all_hitting['RC'] = BasicHitting.calc_rc(all_hitting)
all_hitting['BR'] = BasicHitting.calc_br(all_hitting)
all_hitting['SBp'] = BasicHitting.calc_sbp(all_hitting)
all_hitting['TA'] = BasicHitting.calc_ta(all_hitting)
all_hitting['BABIP'] = BasicHitting.calc_babip(all_hitting)

all_hitting

KeyError: 'GIDP'