In [3]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicHitting


In [4]:
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_batting = pd.read_csv('../datafiles/Batting.csv', encoding='latin-1')
people = import_people.copy()

#fix fucky dates
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

#create a birthdate column so we can calulate an age at start of season
people['birthdate'] = people.apply(lambda x: f"""{x['birthYear']}-{x['birthMonth']}-{x['birthDay']}""", axis=1)
people['birthdate'] = pd.to_datetime(people['birthdate'], errors='coerce')

#merge the people and batting dataframes
batting = people.copy().merge(import_batting, on='playerID', how ='inner')

#make sure the raw stats are 0 and not null
batting.loc[:, ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP']].fillna(0, inplace=True)

In [5]:
player_batting = batting.copy()
player_batting = player_batting.drop(columns=['stint', 'teamID', 'lgID', 'birthYear', 'birthMonth', 'birthDay', 'G_old', 'G_batting'], axis=1).groupby(['playerID', 'birthdate', 'yearID']).sum().reset_index()
#estimate the start of the season at april 1
player_batting['season_start'] = player_batting.apply(lambda x: f"""{x['yearID']}-04-01""", axis=1)
player_batting['season_start'] = pd.to_datetime(player_batting['season_start'], errors='coerce')

#calculate the age of the player at the start of the season
player_batting['age'] = player_batting.apply(lambda x: relativedelta(x['season_start'], x['birthdate']).years, axis=1)
player_batting.drop(columns=['birthdate', 'season_start'], inplace=True)
player_batting['rowType'] = 'season'
player_batting.rename(columns={'3B':'B3', '2B':'B2', 'GIDP':'GiDP', 'SO':'K', 'yearID':'Years'}, inplace=True)

#adding shit up
player_batting['PA'] = BasicHitting.calc_pa(player_batting)
player_batting['B1'] = BasicHitting.calc_b1(player_batting)
player_batting['TB'] = BasicHitting.calc_tb(player_batting)
player_batting['SBA'] = BasicHitting.calc_sba(player_batting)
player_batting['XBH'] = BasicHitting.calc_xbh(player_batting)
player_batting['TOB'] = BasicHitting.calc_tob(player_batting)
player_batting['RP'] = BasicHitting.calc_rp(player_batting)


In [6]:
#create rows for career
career_df = player_batting.copy()
career_df = career_df.drop(['rowType'], axis=1).groupby(['playerID']).agg({
    'Years': 'count',
    'G': 'sum',
    'AB': 'sum',
    'R': 'sum',
    'H': 'sum',
    'B2': 'sum',
    'B3': 'sum',
    'HR': 'sum',
    'RBI': 'sum',
    'SB': 'sum',
    'CS': 'sum',
    'BB': 'sum',
    'K': 'sum',
    'IBB': 'sum',
    'HBP': 'sum',
    'SH': 'sum',
    'SF': 'sum',
    'GiDP': 'sum',
    'age': 'mean',
    'PA': 'sum',
    'B1': 'sum',
    'TB': 'sum',
    'SBA': 'sum',
    'XBH': 'sum',
    'TOB': 'sum',
    'RP': 'sum'
}).reset_index()

career_df['age'] = career_df['age'].round(0).astype(int)
career_df['rowType'] = 'career'

In [14]:
#create rows for 162 Game Avg.
avg162_df = career_df.drop(['rowType'], axis=1 ).copy()
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])


avg162_df[stat_cols] = avg162_df[avg162_df['G'] > 0][stat_cols].div(avg162_df['G'], axis=0).mul(162).round(2)

avg162_df['rowType'] = '162Avg'


In [None]:
#create rows for 600 PA Avg.
avg600_df = career_df.drop(['rowType'], axis=1).copy()
stat_cols = avg600_df.columns.difference(['playerID', 'age', 'Years'])


avg600_df[stat_cols] = avg600_df[avg600_df['PA'] > 0][stat_cols].div(avg600_df['PA'], axis=0).mul(600).round(2)

avg600_df['rowType'] = '600Avg'


In [9]:
#create rows for season average
season_avg_df = player_batting.copy()
season_avg_df = season_avg_df.drop(['rowType'], axis=1).groupby(['playerID']).agg({
    'Years': 'count',
    'G': 'mean',
    'AB': 'mean',
    'R': 'mean',
    'H': 'mean',
    'B2': 'mean',
    'B3': 'mean',
    'HR': 'mean',
    'RBI': 'mean',
    'SB': 'mean',
    'CS': 'mean',
    'BB': 'mean',
    'K': 'mean',
    'IBB': 'mean',
    'HBP': 'mean',
    'SH': 'mean',
    'SF': 'mean',
    'GiDP': 'mean',
    'age': 'mean',
    'PA': 'mean',
    'B1': 'mean',
    'TB': 'mean',
    'SBA': 'mean',
    'XBH': 'mean',
    'TOB': 'mean',
    'RP': 'mean'
}).round(2).reset_index()

season_avg_df['age'] = season_avg_df['age'].round(0).astype(int)
season_avg_df['rowType'] = 'seasAvg'


In [10]:
all_hitting = pd.concat([player_batting, career_df, avg162_df, avg600_df, season_avg_df], ignore_index=True)

#ratios and percentages for all
all_hitting['BA'] = BasicHitting.calc_ba(all_hitting).round(4)
all_hitting['HRAB'] = BasicHitting.calc_hrab(all_hitting).round(4)
all_hitting['HRH'] = BasicHitting.calc_hrh(all_hitting).round(4)
all_hitting['OBP'] = BasicHitting.calc_obp(all_hitting).round(4)
all_hitting['SLGp'] = BasicHitting.calc_slg(all_hitting).round(4)
all_hitting['OPS'] = BasicHitting.calc_ops(all_hitting).round(4)
all_hitting['XBHp'] = BasicHitting.calc_xbh_pct(all_hitting).round(4)
all_hitting['EqA'] = BasicHitting.calc_eqa(all_hitting).round(4)
all_hitting['ISO'] = BasicHitting.calc_iso(all_hitting).round(4)
all_hitting['GPA'] = BasicHitting.calc_gpa(all_hitting).round(4)
all_hitting['PASO'] = BasicHitting.calc_paso(all_hitting).round(4)
all_hitting['BBK'] = BasicHitting.calc_bbk(all_hitting).round(4)
all_hitting['RC'] = BasicHitting.calc_rc(all_hitting).round(4)
all_hitting['BR'] = BasicHitting.calc_br(all_hitting).round(4)
all_hitting['SBp'] = BasicHitting.calc_sbp(all_hitting).round(4)
all_hitting['TA'] = BasicHitting.calc_ta(all_hitting).round(4)
all_hitting['BABIP'] = BasicHitting.calc_babip(all_hitting).round(4)

#reorder th columns
first_cols = ['rowType', 'playerID', 'age', 'Years']
addl_cols = all_hitting.columns.difference(first_cols).tolist()

all_hitting = all_hitting[first_cols + addl_cols]


In [11]:
load_dotenv()

db_user=os.getenv('jbbs_db_user')
db_pass=os.getenv('jbbs_db_password')
db_host=os.getenv('jbbs_db_host')
db_name=os.getenv('jbbs_db_name')
#all_hitting.to_csv('../outputs/plasyer_hitting.csv.gz', encoding='utf-8', index=False, compression='gzip')

engine = create_engine(f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}/{db_name}', echo=False)
all_hitting.to_sql('test', con=engine, if_exists='replace', index=False)


ProgrammingError: (mysql.connector.errors.ProgrammingError) 1054 (42S22): Unknown column 'inf' in 'SELECT'
[SQL: INSERT INTO test (`rowType`, `playerID`, age, `Years`, `AB`, `B1`, `B2`, `B3`, `BA`, `BABIP`, `BB`, `BBK`, `BR`, `CS`, `EqA`, `G`, `GPA`, `GiDP`, `H`, `HBP`, `HR`, `HRAB`, `HRH`, `IBB`, `ISO`, `K`, `OBP`, `OPS`, `PA`, `PASO`, `R`, `RBI`, `RC`, `RP`, `SB`, `SBA`, `SBp`, `SF`, `SH`, `SLGp`, `TA`, `TB`, `TOB`, `XBH`, `XBHp`) VALUES (%(rowType)s, %(playerID)s, %(age)s, %(Years)s, %(AB)s, %(B1)s, %(B2)s, %(B3)s, %(BA)s, %(BABIP)s, %(BB)s, %(BBK)s, %(BR)s, %(CS)s, %(EqA)s, %(G)s, %(GPA)s, %(GiDP)s, %(H)s, %(HBP)s, %(HR)s, %(HRAB)s, %(HRH)s, %(IBB)s, %(ISO)s, %(K)s, %(OBP)s, %(OPS)s, %(PA)s, %(PASO)s, %(R)s, %(RBI)s, %(RC)s, %(RP)s, %(SB)s, %(SBA)s, %(SBp)s, %(SF)s, %(SH)s, %(SLGp)s, %(TA)s, %(TB)s, %(TOB)s, %(XBH)s, %(XBHp)s)]
[parameters: [{'rowType': 'season', 'playerID': 'aardsda01', 'age': 22, 'Years': 2004, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 11.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 24, 'Years': 2006, 'AB': 2.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': 0.0, 'BABIP': 0.0, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': 0.333, 'G': 45.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': 0.0, 'HRH': None, 'IBB': 0.0, 'ISO': 0.0, 'K': 0.0, 'OBP': 0.0, 'OPS': 0.0, 'PA': 3.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': -0.1688, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 1.0, 'SLGp': 0.0, 'TA': 0.0, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 25, 'Years': 2007, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 25.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 26, 'Years': 2008, 'AB': 1.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': 0.0, 'BABIP': None, 'BB': 0.0, 'BBK': 0.0, 'BR': None, 'CS': 0.0, 'EqA': 0.0, 'G': 47.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': 0.0, 'HRH': None, 'IBB': 0.0, 'ISO': 0.0, 'K': 1.0, 'OBP': 0.0, 'OPS': 0.0, 'PA': 1.0, 'PASO': 1.0, 'R': 0.0, 'RBI': 0.0, 'RC': -0.1107, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': 0.0, 'TA': 0.0, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 27, 'Years': 2009, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 73.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 28, 'Years': 2010, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 53.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 30, 'Years': 2012, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 1.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}, {'rowType': 'season', 'playerID': 'aardsda01', 'age': 31, 'Years': 2013, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 43.0, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}  ... displaying 10 of 188070 total bound parameter sets ...  {'rowType': 'seasAvg', 'playerID': 'zwilldu01', 'age': 25, 'Years': 4, 'AB': 320.0, 'B1': 60.75, 'B2': 19.0, 'B3': 3.75, 'BA': 0.284, 'BABIP': 0.305, 'BB': 32.0, 'BBK': 0.826, 'BR': 50.421, 'CS': 0.0, 'EqA': 0.822, 'G': 91.5, 'GPA': 0.2643, 'GiDP': 0.0, 'H': 91.0, 'HBP': 1.0, 'HR': 7.5, 'HRAB': 0.0, 'HRH': 0.0, 'IBB': 0.0, 'ISO': 0.154, 'K': 38.75, 'OBP': 0.344, 'OPS': 0.782, 'PA': 360.75, 'PASO': 9.31, 'R': 41.75, 'RBI': 50.5, 'RC': 55.5108, 'RP': 84.75, 'SB': 11.5, 'SBA': 11.5, 'SBp': 1.0, 'SF': 0.0, 'SH': 7.75, 'SLGp': 0.438, 'TA': 0.8057, 'TB': 140.0, 'TOB': 124.0, 'XBH': 30.25, 'XBHp': 0.332}, {'rowType': 'seasAvg', 'playerID': 'zychto01', 'age': 25, 'Years': 3, 'AB': 0.0, 'B1': 0.0, 'B2': 0.0, 'B3': 0.0, 'BA': None, 'BABIP': None, 'BB': 0.0, 'BBK': None, 'BR': None, 'CS': 0.0, 'EqA': None, 'G': 23.33, 'GPA': None, 'GiDP': 0.0, 'H': 0.0, 'HBP': 0.0, 'HR': 0.0, 'HRAB': None, 'HRH': None, 'IBB': 0.0, 'ISO': None, 'K': 0.0, 'OBP': None, 'OPS': None, 'PA': 0.0, 'PASO': None, 'R': 0.0, 'RBI': 0.0, 'RC': 0.0, 'RP': 0.0, 'SB': 0.0, 'SBA': 0.0, 'SBp': None, 'SF': 0.0, 'SH': 0.0, 'SLGp': None, 'TA': None, 'TB': 0.0, 'TOB': 0.0, 'XBH': 0.0, 'XBHp': None}]]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
#test_hitting = all_hitting[all_hitting.isin([np.inf, -np.inf]).any(axis=1)]
#test_hitting

Unnamed: 0,rowType,playerID,age,Years,AB,B1,B2,B3,BA,BABIP,...,SBA,SBp,SF,SH,SLGp,TA,TB,TOB,XBH,XBHp
146620,600Avg,abbotan01,23,1,,,,,,,...,,,,,,,,,,
146624,600Avg,abbotgl01,27,11,,,,,,,...,,,,,,,,,,
146646,600Avg,abreual01,26,4,,,,,,,...,,,,,,,,,,
146648,600Avg,abreubr01,23,5,,,,,,,...,,,,,,,,,,
146651,600Avg,abreuju01,25,1,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167311,600Avg,zinkch01,28,1,,,,,,,...,,,,,,,,,,
167316,600Avg,zinsebi01,26,1,,,,,,,...,,,,,,,,,,
167331,600Avg,zumayjo01,23,5,,,,,,,...,,,,,,,,,,
167332,600Avg,zuniggu01,24,1,,,,,,,...,,,,,,,,,,
