In [1]:
import os
import sys
from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from dotenv import load_dotenv
from sqlalchemy import create_engine

# baseball_stats lives outside the notebook directory, so extend the path first
sys.path.append('../maths/')
from baseball_stats import BasicHitting


In [2]:
# Only the identity and birth-date columns are needed from People;
# Batting is read in full.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_batting = pd.read_csv('../datafiles/Batting.csv', encoding='latin-1')
people = import_people.copy()

# Patch incomplete birth dates: a missing year becomes 1875 and a missing
# month/day becomes 1, so every row has three integer date components.
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

# Create a birthdate column so an age at the start of each season can be
# calculated later. pd.to_datetime assembles the date directly from
# year/month/day columns (no row-wise string formatting needed); component
# combinations that do not form a valid date are coerced to NaT.
birth_parts = people[['birthYear', 'birthMonth', 'birthDay']].rename(
    columns={'birthYear': 'year', 'birthMonth': 'month', 'birthDay': 'day'}
)
people['birthdate'] = pd.to_datetime(birth_parts, errors='coerce')

# Merge the people and batting dataframes; the inner join keeps only
# batting rows whose playerID also appears in People.
batting = people.merge(import_batting, on='playerID', how='inner')

# Make sure the raw stats are 0 and not null.
# The filled values must be assigned back: calling fillna(inplace=True) on a
# `.loc[:, cols]` selection only modifies a temporary copy and leaves
# `batting` untouched.
raw_stat_cols = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP']
batting[raw_stat_cols] = batting[raw_stat_cols].fillna(0)

In [3]:
# Collapse the batting rows to one row per player and season: the columns
# listed below are dropped and everything left is summed within each
# (playerID, birthdate, yearID) group.
dropped_before_sum = ['stint', 'teamID', 'lgID', 'birthYear', 'birthMonth', 'birthDay', 'G_old', 'G_batting']
player_batting = (
    batting
    .drop(columns=dropped_before_sum)
    .groupby(['playerID', 'birthdate', 'yearID'])
    .sum()
    .reset_index()
)

# Estimate the start of every season as April 1 of that year.
player_batting['season_start'] = pd.to_datetime(
    player_batting['yearID'].astype(str) + '-04-01', errors='coerce'
)

# Age of the player at the start of the season, in whole years.
player_batting['age'] = [
    relativedelta(opening_day, born).years
    for opening_day, born in zip(player_batting['season_start'], player_batting['birthdate'])
]

# The date helpers are no longer needed once the age is known.
player_batting = player_batting.drop(columns=['birthdate', 'season_start'])
player_batting['rowType'] = 'season'
player_batting = player_batting.rename(
    columns={'3B': 'B3', '2B': 'B2', 'GIDP': 'GiDP', 'SO': 'K', 'yearID': 'Years'}
)

# Derived counting stats, added in a fixed order in case a later
# calculation reads a column produced by an earlier one.
counting_calcs = [
    ('PA', BasicHitting.calc_pa),
    ('B1', BasicHitting.calc_b1),
    ('TB', BasicHitting.calc_tb),
    ('SBA', BasicHitting.calc_sba),
    ('XBH', BasicHitting.calc_xbh),
    ('TOB', BasicHitting.calc_tob),
    ('RP', BasicHitting.calc_rp),
]
for stat_name, stat_calc in counting_calcs:
    player_batting[stat_name] = stat_calc(player_batting)


In [4]:
# Career rows: one row per player built from all of that player's season rows.
# 'Years' becomes the number of season rows, 'age' the mean of the season
# ages, and every other stat is summed.
career_raw_stats = ['G', 'AB', 'R', 'H', 'B2', 'B3', 'HR', 'RBI', 'SB', 'CS', 'BB', 'K', 'IBB', 'HBP', 'SH', 'SF', 'GiDP']
career_derived_stats = ['PA', 'B1', 'TB', 'SBA', 'XBH', 'TOB', 'RP']

# Insertion order of the mapping fixes the column order of the result.
career_aggs = {'Years': 'count'}
career_aggs.update(dict.fromkeys(career_raw_stats, 'sum'))
career_aggs['age'] = 'mean'
career_aggs.update(dict.fromkeys(career_derived_stats, 'sum'))

career_df = (
    player_batting
    .drop(columns='rowType')
    .groupby(['playerID'])
    .agg(career_aggs)
    .reset_index()
)

# Mean age is rounded to a whole number; the rows are tagged as career rows.
career_df = career_df.assign(
    age=career_df['age'].round(0).astype(int),
    rowType='career',
)

In [5]:
# 162-game average rows: career totals rescaled to a 162-game pace.
avg162_df = career_df.drop(columns='rowType').copy()
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])

# Only players with at least one game are rescaled; on assignment the
# remaining rows have no matching index and end up as NaN in every stat column.
career_games = avg162_df['G']
per_game = avg162_df.loc[career_games > 0, stat_cols].div(career_games, axis=0)
avg162_df[stat_cols] = per_game.mul(162).round(2)

avg162_df['rowType'] = '162Avg'


In [6]:
# 600-PA average rows: career totals rescaled to 600 plate appearances.
avg600_df = career_df.drop(columns='rowType').copy()
stat_cols = avg600_df.columns.difference(['playerID', 'age', 'Years'])

# Only players with at least one plate appearance are rescaled; on assignment
# the remaining rows have no matching index and end up as NaN in every stat column.
career_pa = avg600_df['PA']
per_pa = avg600_df.loc[career_pa > 0, stat_cols].div(career_pa, axis=0)
avg600_df[stat_cols] = per_pa.mul(600).round(2)

avg600_df['rowType'] = '600Avg'


In [7]:
# Season-average rows: one row per player holding the mean of each stat
# across that player's season rows; 'Years' is the number of season rows.
season_mean_cols = [
    'G', 'AB', 'R', 'H', 'B2', 'B3', 'HR', 'RBI', 'SB', 'CS', 'BB', 'K',
    'IBB', 'HBP', 'SH', 'SF', 'GiDP', 'age',
    'PA', 'B1', 'TB', 'SBA', 'XBH', 'TOB', 'RP',
]

# Insertion order of the mapping fixes the column order of the result.
season_aggs = {'Years': 'count'}
season_aggs.update(dict.fromkeys(season_mean_cols, 'mean'))

season_avg_df = (
    player_batting
    .drop(columns='rowType')
    .groupby(['playerID'])
    .agg(season_aggs)
    .round(2)
    .reset_index()
)

# Mean age is rounded to a whole number; the rows are tagged as season averages.
season_avg_df = season_avg_df.assign(
    age=season_avg_df['age'].round(0).astype(int),
    rowType='seasAvg',
)


In [8]:
# Stack the season, career, 162-game, 600-PA and season-average rows
# into a single frame with a fresh index.
hitting_frames = [player_batting, career_df, avg162_df, avg600_df, season_avg_df]
all_hitting = pd.concat(hitting_frames, ignore_index=True)

# Ratios and percentages for every row type, each rounded to 4 decimals.
# The order is kept fixed in case a later calculation reads a column
# produced by an earlier one.
ratio_calcs = [
    ('BA', BasicHitting.calc_ba),
    ('HRAB', BasicHitting.calc_hrab),
    ('HRH', BasicHitting.calc_hrh),
    ('OBp', BasicHitting.calc_obp),
    ('SLGp', BasicHitting.calc_slg),
    ('OPS', BasicHitting.calc_ops),
    ('XBHp', BasicHitting.calc_xbh_pct),
    ('EqA', BasicHitting.calc_eqa),
    ('ISO', BasicHitting.calc_iso),
    ('GPA', BasicHitting.calc_gpa),
    ('PASO', BasicHitting.calc_paso),
    ('BBK', BasicHitting.calc_bbk),
    ('RC', BasicHitting.calc_rc),
    ('BR', BasicHitting.calc_br),
    ('SBp', BasicHitting.calc_sbp),
    ('TA', BasicHitting.calc_ta),
    ('BABIP', BasicHitting.calc_babip),
]
for ratio_name, ratio_calc in ratio_calcs:
    all_hitting[ratio_name] = ratio_calc(all_hitting).round(4)

# Reorder the columns: identifying columns first, then all remaining
# columns in the sorted order returned by Index.difference.
first_cols = ['rowType', 'playerID', 'age', 'Years']
addl_cols = all_hitting.columns.difference(first_cols).tolist()

all_hitting = all_hitting[first_cols + addl_cols]


In [9]:
# Database credentials come from the environment (populated from a .env
# file by load_dotenv) so that nothing secret is stored in the notebook.
load_dotenv()

db_user=os.getenv('jbbs_db_user')
db_pass=os.getenv('jbbs_db_password')
db_host=os.getenv('jbbs_db_host')
db_name=os.getenv('jbbs_db_name')
#all_hitting.to_csv('../outputs/plasyer_hitting.csv.gz', encoding='utf-8', index=False, compression='gzip')

# Fail early with a clear message instead of building a URL that contains
# the literal text "None" for any setting that is not defined.
db_settings = {
    'jbbs_db_user': db_user,
    'jbbs_db_password': db_pass,
    'jbbs_db_host': db_host,
    'jbbs_db_name': db_name,
}
missing_settings = [env_name for env_name, env_value in db_settings.items() if env_value is None]
if missing_settings:
    raise RuntimeError(
        'Missing database environment variable(s): ' + ', '.join(missing_settings)
    )

# The user name and password are percent-escaped so that characters such as
# '@', ':', '/' or '#' cannot corrupt the connection URL.
engine = create_engine(
    f'mysql+mysqlconnector://{quote_plus(db_user)}:{quote_plus(db_pass)}@{db_host}/{db_name}',
    echo=False,
)

# Replace the target table with the full hitting frame; the DataFrame
# index is not written as a column.
all_hitting.to_sql('player_hitting_new', con=engine, if_exists='replace', index=False)


-1