In [1]:
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import String, text

# Make the project-local maths package importable before importing from it.
sys.path.append('../maths/')
from baseball_stats import BasicFielding

In [None]:
# Load the raw People and Fielding tables; only the birth-date columns of
# People are needed to compute each player's age.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_fielding = pd.read_csv('../datafiles/Fielding.csv', encoding='latin-1')
people = import_people.copy()

# Fill missing birth-date parts with placeholders (year 1875, month 1, day 1)
# so that every player has integer year/month/day values.
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

# Create a birthdate column so we can calculate an age at the start of a season.
# pd.to_datetime assembles the date from year/month/day columns in one
# vectorized call; impossible dates become NaT because of errors='coerce'.
people['birthdate'] = pd.to_datetime(
    people[['birthYear', 'birthMonth', 'birthDay']].rename(
        columns={'birthYear': 'year', 'birthMonth': 'month', 'birthDay': 'day'}
    ),
    errors='coerce',
)

# Merge the people and fielding dataframes (inner join: only players that
# appear in both tables are kept).
fielding = people.copy().merge(import_fielding, on='playerID', how='inner')

# April 1st of the season year is used as the reference date for the age.
fielding['season_start'] = pd.to_datetime(
    fielding['yearID'].astype(str) + '-04-01', errors='coerce'
)

# Whole years between birthdate and season start. Iterating the two columns
# directly avoids building a Series for every row as DataFrame.apply(axis=1) does.
fielding['age'] = [
    relativedelta(season_start, birthdate).years
    for season_start, birthdate in zip(fielding['season_start'], fielding['birthdate'])
]

# Drop the helper date columns and the per-stint/team/position context so the
# remaining columns can be summed per player-season.
fielding = fielding.drop(columns=['birthYear', 'birthMonth', 'birthdate', 'birthDay', 'season_start', 'stint', 'teamID', 'lgID', 'ZR', 'POS'])
fielding = fielding.rename(columns={'yearID': 'Years', 'InnOuts': 'IPO', 'SB': 'SBA', 'CS': 'CCS'})

context_cols = ['playerID', 'age', 'Years']
data_cols = fielding.columns.difference(context_cols)

# Missing stats count as zero; context columns go first, stats after.
fielding[data_cols] = fielding[data_cols].fillna(0)
fielding = fielding[context_cols + data_cols.tolist()]

# One row per player, age and season, summing the stat columns.
fielding = fielding.groupby(['playerID', 'age', 'Years']).sum().reset_index()

# Derived totals are added by the project helper.
fielding = BasicFielding.fieldingSums(fielding)

Unnamed: 0,playerID,age,Years,A,CCS,DP,E,G,GS,IPO,PB,PO,SBA,WP,IP,TC,CA,SBAT
0,aardsda01,22,2004,0,0.0,0,0.0,11,0.0,32.0,0.0,0,0.0,0.0,10.667,0.0,0,0.0
1,aardsda01,24,2006,5,0.0,1,0.0,45,0.0,159.0,0.0,1,0.0,0.0,53.0,6.0,6,0.0
2,aardsda01,25,2007,4,0.0,0,1.0,25,0.0,97.0,0.0,2,0.0,0.0,32.333,7.0,6,0.0
3,aardsda01,26,2008,6,0.0,0,0.0,47,0.0,146.0,0.0,3,0.0,0.0,48.667,9.0,9,0.0
4,aardsda01,27,2009,5,0.0,1,0.0,73,0.0,214.0,0.0,2,0.0,0.0,71.333,7.0,7,0.0
5,aardsda01,28,2010,3,0.0,0,1.0,53,0.0,149.0,0.0,2,0.0,0.0,49.667,6.0,5,0.0
6,aardsda01,30,2012,0,0.0,0,0.0,1,0.0,3.0,0.0,0,0.0,0.0,1.0,0.0,0,0.0
7,aardsda01,31,2013,5,0.0,0,0.0,43,0.0,119.0,0.0,1,0.0,0.0,39.667,6.0,6,0.0
8,aardsda01,33,2015,1,0.0,0,1.0,33,0.0,92.0,0.0,0,0.0,0.0,30.667,2.0,1,0.0
9,aaronha01,20,1954,5,0.0,0,7.0,116,113.0,3093.0,0.0,223,0.0,0.0,1031.0,235.0,228,0.0


In [15]:
# Collapse the per-season rows into one career row per player.
career_df = fielding.copy()

# Stat columns that are totalled across a player's rows. The order of this
# list is the column order of the aggregated frame. Listing each column once
# avoids the duplicate 'IPO' key the dict literal used to contain.
summed_cols = [
    'A', 'CCS', 'DP', 'E', 'G', 'GS', 'IPO', 'PB', 'PO',
    'SBA', 'WP', 'IP', 'TC', 'CA', 'SBAT',
]

# 'age' becomes the mean age over the player's rows and 'Years' becomes the
# number of rows (non-null 'Years' values) for that player.
career_aggs = {'age': 'mean', 'Years': 'count'}
career_aggs.update({col: 'sum' for col in summed_cols})

career_df = career_df.groupby(['playerID']).agg(career_aggs).reset_index()

# The mean age is reported as a whole number.
career_df['age'] = career_df['age'].round(0).astype(int)

In [16]:
# Scale each player's career totals to a 162-game pace.
avg162_df = career_df.copy()

# Every column except the identifying ones is scaled (this includes 'G' itself).
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])

# Only rows with at least one game take part in the division; on assignment
# the remaining rows are aligned by index and end up as NaN.
has_games = avg162_df['G'] > 0
per_162_games = (
    avg162_df.loc[has_games, stat_cols]
    .div(avg162_df['G'], axis=0)
    .mul(162)
    .round(2)
)
avg162_df[stat_cols] = per_162_games
avg162_df.insert(0, 'rowType', '162Avg')


In [17]:
# Express each player's career totals per 9 innings played.
norm_df = career_df.copy()

# Every column except the identifying ones is scaled (this includes 'IP' itself).
stat_cols = norm_df.columns.difference(['playerID', 'age', 'Years'])

# Only rows with a positive 'IP' take part in the division; on assignment
# the remaining rows are aligned by index and end up as NaN.
has_innings = norm_df['IP'] > 0
per_nine_innings = (
    norm_df.loc[has_innings, stat_cols]
    .div(norm_df['IP'], axis=0)
    .mul(9)
    .round(2)
)
norm_df[stat_cols] = per_nine_innings
norm_df.insert(0, 'rowType', 'normalized')

In [None]:
# Merge all row types except the season average, then calculate the
# percentages and ratios.

# Tag every per-season row; season_df is a fresh copy each time the cell runs.
season_df = fielding.copy()
season_df.insert(0, 'rowType', 'season')

# career_df comes from an earlier cell and is tagged in place. Guard the
# insert so that re-running this cell does not raise
# "cannot insert rowType, already exists".
if 'rowType' not in career_df.columns:
    career_df.insert(0, 'rowType', 'career')

# Stack season, career, 162-game and per-9-innings rows into one frame.
most_fielding = pd.concat([season_df, career_df, avg162_df, norm_df], ignore_index=True)

# Ratio columns are added by the project helper.
most_fielding = BasicFielding.fieldingRatios(most_fielding)

Unnamed: 0,rowType,playerID,age,Years,A,CCS,DP,E,G,GS,...,WP,IP,TC,CA,SBAT,Fp,RF,RF9,Sp,SBp
0,season,aardsda01,22,2004,0.0,0.0,0.0,0.0,11.0,0.0,...,0.0,10.667,0.0,0.0,0.0,,0.0,0.0,0.0,
1,season,aardsda01,24,2006,5.0,0.0,1.0,0.0,45.0,0.0,...,0.0,53.0,6.0,6.0,0.0,1.0,0.133,1.019,0.0,
2,season,aardsda01,25,2007,4.0,0.0,0.0,1.0,25.0,0.0,...,0.0,32.333,7.0,6.0,0.0,0.857,0.24,1.67,0.0,
3,season,aardsda01,26,2008,6.0,0.0,0.0,0.0,47.0,0.0,...,0.0,48.667,9.0,9.0,0.0,1.0,0.191,1.664,0.0,
4,season,aardsda01,27,2009,5.0,0.0,1.0,0.0,73.0,0.0,...,0.0,71.333,7.0,7.0,0.0,1.0,0.096,0.883,0.0,
5,season,aardsda01,28,2010,3.0,0.0,0.0,1.0,53.0,0.0,...,0.0,49.667,6.0,5.0,0.0,0.833,0.094,0.906,0.0,
6,season,aardsda01,30,2012,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,
7,season,aardsda01,31,2013,5.0,0.0,0.0,0.0,43.0,0.0,...,0.0,39.667,6.0,6.0,0.0,1.0,0.14,1.361,0.0,
8,season,aardsda01,33,2015,1.0,0.0,0.0,1.0,33.0,0.0,...,0.0,30.667,2.0,1.0,0.0,0.5,0.03,0.293,0.0,
9,season,aaronha01,20,1954,5.0,0.0,0.0,7.0,116.0,113.0,...,0.0,1031.0,235.0,228.0,0.0,0.97,1.966,1.99,0.974,


In [None]:
# Build one "season average" row per player from that player's season rows.
is_season_row = most_fielding['rowType'] == 'season'
season_rows = most_fielding[is_season_row].copy()

# Mean of every remaining column per player, rounded to 4 decimals.
season_avg_df = (
    season_rows
    .drop(['rowType'], axis=1)
    .groupby(['playerID'])
    .mean()
    .round(4)
    .reset_index()
)

# 'age' and 'Years' are reported as whole numbers.
# NOTE(review): 'Years' here is the mean of the season-year values, whereas on
# career rows it is a count of seasons - confirm this is intended.
for whole_number_col in ('age', 'Years'):
    season_avg_df[whole_number_col] = season_avg_df[whole_number_col].round(0).astype(int)
season_avg_df.insert(0, 'rowType', 'seasAvg')

# Append the season-average rows after all other row types.
final_fielding = pd.concat([most_fielding, season_avg_df], ignore_index=True)

Unnamed: 0,rowType,playerID,age,Years,A,CCS,DP,E,G,GS,...,WP,IP,TC,CA,SBAT,Fp,RF,RF9,Sp,SBp
73847,season,piazzmi01,23,1992,7.0,4.0,1.0,1.0,16.0,16.0,...,0.0,139.333,102.0,101.0,14.0,0.99,6.312,6.524,1.0,0.286
73848,season,piazzmi01,24,1993,99.0,59.0,11.0,11.0,147.0,141.0,...,0.0,1244.333,1011.0,1000.0,167.0,0.989,6.803,7.233,0.959,0.353
73849,season,piazzmi01,25,1994,38.0,26.0,7.0,10.0,104.0,99.0,...,0.0,860.667,688.0,678.0,102.0,0.985,6.519,7.09,0.952,0.255
73850,season,piazzmi01,26,1995,52.0,29.0,8.0,9.0,112.0,110.0,...,0.0,941.0,866.0,857.0,117.0,0.99,7.652,8.197,0.982,0.248
73851,season,piazzmi01,27,1996,70.0,34.0,6.0,9.0,146.0,144.0,...,0.0,1255.667,1134.0,1125.0,189.0,0.992,7.705,8.063,0.986,0.18
73852,season,piazzmi01,28,1997,74.0,43.0,10.0,16.0,139.0,139.0,...,0.0,1200.0,1135.0,1119.0,155.0,0.986,8.05,8.392,1.0,0.277
73853,season,piazzmi01,29,1998,85.0,41.0,7.0,11.0,140.0,140.0,...,0.0,1190.0,1080.0,1069.0,156.0,0.99,7.636,8.085,1.0,0.263
73854,season,piazzmi01,30,1999,47.0,37.0,5.0,11.0,137.0,135.0,...,0.0,1156.667,1011.0,1000.0,152.0,0.989,7.299,7.781,0.985,0.243
73855,season,piazzmi01,31,2000,38.0,32.0,10.0,3.0,124.0,124.0,...,0.0,1027.667,903.0,900.0,142.0,0.997,7.258,7.882,1.0,0.225
73856,season,piazzmi01,32,2001,58.0,33.0,5.0,9.0,131.0,128.0,...,0.0,1085.333,986.0,977.0,147.0,0.991,7.458,8.102,0.977,0.224


In [None]:
# Write final_fielding to MySQL as table player_fielding_new and index the
# columns used for lookups.
load_dotenv()

# Connection settings are read from the environment (load_dotenv() above
# loads them from a .env file when one is present).
db_user = os.getenv('jbbs_db_user')
db_pass = os.getenv('jbbs_db_password')
db_host = os.getenv('jbbs_db_host')
db_name = os.getenv('jbbs_db_name')

# An unset variable would otherwise be formatted into the URL as the literal
# text "None" and only surface later as a confusing connection error.
missing_settings = [
    name
    for name, value in (
        ('jbbs_db_user', db_user),
        ('jbbs_db_password', db_pass),
        ('jbbs_db_host', db_host),
        ('jbbs_db_name', db_name),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(f"Missing database settings: {', '.join(missing_settings)}")

engine = create_engine(
    f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}/{db_name}',
    echo=False,
    pool_size=5,      # Maximum number of connections in the pool
    max_overflow=0,   # Prevents creating more connections than `pool_size`
    pool_recycle=600,
    pool_pre_ping=True
)

# pandas writes string columns as TEXT by default, and MySQL refuses to index
# a TEXT column without a key length. Store the indexed string columns as
# VARCHAR sized to the longest value actually present.
varchar_types = {}
for string_col in ('playerID', 'rowType'):
    longest = final_fielding[string_col].astype(str).str.len().max()
    varchar_types[string_col] = String(max(int(longest), 1) if pd.notna(longest) else 1)

index_specs = [
    ('idx_playerid', 'playerID'),
    ('idx_rowType', 'rowType'),
    ('idx_age', 'age'),
    ('idx_years', 'Years'),
]

# engine.begin() opens one connection, commits on success and rolls back on
# error; the table write and the index creation both use that connection.
with engine.begin() as conn:
    final_fielding.to_sql(
        'player_fielding_new',
        con=conn,
        if_exists='replace',
        index=False,
        chunksize=500,
        dtype=varchar_types,
    )
    for index_name, column in index_specs:
        # Plain strings are not executable in SQLAlchemy 2.x; wrap them in text().
        conn.execute(text(f'create index {index_name} on player_fielding_new({column});'))
