In [12]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine, text
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicFielding

In [None]:
# Load the raw Lahman-style CSVs. Only the player id and the three birth-date
# parts are read from People.csv; Fielding.csv is read in full.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_fielding = pd.read_csv('../datafiles/Fielding.csv', encoding='latin-1')
people = import_people.copy()

# Missing birth-date parts are replaced with placeholders (1875 / 1 / 1) so
# every part can be cast to int and assembled into a date below.
birth_defaults = {'birthYear': 1875, 'birthMonth': 1, 'birthDay': 1}
people = people.fillna(birth_defaults).astype(dict.fromkeys(birth_defaults, int))

# Build a "Y-M-D" string per player and parse it; unparseable combinations
# become NaT because of errors='coerce'.
birth_text = (
    people['birthYear'].astype(str)
    + '-' + people['birthMonth'].astype(str)
    + '-' + people['birthDay'].astype(str)
)
people['birthdate'] = pd.to_datetime(birth_text, errors='coerce')

# Attach birth dates to every fielding row (inner join: fielding rows whose
# playerID is absent from People.csv are dropped).
fielding = people.merge(import_fielding, on='playerID', how='inner')

# Age is measured in whole years on April 1st of the season's year.
fielding['season_start'] = pd.to_datetime(fielding['yearID'].astype(str) + '-04-01', errors='coerce')
fielding['age'] = [
    relativedelta(season_start, birthdate).years
    for season_start, birthdate in zip(fielding['season_start'], fielding['birthdate'])
]

# Discard helper/date columns and per-stint identifiers, then switch to the
# column names used by the rest of this notebook.
fielding = (
    fielding
    .drop(columns=['birthYear', 'birthMonth', 'birthdate', 'birthDay', 'season_start', 'stint', 'teamID', 'lgID', 'ZR', 'POS'])
    .rename(columns={'yearID': 'Years', 'InnOuts': 'IPO', 'SB': 'SBA', 'CS': 'CCS', 'GS': 'S'})
)

# Key columns first, followed by the stat columns in sorted order
# (Index.difference returns a sorted Index); missing stats count as 0.
context_cols = ['playerID', 'age', 'Years']
data_cols = fielding.columns.difference(context_cols)

fielding[data_cols] = fielding[data_cols].fillna(0)
fielding = fielding[context_cols + data_cols.tolist()]

# Collapse to one row per player / age / season by summing every stat column
# (the stint, team and position columns were dropped above).
fielding = fielding.groupby(context_cols).sum().reset_index()

# Derived totals come from the project-local helper in ../maths/baseball_stats.
fielding = BasicFielding.fieldingSums(fielding)

In [None]:
# Career totals: one row per player.
#   age   -> mean of the per-season ages (rounded to a whole number below)
#   Years -> number of season rows for the player
#   every other stat -> summed over all seasons
# Each stat column is listed exactly once; the resulting column order follows
# the order of this list.
career_sum_cols = [
    'A', 'CCS', 'DP', 'E', 'G', 'S', 'IPO', 'PB', 'PO',
    'SBA', 'WP', 'IP', 'TC', 'CA', 'SBAT',
]
career_agg_spec = {'age': 'mean', 'Years': 'count', **dict.fromkeys(career_sum_cols, 'sum')}

# groupby/agg returns a new frame, so no defensive copy of `fielding` is needed.
career_df = fielding.groupby(['playerID']).agg(career_agg_spec).reset_index()

career_df['age'] = career_df['age'].round(0).astype(int)

In [15]:
# Career totals rescaled to a 162-game pace: stat / G * 162, rounded to 2 dp.
avg162_df = career_df.copy()

# Everything except the identifying columns is rescaled. 'G' is itself part
# of stat_cols, so it becomes 162.0 on every row that has games.
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])

# Only rows with G > 0 are divided; index alignment on assignment leaves the
# remaining rows as NaN in the stat columns.
has_games = avg162_df['G'] > 0
per_game = avg162_df.loc[has_games, stat_cols].div(avg162_df['G'], axis=0)
avg162_df[stat_cols] = (per_game * 162).round(2)

avg162_df.insert(0, 'rowType', '162Avg')


In [16]:
# Career totals normalised per nine innings: stat / IP * 9, rounded to 2 dp.
norm_df = career_df.copy()

# 'IP' is itself one of stat_cols, so it becomes 9.0 on every row with IP > 0.
stat_cols = norm_df.columns.difference(['playerID', 'age', 'Years'])

# Rows with IP <= 0 are excluded from the division and end up NaN in the stat
# columns once the result is aligned back onto the full index.
has_innings = norm_df['IP'] > 0
per_inning = norm_df.loc[has_innings, stat_cols].div(norm_df['IP'], axis=0)
norm_df[stat_cols] = (per_inning * 9).round(2)

norm_df.insert(0, 'rowType', 'normalized')

In [17]:
# Merge every row type except the season average, then calculate the
# percentages and ratios on the combined frame.

season_df = fielding.copy()
season_df.insert(0, 'rowType', 'season')

# career_df comes from an earlier cell and is tagged in place. The guard keeps
# this cell re-runnable: an unconditional insert raises
# "ValueError: cannot insert rowType, already exists" on the second run.
if 'rowType' not in career_df.columns:
    career_df.insert(0, 'rowType', 'career')

# Stack season, career, 162-game-average and per-nine-innings rows; the
# rowType column tells them apart.
most_fielding = pd.concat([season_df, career_df, avg162_df, norm_df], ignore_index=True)

# Ratio/percentage columns come from the project-local helper.
most_fielding = BasicFielding.fieldingRatios(most_fielding)

In [18]:
# Season-average rows: for each player, the mean of every column across that
# player's 'season' rows (ratios included), rounded to 4 dp.
season_avg_df = (
    most_fielding
    .loc[most_fielding['rowType'] == 'season']
    .copy()
    .drop(columns='rowType')
    .groupby(['playerID'])
    .mean()
    .round(4)
    .reset_index()
)

# age and Years are means here as well; round them back to whole numbers.
season_avg_df = (
    season_avg_df
    .round({'age': 0, 'Years': 0})
    .astype({'age': int, 'Years': int})
)
season_avg_df.insert(0, 'rowType', 'seasAvg')

# Final table: all previously built row types plus the season averages.
final_fielding = pd.concat([most_fielding, season_avg_df], ignore_index=True)

In [20]:
# Publish final_fielding to MySQL as table `player_fielding_new` and index the
# columns used for lookups.
load_dotenv()

# Connection settings are read from the environment (populated from .env).
db_user = os.getenv('jbbs_db_user')
db_pass = os.getenv('jbbs_db_password')
db_host = os.getenv('jbbs_db_host')
db_name = os.getenv('jbbs_db_name')

# Fail early with a readable message instead of attempting to connect with a
# URL that contains the literal text "None".
db_settings = {
    'jbbs_db_user': db_user,
    'jbbs_db_password': db_pass,
    'jbbs_db_host': db_host,
    'jbbs_db_name': db_name,
}
missing_settings = [name for name, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError('Missing database settings in environment/.env: ' + ', '.join(missing_settings))

# NOTE(review): the credentials are interpolated into the URL unescaped, so a
# password containing characters such as '@' or '/' must already be
# URL-encoded in the environment.
engine = create_engine(
    f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}/{db_name}',
    echo=False,
    pool_size=5,      # Maximum number of connections in the pool
    max_overflow=0,   # Prevents creating more connections than `pool_size`
    pool_recycle=600,
    pool_pre_ping=True
)

# One connection, one transaction: the table load and the index DDL all run on
# `conn`, and engine.begin() commits on success / rolls back on error.
with engine.begin() as conn:
    final_fielding.to_sql('player_fielding_new', con=conn, if_exists='replace', index=False, chunksize=500)
    # pandas creates string columns as TEXT, and MySQL refuses to index a TEXT
    # column without a key length, so the two string columns use a prefix index.
    # 16 characters is longer than the rowType labels written by this notebook;
    # presumably it also covers every playerID - TODO confirm against the data.
    conn.execute(text('create index idx_playerid on player_fielding_new(playerID(16));'))
    conn.execute(text('create index idx_rowType on player_fielding_new(rowType(16));'))
    conn.execute(text('create index idx_age on player_fielding_new(age);'))
    conn.execute(text('create index idx_years on player_fielding_new(Years);'))
