In [None]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine, text
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicHitting


In [None]:
# Only the identifier and the birth-date components are needed from People.csv.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', usecols=k, encoding='latin-1')
import_batting = pd.read_csv('../datafiles/Batting.csv', encoding='latin-1')

# Patch incomplete birth dates: a missing year becomes 1875 and a missing
# month/day becomes 1, so every player has integer date components.
people = import_people.assign(
    birthYear=import_people['birthYear'].fillna(1875).astype(int),
    birthMonth=import_people['birthMonth'].fillna(1).astype(int),
    birthDay=import_people['birthDay'].fillna(1).astype(int),
)

# Build a birthdate column so an age at the start of each season can be
# calculated. Component combinations that do not parse are coerced to NaT.
people['birthdate'] = pd.to_datetime(
    people['birthYear'].astype(str)
    + '-' + people['birthMonth'].astype(str)
    + '-' + people['birthDay'].astype(str),
    errors='coerce',
)

# Attach the birth information to every batting row; the inner join keeps
# only players that appear in both files.
batting = people.merge(import_batting, on='playerID', how='inner')

# Age is measured in whole years on April 1st of the season year.
batting['season_start'] = pd.to_datetime(
    batting['yearID'].astype(str) + '-04-01',
    errors='coerce',
)
batting['age'] = [
    relativedelta(opening_day, born).years
    for opening_day, born in zip(batting['season_start'], batting['birthdate'])
]

# Discard the helper/date columns and the per-stint identifiers, then switch
# to the column names used by the rest of the notebook.
batting = (
    batting
    .drop(columns=['stint', 'teamID', 'lgID', 'birthYear', 'birthMonth', 'birthDay',
                   'G_old', 'G_batting', 'birthdate', 'season_start'])
    .rename(columns={'3B': 'B3', '2B': 'B2', 'GIDP': 'GiDP', 'SO': 'K', 'yearID': 'Years'})
)

# Missing stat values are treated as 0; the context columns are left alone.
context_cols = ['playerID', 'age', 'Years']
data_cols = batting.columns.difference(context_cols)
batting[data_cols] = batting[data_cols].fillna(0)

# Context columns first, then the stat columns (sorted by Index.difference).
batting = batting[context_cols + data_cols.tolist()]

# Collapse multiple rows of the same player/season into a single summed row.
batting = batting.groupby(context_cols).sum().reset_index()

batting = BasicHitting.battingSums(batting)


In [None]:
#create rows for career
# Aggregation plan, listed in output-column order: 'Years' becomes the number
# of season rows per player, 'age' the mean age across those rows, and every
# other stat a career total.
career_agg = {'Years': 'count'}
career_agg.update(dict.fromkeys(
    ['G', 'AB', 'R', 'H', 'B2', 'B3', 'HR', 'RBI', 'SB', 'CS',
     'BB', 'K', 'IBB', 'HBP', 'SH', 'SF', 'GiDP'],
    'sum',
))
career_agg['age'] = 'mean'
career_agg.update(dict.fromkeys(['PA', 'B1', 'TB', 'SBA', 'XBH', 'TOB', 'RP'], 'sum'))

# No step before this cell creates a 'rowType' column, so a plain drop raises
# KeyError unless BasicHitting.battingSums happens to add one.
# errors='ignore' lets the cell run in either case.
career_df = (
    batting
    .drop(columns=['rowType'], errors='ignore')
    .groupby(['playerID'])
    .agg(career_agg)
    .reset_index()
)

# The mean age is reported as a whole number.
career_df['age'] = career_df['age'].round(0).astype(int)

In [None]:
#create rows for 162 Game Avg.
# career_df only carries a 'rowType' column after it has been tagged for the
# final concat, so on a fresh top-to-bottom run the column does not exist yet.
# errors='ignore' makes the drop safe in both states (drop returns a new frame,
# so career_df itself is never modified here).
avg162_df = career_df.drop(columns=['rowType'], errors='ignore')
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])

# Scale every stat to a 162-game pace. Players with no games get NaN instead
# of a division by zero. 'G' is itself one of the stat columns, so it ends up
# as 162 for every player with games.
games_played = avg162_df['G'].where(avg162_df['G'] > 0)
avg162_df[stat_cols] = avg162_df[stat_cols].div(games_played, axis=0).mul(162).round(2)
avg162_df.insert(0, 'rowType', '162Avg')

In [None]:
#create rows for 600 PA Avg.
# career_df only carries a 'rowType' column after it has been tagged for the
# final concat, so on a fresh top-to-bottom run the column does not exist yet.
# errors='ignore' makes the drop safe in both states (drop returns a new frame,
# so career_df itself is never modified here).
norm_df = career_df.drop(columns=['rowType'], errors='ignore')
stat_cols = norm_df.columns.difference(['playerID', 'age', 'Years'])

# Scale every stat to a 600-plate-appearance pace. Players with no plate
# appearances get NaN instead of a division by zero. 'PA' is itself one of
# the stat columns, so it ends up as 600 for every player with a PA.
plate_appearances = norm_df['PA'].where(norm_df['PA'] > 0)
norm_df[stat_cols] = norm_df[stat_cols].div(plate_appearances, axis=0).mul(600).round(2)
norm_df.insert(0, 'rowType', 'normalized')

In [None]:
# Tag each frame with the kind of row it holds, then stack them.
# DataFrame.insert raises ValueError when the column already exists, which
# happens when this cell is re-run (career_df keeps its tag) or when the
# incoming frame already carries a 'rowType' column. Dropping any existing tag
# first makes the cell safe to run repeatedly.
season_df = batting.drop(columns=['rowType'], errors='ignore')
season_df.insert(0, 'rowType', 'season')

# career_df is deliberately re-bound with its tag, as before, so it still
# exposes a 'rowType' column after this cell has run.
career_df = career_df.drop(columns=['rowType'], errors='ignore')
career_df.insert(0, 'rowType', 'career')

most_hitting = pd.concat([season_df, career_df, avg162_df, norm_df], ignore_index=True)

# Ratio stats are computed after stacking so every row type goes through the
# same BasicHitting.battingRatios call.
most_hitting = BasicHitting.battingRatios(most_hitting)

In [None]:
#create rows for season average
# Per-player mean of every column over that player's 'season' rows, rounded to
# four decimals. 'age' and 'Years' are then rounded to whole numbers.
# NOTE(review): 'Years' here is the mean of the season years, whereas the
# career rows store a count of seasons in the same column - confirm intended.
season_avg_df = (
    most_hitting[most_hitting['rowType'] == 'season']
    .drop(columns=['rowType'])
    .groupby(['playerID'])
    .mean()
    .round(4)
    .reset_index()
    .assign(
        age=lambda avg: avg['age'].round(0).astype(int),
        Years=lambda avg: avg['Years'].round(0).astype(int),
    )
)
season_avg_df.insert(0, 'rowType', 'seasAvg')

# Final table: all existing row types followed by the season-average rows.
final_hitting = pd.concat([most_hitting, season_avg_df], ignore_index=True)


In [None]:
load_dotenv()

# Connection settings come from the environment (optionally via a .env file).
# Fail fast with the names of any missing settings; otherwise os.getenv returns
# None and the URL below would silently contain the literal text "None".
missing_settings = [
    name
    for name in ('jbbs_db_user', 'jbbs_db_password', 'jbbs_db_host', 'jbbs_db_name')
    if os.getenv(name) is None
]
if missing_settings:
    raise RuntimeError(f"Missing database settings: {', '.join(missing_settings)}")

db_user = os.getenv('jbbs_db_user')
db_pass = os.getenv('jbbs_db_password')
db_host = os.getenv('jbbs_db_host')
db_name = os.getenv('jbbs_db_name')

# NOTE: the values are interpolated into the URL as-is, so a user name or
# password containing URL-reserved characters (such as '@', ':' or '/') must
# already be percent-encoded in the environment.
engine = create_engine(
    f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}/{db_name}',
    echo=False,
    pool_size=5,      # Maximum number of connections in the pool
    max_overflow=0,   # Prevents creating more connections than `pool_size`
    pool_recycle=600,
    pool_pre_ping=True
)

# Do the upload and the index creation on one connection inside one explicit
# transaction: engine.begin() commits when the block exits cleanly and rolls
# back if anything raises, so the result no longer depends on the backend
# implicitly committing DDL.
with engine.begin() as conn:
    # if_exists='replace' drops and recreates the table, so the indexes have
    # to be created again after every upload.
    final_hitting.to_sql('player_hitting_new', con=conn, if_exists='replace', index=False, chunksize=500)
    conn.execute(text('create index idx_playerid on player_hitting_new(playerID);'))
    conn.execute(text('create index idx_rowType on player_hitting_new(rowType);'))
    conn.execute(text('create index idx_age on player_hitting_new(age);'))
    conn.execute(text('create index idx_years on player_hitting_new(Years);'))
