In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicPitching

In [8]:
# Load the raw tables. Only the identifier and the birth-date parts are needed
# from People; Pitching is read in full.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_pitching = pd.read_csv('../datafiles/Pitching.csv', encoding='latin-1')

# Work on a copy so the raw import survives re-running this cell.
people = import_people.copy()

# Fill missing birth-date parts with placeholders (1875-01-01) so every player
# gets a parseable date. Ages derived from a placeholder are not real ages.
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

# Build a birthdate column so an age at the start of each season can be
# calculated. The date is assembled column-wise from the numeric parts instead
# of formatting one string per row; impossible dates still become NaT.
birth_parts = people[['birthYear', 'birthMonth', 'birthDay']].rename(
    columns={'birthYear': 'year', 'birthMonth': 'month', 'birthDay': 'day'}
)
people['birthdate'] = pd.to_datetime(birth_parts, errors='coerce')

# Merge the people and pitching dataframes; the inner join keeps only pitching
# rows whose playerID is present in People.
pitching = people.merge(import_pitching, on='playerID', how='inner')

# April 1st of the season year is used as the reference date for the age.
pitching['season_start'] = pd.to_datetime(
    pitching['yearID'].astype(str) + '-04-01', errors='coerce'
)
# Whole years elapsed between birth and season start.
pitching['age'] = [
    relativedelta(season_start, birthdate).years
    for season_start, birthdate in zip(pitching['season_start'], pitching['birthdate'])
]

# Drop helper/identifier columns that are not carried forward, then shorten
# the remaining stat names.
pitching = (
    pitching
    .drop(columns=['birthYear', 'birthMonth', 'birthdate', 'birthDay', 'stint',
                   'teamID', 'lgID', 'season_start', 'BAOpp', 'ERA'])
    .rename(columns={'yearID': 'Years', 'IPouts': 'IPO', 'BFP': 'BF', 'SO': 'K'})
)

# Everything except the context columns is treated as a stat column:
# missing values become 0 and the columns are put in sorted order after the
# context columns.
context_cols = ['playerID', 'age', 'Years']
data_cols = pitching.columns.difference(context_cols)

pitching[data_cols] = pitching[data_cols].fillna(0)
pitching = pitching[context_cols + data_cols.tolist()]

# Derived columns supplied by the project's BasicPitching helpers
# (imported from ../maths/baseball_stats).
pitching['IP'] = BasicPitching.calc_ip(pitching)
pitching['DECI'] = BasicPitching.calc_decisions(pitching)
pitching['NODE'] = BasicPitching.calc_no_decisions(pitching)
pitching['PAB'] = BasicPitching.calc_pab(pitching)

pitching.head(10)

Unnamed: 0,playerID,age,Years,BB,BF,BK,CG,ER,G,GF,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,aardsda01,22,2004,10,61.0,0,0,8,11,5,...,1.0,0.0,0,0,1,0,10.667,1,10,48.0
1,aardsda01,24,2006,28,225.0,0,0,24,45,9,...,3.0,1.0,0,0,3,1,53.0,3,42,192.0
2,aardsda01,25,2007,17,151.0,0,0,23,25,7,...,1.0,2.0,0,0,2,2,32.333,3,22,130.0
3,aardsda01,26,2008,35,228.0,0,0,30,47,7,...,2.0,3.0,0,0,4,3,48.667,6,41,183.0
4,aardsda01,27,2009,34,296.0,0,0,20,73,53,...,1.0,2.0,0,38,3,2,71.333,9,64,259.0
5,aardsda01,28,2010,25,202.0,0,0,19,53,43,...,1.0,7.0,0,31,0,2,49.667,6,47,167.0
6,aardsda01,30,2012,1,5.0,0,0,1,1,1,...,0.0,0.0,0,0,0,0,1.0,0,1,4.0
7,aardsda01,31,2013,19,178.0,1,0,19,43,7,...,1.0,2.0,0,0,2,1,39.667,4,39,152.0
8,aardsda01,33,2015,14,129.0,0,0,16,33,9,...,1.0,0.0,0,0,1,1,30.667,2,31,113.0
9,aasedo01,22,1977,19,373.0,0,4,32,13,0,...,3.0,2.0,2,0,6,0,92.333,8,5,348.0


In [None]:
# Collapse the per-row pitching frame into one career row per player.
# Counting stats are summed; age is averaged over the player's rows.
sum_cols = [
    'BB', 'BF', 'BK', 'CG', 'ER', 'G', 'GF', 'GIDP', 'GS', 'H', 'HBP', 'HR',
    'IBB', 'IPO', 'K', 'L', 'R', 'SF', 'SH', 'SHO', 'SV', 'W', 'WP',
    'IP', 'DECI', 'NODE', 'PAB',
]

career_aggs = {
    'age': 'mean',
    # Number of distinct seasons. A plain row count would over-count whenever a
    # player has more than one row for the same year; the source table carries
    # a `stint` column, so that presumably happens for mid-season team changes.
    'Years': 'nunique',
    **dict.fromkeys(sum_cols, 'sum'),
}

# groupby(...).agg(...) returns a new frame indexed by playerID, so no
# defensive copy of `pitching` is needed first.
career_df = pitching.groupby('playerID').agg(career_aggs)

# NOTE(review): the mean age is still weighted by rows, not by distinct
# seasons -- confirm whether that is the intended definition.
career_df['age'] = career_df['age'].round(0).astype(int)


In [None]:
# Scale each player's career totals to a fixed number of games.
avg162_df = career_df.copy()

# Every column except the context ones is rescaled. 'G' is part of this set,
# so after scaling it holds the scale factor itself for every row with games.
stat_cols = career_df.columns.difference(['playerID', 'age', 'Years'])

# Only rows with at least one game are divided; rows with G == 0 are absent
# from the filtered frame and therefore come back as NaN after index alignment.
has_games = avg162_df['G'] > 0
per_game = avg162_df.loc[has_games, stat_cols].div(avg162_df['G'], axis=0)

# NOTE(review): the factor is 56 while the row label says '162Avg' -- confirm
# which game count this average is meant to represent.
avg162_df[stat_cols] = per_game.mul(56).round(2)
avg162_df['rowType'] = '162Avg'
