In [9]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicPitching

In [10]:
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_pitching = pd.read_csv('../datafiles/Pitching.csv', encoding='latin-1')
people = import_people.copy()

#fix fucky dates
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

#create a birthdate column so we can calulate an age at start of season
people['birthdate'] = people.apply(lambda x: f"""{x['birthYear']}-{x['birthMonth']}-{x['birthDay']}""", axis=1)
people['birthdate'] = pd.to_datetime(people['birthdate'], errors='coerce')

#merge the people and batting dataframes
pitching = people.copy().merge(import_pitching, on='playerID', how ='inner')
pitching['season_start'] = pitching.apply(lambda x: f"""{x['yearID']}-04-01""", axis=1)
pitching['season_start'] = pd.to_datetime(pitching['season_start'], errors='coerce')
pitching['age'] = pitching.apply(lambda x: relativedelta(x['season_start'], x['birthdate']).years, axis=1)

pitching = pitching.drop(columns = ['birthYear', 'birthMonth', 'birthdate', 'birthDay', 'stint', 'teamID', 'lgID', 'season_start', 'BAOpp', 'ERA'])
pitching.rename(columns = {'yearID':'Years', 'IPouts':'IPO', 'BFP':'BF', 'SO':'K'}, inplace=True)

context_cols = ['playerID', 'age', 'Years']
data_cols = pitching.columns.difference(context_cols)

pitching[data_cols] = pitching[data_cols].fillna(0)
pitching = pitching[context_cols + data_cols.tolist()]

#easy adding shit up
pitching['IP'] = BasicPitching.calc_ip(pitching)
pitching['DECI'] = BasicPitching.calc_decisions(pitching)
pitching['NODE'] = BasicPitching.calc_no_decisions(pitching)
pitching['PAB'] = BasicPitching.calc_pab(pitching)

pitching.head(10)

Unnamed: 0,playerID,age,Years,BB,BF,BK,CG,ER,G,GF,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,aardsda01,22,2004,10,61.0,0,0,8,11,5,...,1.0,0.0,0,0,1,0,10.667,1,10,48.0
1,aardsda01,24,2006,28,225.0,0,0,24,45,9,...,3.0,1.0,0,0,3,1,53.0,3,42,192.0
2,aardsda01,25,2007,17,151.0,0,0,23,25,7,...,1.0,2.0,0,0,2,2,32.333,3,22,130.0
3,aardsda01,26,2008,35,228.0,0,0,30,47,7,...,2.0,3.0,0,0,4,3,48.667,6,41,183.0
4,aardsda01,27,2009,34,296.0,0,0,20,73,53,...,1.0,2.0,0,38,3,2,71.333,9,64,259.0
5,aardsda01,28,2010,25,202.0,0,0,19,53,43,...,1.0,7.0,0,31,0,2,49.667,6,47,167.0
6,aardsda01,30,2012,1,5.0,0,0,1,1,1,...,0.0,0.0,0,0,0,0,1.0,0,1,4.0
7,aardsda01,31,2013,19,178.0,1,0,19,43,7,...,1.0,2.0,0,0,2,1,39.667,4,39,152.0
8,aardsda01,33,2015,14,129.0,0,0,16,33,9,...,1.0,0.0,0,0,1,1,30.667,2,31,113.0
9,aasedo01,22,1977,19,373.0,0,4,32,13,0,...,3.0,2.0,2,0,6,0,92.333,8,5,348.0


In [None]:
career_df = pitching.copy()
career_df = career_df.groupby(['playerID']).agg({
    'age':'mean',
    'Years':'count',
    'BB':'sum',
    'BF':'sum',
    'BK':'sum',
    'CG':'sum',
    'ER':'sum',
    'G':'sum',
    'GF':'sum',
    'GIDP':'sum',
    'GS':'sum',
    'H':'sum',
    'HBP':'sum',
    'HR':'sum',
    'IBB':'sum',
    'IPO':'sum',
    'K':'sum',
    'L':'sum',
    'R':'sum',
    'SF':'sum',
    'SH':'sum',
    'SHO':'sum',
    'SV':'sum',
    'W':'sum',
    'WP':'sum',
    'IP':'sum',
    'DECI':'sum',
    'NODE':'sum',
    'PAB':'sum'
}).reset_index()

career_df['age'] = career_df['age'].round(0).astype(int)


Unnamed: 0,playerID,age,Years,BB,BF,BK,CG,ER,G,GF,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,aardsda01,27,9,183,1475.0,1,0,160,331,141,...,11.0,17.0,0,69,16,12,337.001,34,297,1248.0
1,aasedo01,29,13,457,4730.0,3,22,468,448,235,...,34.0,50.0,5,82,66,22,1109.333,126,322,4182.0
2,abadfe01,30,12,126,1513.0,2,0,149,406,103,...,12.0,7.0,0,2,9,10,354.668,38,368,1356.0
3,abbeybe01,24,6,192,2568.0,0,52,285,79,14,...,0.0,0.0,0,1,22,18,568.0,62,17,2350.0
4,abbeych01,29,1,0,12.0,0,0,1,1,1,...,0.0,0.0,0,0,0,1,2.0,0,1,12.0
5,abbotan01,23,1,44,459.0,1,0,47,21,0,...,5.0,0.0,0,0,8,1,109.333,14,7,409.0
6,abbotco01,26,3,55,481.0,1,0,70,45,19,...,5.0,1.0,0,0,1,8,104.666,8,37,411.0
7,abbotda01,28,1,8,67.0,0,1,9,3,2,...,0.0,0.0,0,1,0,3,13.0,2,1,58.0
8,abbotgl01,28,12,352,5508.0,5,37,627,248,13,...,39.0,60.0,5,0,62,18,1285.999,145,103,5025.0
9,abbotji01,26,11,620,7211.0,11,31,791,263,5,...,47.0,70.0,6,0,87,53,1674.0,195,68,6442.0


In [None]:
avg162_df = career_df.copy()

stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])
avg162_df[stat_cols] = avg162_df[avg162_df['G'] > 0][stat_cols].div(avg162_df['G'], axis=0).mul(56).round(2)
avg162_df.insert(0, 'rowType', '162Avg')


Unnamed: 0,rowType,playerID,age,Years,BB,BF,BK,CG,ER,G,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,162Avg,aardsda01,27,9,30.96,249.55,0.17,0.0,27.07,56.0,...,1.86,2.88,0.0,11.67,2.71,2.03,57.02,5.75,50.25,211.14
1,162Avg,aasedo01,29,13,57.13,591.25,0.38,2.75,58.5,56.0,...,4.25,6.25,0.62,10.25,8.25,2.75,138.67,15.75,40.25,522.75
2,162Avg,abadfe01,30,12,17.38,208.69,0.28,0.0,20.55,56.0,...,1.66,0.97,0.0,0.28,1.24,1.38,48.92,5.24,50.76,187.03
3,162Avg,abbeybe01,24,6,136.1,1820.35,0.0,36.86,202.03,56.0,...,0.0,0.0,0.0,0.71,15.59,12.76,402.63,43.95,12.05,1665.82
4,162Avg,abbeych01,29,1,0.0,672.0,0.0,0.0,56.0,56.0,...,0.0,0.0,0.0,0.0,0.0,56.0,112.0,0.0,56.0,672.0
5,162Avg,abbotan01,23,1,117.33,1224.0,2.67,0.0,125.33,56.0,...,13.33,0.0,0.0,0.0,21.33,2.67,291.55,37.33,18.67,1090.67
6,162Avg,abbotco01,26,3,68.44,598.58,1.24,0.0,87.11,56.0,...,6.22,1.24,0.0,0.0,1.24,9.96,130.25,9.96,46.04,511.47
7,162Avg,abbotda01,28,1,149.33,1250.67,0.0,18.67,168.0,56.0,...,0.0,0.0,0.0,18.67,0.0,56.0,242.67,37.33,18.67,1082.67
8,162Avg,abbotgl01,28,12,79.48,1243.74,1.13,8.35,141.58,56.0,...,8.81,13.55,1.13,0.0,14.0,4.06,290.39,32.74,23.26,1134.68
9,162Avg,abbotji01,26,11,132.02,1535.42,2.34,6.6,168.43,56.0,...,10.01,14.9,1.28,0.0,18.52,11.29,356.44,41.52,14.48,1371.68


In [None]:
norm_df = career_df.copy()
stat_cols = norm_df.columns.difference(['playerID', 'age', 'Years'])
norm_df[stat_cols] = norm_df[norm_df['IP'] > 0][stat_cols].div(norm_df['IP'], axis=0).mul(9).round(2)
norm_df.insert(0, 'rowType', 'normalized')

Unnamed: 0,rowType,playerID,age,Years,BB,BF,BK,CG,ER,G,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,normalized,aardsda01,27,9,4.89,39.39,0.03,0.0,4.27,8.84,...,0.29,0.45,0.0,1.84,0.43,0.32,9.0,0.91,7.93,33.33
1,normalized,aasedo01,29,13,3.71,38.37,0.02,0.18,3.8,3.63,...,0.28,0.41,0.04,0.67,0.54,0.18,9.0,1.02,2.61,33.93
2,normalized,abadfe01,30,12,3.2,38.39,0.05,0.0,3.78,10.3,...,0.3,0.18,0.0,0.05,0.23,0.25,9.0,0.96,9.34,34.41
3,normalized,abbeybe01,24,6,3.04,40.69,0.0,0.82,4.52,1.25,...,0.0,0.0,0.0,0.02,0.35,0.29,9.0,0.98,0.27,37.24
4,normalized,abbeych01,29,1,0.0,54.0,0.0,0.0,4.5,4.5,...,0.0,0.0,0.0,0.0,0.0,4.5,9.0,0.0,4.5,54.0
5,normalized,abbotan01,23,1,3.62,37.78,0.08,0.0,3.87,1.73,...,0.41,0.0,0.0,0.0,0.66,0.08,9.0,1.15,0.58,33.67
6,normalized,abbotco01,26,3,4.73,41.36,0.09,0.0,6.02,3.87,...,0.43,0.09,0.0,0.0,0.09,0.69,9.0,0.69,3.18,35.34
7,normalized,abbotda01,28,1,5.54,46.38,0.0,0.69,6.23,2.08,...,0.0,0.0,0.0,0.69,0.0,2.08,9.0,1.38,0.69,40.15
8,normalized,abbotgl01,28,12,2.46,38.55,0.03,0.26,4.39,1.74,...,0.27,0.42,0.03,0.0,0.43,0.13,9.0,1.01,0.72,35.17
9,normalized,abbotji01,26,11,3.33,38.77,0.06,0.17,4.25,1.41,...,0.25,0.38,0.03,0.0,0.47,0.28,9.0,1.05,0.37,34.63


In [None]:
#merge all but season average and caluclate the percentages and ratios

season_df = pitching.copy()
season_df.insert(0, 'rowType', 'season')

most_pitching = pd.concat([season_df, career_df, avg162_df, norm_df], ignore_index=True)

most_pitching['CERA'] = BasicPitching.calc_cera(most_pitching).round(4)
most_pitching['DERA'] = BasicPitching.calc_dera(most_pitching).round(4)
most_pitching['ERA'] = BasicPitching.calc_era(most_pitching).round(4)
most_pitching['H9'] = BasicPitching.calc_h9(most_pitching).round(4)
most_pitching['HR9'] = BasicPitching.calc_hr9(most_pitching).round(4)
most_pitching['K9'] = BasicPitching.calc_k9(most_pitching).round(4)
most_pitching['HRG'] = BasicPitching.calc_hrg(most_pitching).round(4)
most_pitching['RAG'] = BasicPitching.calc_rag(most_pitching).round(4)
most_pitching['OBA'] = BasicPitching.calc_oba(most_pitching).round(4)
most_pitching['PBABIP'] = BasicPitching.calc_pbabip(most_pitching).round(4)
most_pitching['PFR'] = BasicPitching.calc_pfr(most_pitching).round(4)
most_pitching['R9'] = BasicPitching.calc_r9(most_pitching).round(4)
most_pitching['SO9'] = BasicPitching.calc_so9(most_pitching).round(4)
most_pitching['BB9'] = BasicPitching.calc_bb9(most_pitching).round(4)
most_pitching['BF9'] = BasicPitching.calc_bf9(most_pitching).round(4)
most_pitching['WHIP'] = BasicPitching.calc_whip(most_pitching).round(4)
most_pitching['WP'] = BasicPitching.calc_wp(most_pitching).round(4)
most_pitching['BBK'] = BasicPitching.calc_bbk(most_pitching).round(4)
most_pitching['OBPA'] = BasicPitching.calc_obpa(most_pitching).round(4)
most_pitching['FIP'] = BasicPitching.calc_fip(most_pitching).round(4)

most_pitching.head(10)



KeyError: 'SO'