In [None]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv 
from sqlalchemy import create_engine
from datetime import datetime
from dateutil.relativedelta import relativedelta
import sys 
sys.path.append('../maths/') 
from baseball_stats import BasicPitching, pitchingRatios

In [2]:
# Load the raw tables. Only the identifier and the birth-date parts are read
# from People.csv; Pitching.csv is read in full.
k = ['playerID', 'birthYear', 'birthMonth', 'birthDay']
import_people = pd.read_csv('../datafiles/People.csv', encoding='latin-1', usecols=k)
import_pitching = pd.read_csv('../datafiles/Pitching.csv', encoding='latin-1')
people = import_people.copy()

# Fill missing birth-date parts with placeholders (1875-01-01) so every part
# can be cast to int.
# NOTE(review): an age derived from a placeholder birth year is not a real
# age -- confirm that downstream consumers are fine with that.
people['birthYear'] = people['birthYear'].fillna(1875).astype(int)
people['birthMonth'] = people['birthMonth'].fillna(1).astype(int)
people['birthDay'] = people['birthDay'].fillna(1).astype(int)

# Create a birthdate column so we can calculate an age at the start of each
# season. The date is assembled from the numeric parts in one vectorized call
# instead of formatting and re-parsing a string per row; impossible
# combinations still become NaT via errors='coerce'.
birth_parts = people[['birthYear', 'birthMonth', 'birthDay']].rename(
    columns={'birthYear': 'year', 'birthMonth': 'month', 'birthDay': 'day'}
)
people['birthdate'] = pd.to_datetime(birth_parts, errors='coerce')

# Merge the people and pitching dataframes (inner join: only players that
# have at least one pitching row are kept).
pitching = people.merge(import_pitching, on='playerID', how='inner')

# April 1st of the season year is used as the reference date for the age.
pitching['season_start'] = pd.to_datetime(
    pitching['yearID'].astype(str) + '-04-01', format='%Y-%m-%d', errors='coerce'
)

# Whole years between birthdate and season start. Iterating the two columns
# side by side avoids building a row Series for every record.
pitching['age'] = [
    relativedelta(season_start, birthdate).years
    for season_start, birthdate in zip(pitching['season_start'], pitching['birthdate'])
]

# Drop the helper columns and the source columns that are not carried forward,
# then switch to the short stat names used in the rest of the notebook.
pitching = pitching.drop(columns=['birthYear', 'birthMonth', 'birthdate', 'birthDay', 'stint', 'teamID', 'lgID', 'season_start', 'BAOpp', 'ERA'])
pitching = pitching.rename(columns={'yearID': 'Years', 'IPouts': 'IPO', 'BFP': 'BF', 'SO': 'K'})

# Context columns identify the row; every other column is treated as a stat.
context_cols = ['playerID', 'age', 'Years']
data_cols = pitching.columns.difference(context_cols)

# Missing stats are treated as 0, and the columns are reordered so the
# context columns come first followed by the stats in sorted order.
pitching[data_cols] = pitching[data_cols].fillna(0)
pitching = pitching[context_cols + data_cols.tolist()]

# Simple derived columns, computed by the project's BasicPitching helpers.
pitching['IP'] = BasicPitching.calc_ip(pitching)
pitching['DECI'] = BasicPitching.calc_decisions(pitching)
pitching['NODE'] = BasicPitching.calc_no_decisions(pitching)
pitching['PAB'] = BasicPitching.calc_pab(pitching)

pitching.head(10)

Unnamed: 0,playerID,age,Years,BB,BF,BK,CG,ER,G,GF,...,SF,SH,SHO,SV,W,WP,IP,DECI,NODE,PAB
0,aardsda01,22,2004,10,61.0,0,0,8,11,5,...,1.0,0.0,0,0,1,0,10.667,1,10,48.0
1,aardsda01,24,2006,28,225.0,0,0,24,45,9,...,3.0,1.0,0,0,3,1,53.0,3,42,192.0
2,aardsda01,25,2007,17,151.0,0,0,23,25,7,...,1.0,2.0,0,0,2,2,32.333,3,22,130.0
3,aardsda01,26,2008,35,228.0,0,0,30,47,7,...,2.0,3.0,0,0,4,3,48.667,6,41,183.0
4,aardsda01,27,2009,34,296.0,0,0,20,73,53,...,1.0,2.0,0,38,3,2,71.333,9,64,259.0
5,aardsda01,28,2010,25,202.0,0,0,19,53,43,...,1.0,7.0,0,31,0,2,49.667,6,47,167.0
6,aardsda01,30,2012,1,5.0,0,0,1,1,1,...,0.0,0.0,0,0,0,0,1.0,0,1,4.0
7,aardsda01,31,2013,19,178.0,1,0,19,43,7,...,1.0,2.0,0,0,2,1,39.667,4,39,152.0
8,aardsda01,33,2015,14,129.0,0,0,16,33,9,...,1.0,0.0,0,0,1,1,30.667,2,31,113.0
9,aasedo01,22,1977,19,373.0,0,4,32,13,0,...,3.0,2.0,2,0,6,0,92.333,8,5,348.0


In [3]:
# Collapse the per-season rows into one career row per player:
#   age   -> mean age over the player's rows (rounded to a whole number below)
#   Years -> number of rows the player has
#   every other stat -> career total
summed_stats = [
    'BB', 'BF', 'BK', 'CG', 'ER', 'G', 'GF', 'GIDP', 'GS', 'H', 'HBP', 'HR',
    'IBB', 'IPO', 'K', 'L', 'R', 'SF', 'SH', 'SHO', 'SV', 'W', 'WP',
    'IP', 'DECI', 'NODE', 'PAB',
]
career_spec = {'age': 'mean', 'Years': 'count'}
career_spec.update({stat: 'sum' for stat in summed_stats})

career_df = pitching.groupby('playerID').agg(career_spec).reset_index()

# The mean age is fractional; store it as a whole number.
career_df['age'] = career_df['age'].round(0).astype(int)


In [4]:
# Scale every career stat to a fixed number of games.
avg162_df = career_df.copy()
stat_cols = avg162_df.columns.difference(['playerID', 'age', 'Years'])

# Only players with at least one game are scaled; the assignment aligns on the
# index, so any row with G == 0 ends up with NaN in every stat column.
# NOTE(review): the rows are labelled '162Avg' but the multiplier is 56 --
# confirm that 56 is the intended number of games.
has_games = avg162_df['G'] > 0
per_game = avg162_df.loc[has_games, stat_cols].div(avg162_df['G'], axis=0)
avg162_df[stat_cols] = per_game.mul(56).round(2)

avg162_df.insert(0, 'rowType', '162Avg')


In [5]:
# Express every career stat per 9 innings pitched.
norm_df = career_df.copy()
stat_cols = norm_df.columns.difference(['playerID', 'age', 'Years'])

# Players without any innings pitched are left out of the division; index
# alignment on assignment turns their stat columns into NaN.
pitched = norm_df['IP'] > 0
per_inning = norm_df.loc[pitched, stat_cols].div(norm_df['IP'], axis=0)
norm_df[stat_cols] = per_inning.mul(9).round(2)

norm_df.insert(0, 'rowType', 'normalized')

In [None]:
# Merge every row type except the season average, then calculate the
# percentages and ratios.

season_df = pitching.copy()
season_df.insert(0, 'rowType', 'season')

# career_df is the only frame that reaches this cell without a rowType label,
# which would leave NaN in that column for every career row after the concat.
# Label a copy rather than career_df itself: the 162-game and normalized cells
# start from career_df, treat every non-context column as a stat and insert
# their own rowType, so a rowType column on career_df would break them.
career_rows = career_df.copy()
career_rows.insert(0, 'rowType', 'career')

most_pitching = pd.concat([season_df, career_rows, avg162_df, norm_df], ignore_index=True)

# pitchingRatios comes from the project's baseball_stats module; its result is
# displayed below with ratio columns such as WHIP, WPCT, BBK and OBPA.
most_pitching = pitchingRatios(most_pitching)

most_pitching.head(10)


Unnamed: 0,rowType,playerID,age,Years,BB,BF,BK,CG,ER,G,...,OBA,BABIP,PFR,R9,BB9,BF9,WHIP,WPCT,BBK,OBPA
0,season,aardsda01,22,2004,10.0,61.0,0.0,0.0,8.0,11.0,...,0.417,0.452,1.406,6.75,8.437,51.467,2.812,1.0,2.0,0.667
1,season,aardsda01,24,2006,28.0,225.0,0.0,0.0,24.0,45.0,...,0.214,0.229,1.453,4.245,4.755,38.208,1.302,1.0,0.571,0.365
2,season,aardsda01,25,2007,17.0,151.0,0.0,0.0,23.0,25.0,...,0.3,0.376,1.639,6.68,4.732,42.031,1.732,0.667,0.472,0.438
3,season,aardsda01,26,2008,35.0,228.0,0.0,0.0,30.0,47.0,...,0.268,0.341,1.726,5.918,6.473,42.164,1.726,0.667,0.714,0.486
4,season,aardsda01,27,2009,34.0,296.0,0.0,0.0,20.0,73.0,...,0.189,0.253,1.598,2.902,4.29,37.346,1.164,0.333,0.425,0.32
5,season,aardsda01,28,2010,25.0,202.0,0.0,0.0,19.0,53.0,...,0.198,0.239,1.49,3.443,4.53,36.604,1.168,0.0,0.51,0.359
6,season,aardsda01,30,2012,1.0,5.0,0.0,0.0,1.0,1.0,...,0.25,0.0,2.0,9.0,9.0,45.0,2.0,,1.0,0.5
7,season,aardsda01,31,2013,19.0,178.0,1.0,0.0,19.0,43.0,...,0.257,0.278,1.387,4.538,4.311,40.386,1.462,0.5,0.528,0.408
8,season,aardsda01,33,2015,14.0,129.0,0.0,0.0,16.0,33.0,...,0.221,0.247,1.598,4.989,4.109,37.858,1.272,0.5,0.4,0.354
9,season,aasedo01,22,1977,19.0,373.0,0.0,4.0,32.0,13.0,...,0.244,0.267,0.736,3.509,1.852,36.358,1.126,0.75,0.388,0.302


In [None]:
# Build one "season average" row per player: the mean of every column across
# that player's 'season' rows, rounded to 4 decimals.
season_avg_df = (
    most_pitching[most_pitching['rowType'] == 'season']
    .drop(columns='rowType')
    .groupby('playerID')
    .mean()
    .round(4)
    .reset_index()
)

# age and Years are stored as whole numbers.
# NOTE(review): for season rows Years holds the season year, so this value is
# the mean calendar year rather than a season count -- confirm that is intended.
for whole_number_col in ('age', 'Years'):
    season_avg_df[whole_number_col] = season_avg_df[whole_number_col].round(0).astype(int)

season_avg_df.insert(0, 'rowType', 'seasAvg')

# Append the season-average rows to the rows built so far.
final_pitching = pd.concat([most_pitching, season_avg_df], ignore_index=True)