## Team Pitching Data: Collect, Clean, and Export

Sources:
https://github.com/jldbc/pybaseball
https://www.kaggle.com/code/weslayton/fangraphs-baseball-scraper-analysis

In [1]:
#import general libraries
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook-wide pandas display settings, assigned through the options namespace.
# NOTE(review): a custom float_format is set alongside display.precision; the
# floats printed later in this notebook show five decimals, so the precision
# value may be redundant - confirm which of the two is intended.
pd.options.display.precision = 2
pd.options.display.max_rows = None  # None removes the cap on displayed rows
pd.options.display.max_columns = 200
pd.options.display.float_format = lambda value: '%.5f' % value

from scipy import stats
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
# Select the 'bmh' matplotlib style sheet for the plots in this notebook.
plt.style.use('bmh')

In [2]:
#install latest pybaseball
#pip install pybaseball

In [3]:
#import data sources
import pybaseball

In [4]:
from pybaseball import batting_stats

## Collect Data

### Annual data

In [5]:
# Pull team-level pitching stats through pybaseball. The start and end season
# are both 2023, so the request covers that single season.
df = pybaseball.team_pitching(
    start_season=2023,
    end_season=2023,
)

In [6]:
# List the column labels of the freshly loaded frame.
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'W', 'L', 'ERA', 'G', 'GS', 'CG',
       ...
       'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events',
       'CStr%', 'CSW%', 'xERA'],
      dtype='object', length=333)

# Clean Data

In [7]:
# Report the frame's (rows, columns) shape before any cleaning step runs.
print("Pre cleaning shape:", df.shape)

Pre cleaning shape: (30, 333)


In [8]:
# Preview the first rows of the frame (head() with no argument) as a sanity
# check on the load.
df.head()

Unnamed: 0,teamIDfg,Season,Team,Age,W,L,ERA,G,GS,CG,ShO,SV,BS,IP,TBF,H,R,ER,HR,BB,IBB,HBP,WP,BK,SO,GB,FB,LD,IFFB,Balls,Strikes,Pitches,RS,IFH,BU,BUH,K/9,BB/9,K/BB,H/9,HR/9,AVG,WHIP,BABIP,LOB%,FIP,GB/FB,LD%,GB%,FB%,IFFB%,HR/FB,IFH%,BUH%,Starting,Start-IP,Relieving,Relief-IP,RAR,WAR,Dollars,tERA,xFIP,WPA,-WPA,+WPA,RE24,REW,pLI,inLI,gmLI,exLI,Pulls,WPA/LI,Clutch,FB% 2,FBv,SL%,SLv,CT%,CTv,CB%,CBv,CH%,CHv,SF%,SFv,KN%,KNv,XX%,PO%,wFB,wSL,wCT,wCB,wCH,wSF,wKN,wFB/C,wSL/C,...,SB% (pi),SI% (pi),SL% (pi),XX% (pi),vCH (pi),vCS (pi),vCU (pi),vFA (pi),vFC (pi),vFS (pi),vKN (pi),vSB (pi),vSI (pi),vSL (pi),vXX (pi),CH-X (pi),CS-X (pi),CU-X (pi),FA-X (pi),FC-X (pi),FS-X (pi),KN-X (pi),SB-X (pi),SI-X (pi),SL-X (pi),XX-X (pi),CH-Z (pi),CS-Z (pi),CU-Z (pi),FA-Z (pi),FC-Z (pi),FS-Z (pi),KN-Z (pi),SB-Z (pi),SI-Z (pi),SL-Z (pi),XX-Z (pi),wCH (pi),wCS (pi),wCU (pi),wFA (pi),wFC (pi),wFS (pi),wKN (pi),wSB (pi),wSI (pi),wSL (pi),wXX (pi),wCH/C (pi),wCS/C (pi),wCU/C (pi),wFA/C (pi),wFC/C (pi),wFS/C (pi),wKN/C (pi),wSB/C (pi),wSI/C (pi),wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi),FRM,K/9+,BB/9+,K/BB+,H/9+,HR/9+,AVG+,WHIP+,BABIP+,LOB%+,K%+,BB%+,LD%+,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xERA
0,21,2023,HOU,29,38,29,3.3,271,67,1,1,17,8,598.0,2488,537,244,219,69,206,4,27,17,2,629,706,579,330,53,3634,6179,9813,304,45,9,2,9.47,3.1,3.05,8.08,1.04,0.238,1.24,0.301,0.781,3.84,1.22,0.204,0.437,0.359,0.092,0.119,0.064,0.222,60.0,377.0,24.6,216.1,84.6,9.0,$71.7,3.94,3.88,4.02,-41.78,45.8,44.53,4.71,1.0,0.93,0.98,1.02,204,0.78,3.24,0.478,94.1,0.191,83.8,0.092,88.0,0.142,79.8,0.079,87.3,0.018,85.9,,,0.004,,17.9,8.4,8.4,13.3,-8.3,-2.6,,0.38,0.45,...,,0.119,0.249,,87.4,,79.7,94.4,86.9,86.1,,,94.1,85.1,,-6.3,,3.1,-3.9,1.9,-6.2,,,1.2,4.3,,2.5,,-9.3,8.8,2.9,0.0,,,4.0,0.0,,-9.2,,12.0,3.7,4.2,-2.6,,,16.6,16.2,,-1.18,,0.88,0.11,0.87,-1.49,,,1.42,0.66,,0.292,0.647,0.47,0.514,0.829,0.732,0.502,19.0,-7.8,107,94,114,97,90,99,96,103,109,110,96,103,104,94,101,94,108,98,102,99,101,89.1,11.3,144,0.089,117.4,649,0.399,1626,0.158,0.284,
1,8,2023,MIN,28,34,33,3.54,264,67,0,0,13,14,600.0,2483,511,253,236,65,193,13,19,19,1,639,673,650,286,61,3315,6380,9695,295,52,22,6,9.59,2.9,3.31,7.67,0.98,0.225,1.17,0.285,0.744,3.61,1.04,0.178,0.418,0.404,0.094,0.1,0.077,0.273,75.6,376.9,14.8,218.2,90.4,9.7,$77.7,3.4,3.93,2.98,-41.94,44.91,40.78,4.32,1.01,0.91,1.09,1.17,197,4.69,-1.74,0.45,93.9,0.212,84.2,0.065,89.1,0.077,81.3,0.125,85.9,0.07,86.9,,,0.008,,28.6,1.7,6.3,-1.5,9.6,10.1,,0.66,0.08,...,,0.076,0.233,0.0,86.1,,81.4,94.0,88.0,84.3,,,95.0,84.8,77.2,-6.0,,4.9,-3.9,1.8,-7.2,,,-8.1,5.3,0.0,2.6,,-8.4,8.3,2.1,0.6,,,3.9,-0.8,0.0,12.1,,-1.6,24.8,13.4,6.5,,,8.8,-3.3,0.0,1.04,,-0.23,0.66,2.9,1.07,,,1.19,-0.15,-3.82,0.313,0.672,0.496,0.57,0.825,0.746,0.508,18.7,0.2,109,88,124,92,84,93,91,98,103,112,90,89,100,106,84,98,101,101,96,102,99,88.9,14.0,130,0.08,116.0,639,0.392,1632,0.161,0.285,
2,12,2023,TBR,29,48,22,3.56,297,70,0,0,20,14,619.0,2578,517,257,245,68,224,6,31,28,1,591,791,590,334,66,3543,6330,9873,395,48,15,6,8.59,3.26,2.64,7.52,0.99,0.223,1.2,0.27,0.761,4.03,1.34,0.195,0.461,0.344,0.112,0.115,0.061,0.4,62.7,323.3,-3.3,288.0,59.4,6.5,$52.3,3.87,4.11,2.46,-41.0,43.45,29.36,3.06,0.95,0.85,1.04,1.09,227,1.99,0.6,0.442,93.1,0.204,83.0,0.115,88.2,0.095,80.2,0.102,86.8,0.042,86.6,,,0.004,,15.2,5.7,11.1,9.1,0.8,8.9,,0.35,0.28,...,,0.184,0.25,,86.8,,81.2,94.7,88.4,87.2,,,91.6,83.5,,4.2,,2.7,0.7,1.7,-5.3,,,-4.7,2.9,,1.4,,-7.2,9.1,3.1,3.0,,,1.2,-0.1,,8.6,,8.7,-3.4,6.6,-0.3,,,24.8,13.8,,0.68,,1.15,-0.13,0.78,-0.16,,,1.38,0.56,,0.286,0.627,0.463,0.563,0.849,0.763,0.518,18.2,-5.8,98,99,96,90,86,92,93,93,106,100,101,98,110,90,97,104,97,98,106,98,100,88.5,10.5,131,0.076,115.9,669,0.386,1732,0.178,0.286,
3,9,2023,NYY,29,39,29,3.6,281,68,1,1,20,8,609.2,2558,511,267,244,77,228,1,40,14,6,602,746,627,300,66,3590,6412,10002,311,55,14,7,8.89,3.37,2.64,7.54,1.14,0.223,1.21,0.269,0.763,4.26,1.19,0.179,0.446,0.375,0.105,0.123,0.074,0.5,30.4,350.4,21.2,256.7,51.6,5.4,$43.5,4.08,4.25,4.33,-42.74,47.06,28.54,2.93,1.01,0.95,1.19,1.29,213,0.71,3.56,0.485,94.8,0.192,85.1,0.074,89.4,0.106,82.8,0.143,86.7,,,,,0.003,,31.0,3.3,-3.8,2.0,-1.2,,,0.64,0.17,...,,0.225,0.218,,86.8,,83.2,94.7,89.6,,,,95.3,84.8,,-5.7,,4.0,-3.9,-0.1,,,,-6.9,3.4,,2.7,,-4.1,9.1,3.7,,,,4.1,-0.5,,0.2,,4.7,10.0,0.0,,,,24.3,-1.0,,0.01,,0.57,0.39,-0.01,,,,1.08,-0.04,,0.284,0.651,0.47,0.586,0.844,0.767,0.505,18.3,8.9,101,102,98,91,98,93,94,93,106,102,104,90,106,98,104,107,102,86,102,102,96,88.8,12.5,137,0.081,116.7,663,0.393,1688,0.17,0.279,
4,29,2023,SDP,30,32,34,3.76,268,66,0,0,18,11,586.0,2437,504,258,245,72,215,4,26,21,3,567,717,595,305,71,3637,6180,9817,280,38,12,7,8.71,3.3,2.64,7.74,1.11,0.23,1.23,0.277,0.756,4.17,1.21,0.189,0.443,0.368,0.119,0.121,0.053,0.583,53.8,354.6,19.7,226.5,73.5,7.5,$60.1,3.93,4.19,0.04,-42.72,42.76,22.07,2.32,0.98,0.92,1.07,1.13,202,1.37,-1.33,0.495,93.8,0.16,82.5,0.089,88.2,0.101,79.0,0.141,84.5,0.015,86.8,,,0.004,,24.3,4.7,-1.1,8.6,7.2,-0.3,,0.5,0.3,...,0.004,0.151,0.177,0.0,84.8,,79.3,94.1,89.5,89.3,,80.5,94.1,82.9,95.2,-3.3,,3.6,-0.3,1.2,-4.2,,-5.2,-3.4,3.4,-9.9,3.4,,-9.7,8.3,4.4,1.9,,-4.1,5.6,0.3,6.9,8.0,,7.6,4.4,1.0,0.9,,-1.4,22.8,5.3,0.0,0.58,,0.78,0.13,0.14,0.85,,-3.79,1.53,0.3,4.4,0.289,0.637,0.463,0.582,0.84,0.76,0.501,18.6,-3.9,101,96,104,89,93,92,91,93,105,104,99,92,103,101,96,103,101,94,115,101,91,87.7,12.5,122,0.075,114.6,578,0.355,1629,0.166,0.276,


In [9]:
# Re-inspect the column labels before deciding which columns to remove.
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'W', 'L', 'ERA', 'G', 'GS', 'CG',
       ...
       'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events',
       'CStr%', 'CSW%', 'xERA'],
      dtype='object', length=333)

In [10]:
# Column-removal template: the drop below is commented out, so the frame is
# unchanged and both prints report the same shape.
print("Pre removal ", df.shape)
# NOTE(review): '#days' does not appear among the column labels shown in this
# notebook - confirm the target column before enabling this line.
#df = df.drop(['#days'], axis = 1)

print("Post removal ", df.shape)

Pre removal  (30, 333)
Post removal  (30, 333)


In [11]:
# Show the column labels again after the (currently disabled) removal step.
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'W', 'L', 'ERA', 'G', 'GS', 'CG',
       ...
       'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events',
       'CStr%', 'CSW%', 'xERA'],
      dtype='object', length=333)

In [12]:
# rename columns
#df = df.rename(columns = {'xyz': 'abc'})
#df.columns

In [13]:
# Summarise missing data per column: the raw null count and the share of rows
# that are null, expressed in percent and rounded to one decimal.
missing_mask = df.isna()  # computed once and reused for both statistics
counts = missing_mask.sum()
percentages = missing_mask.mean().mul(100).round(1)
null_values = pd.concat(
    [counts, percentages],
    axis=1,
    keys=["count", "%"],
)
print(null_values)

                 count         %
teamIDfg             0   0.00000
Season               0   0.00000
Team                 0   0.00000
Age                  0   0.00000
W                    0   0.00000
L                    0   0.00000
ERA                  0   0.00000
G                    0   0.00000
GS                   0   0.00000
CG                   0   0.00000
ShO                  0   0.00000
SV                   0   0.00000
BS                   0   0.00000
IP                   0   0.00000
TBF                  0   0.00000
H                    0   0.00000
R                    0   0.00000
ER                   0   0.00000
HR                   0   0.00000
BB                   0   0.00000
IBB                  0   0.00000
HBP                  0   0.00000
WP                   0   0.00000
BK                   0   0.00000
SO                   0   0.00000
GB                   0   0.00000
FB                   0   0.00000
LD                   0   0.00000
IFFB                 0   0.00000
Balls     

In [14]:
# Drop nulls (template): dropna() is commented out, so the frame is unchanged
# and both prints report the same shape.
print("Pre removal ", df.shape)
#df = df.dropna()
print("Post removal ", df.shape)

Pre removal  (30, 333)
Post removal  (30, 333)


In [15]:
# Count duplicated team labels: each occurrence of a 'Team' value after its
# first is flagged, and the flags are summed. The cast to int keeps the
# displayed result a plain Python integer.
int(df['Team'].duplicated().sum())

0

In [16]:
# Remove duplicates (template): drop_duplicates is commented out, so the frame
# is unchanged and both prints report the same shape.

print("Pre removal ", df.shape)
# NOTE(review): this subset was 'Name', but the duplicate count in this
# notebook is taken on 'Team' - confirm the key column before enabling.
#df = df.drop_duplicates(subset=['Team'])
print("Post removal ", df.shape)

Pre removal  (30, 333)
Post removal  (30, 333)


In [17]:
# remove outliers

#print("Pre removal ", df.shape)
#df = df[(np.abs(stats.zscore(df['xyz'])) < 3)]
#print("Post removal ", df.shape)

# Data Restructuring

## Variable Restructuring

### Level

In [18]:
# change column values
#df['Lev'] = df['Lev'].map({'Maj-AL': 'AL', 'Maj-NL': 'NL'})

In [19]:
# rename column
#df.rename(columns={'Lev': 'League'}, inplace=True)

In [20]:
# verify changes
#df[["League", "Name"]].groupby("League").count()

## Normalizations

In [21]:
#df['Volume_norm'] = (df.Volume - df.Volume.mean()) / (df.Volume.max() - df.Volume.min())
#df.columns

## Categorizations

In [22]:
#normalized = ['Volume_norm', 'Difficulty_norm']

## Calculations

In [23]:
#calculate wOBA on 14 day player batting

#df['wOBA'] = (0.69*(df['BB'] - df['IBB']) + 0.72*df['HBP'] + 0.89*df['1B'] + 1.27*df['2B'] + 1.62*df['3B'] + 2.10*df['HR']) / (df['AB'] + df['BB'] - df['IBB'] + df['SF'] + df['HBP'])

In [24]:
#df.head()

## Export Data

In [25]:
# Export the team pitching table to CSV (row index omitted).
# NOTE(review): the destination is an absolute, user-specific Windows path;
# consider a project-relative or configurable directory so the notebook can
# run on other machines.
EXPORT_PATH = Path('C:\\Users\\b7tbu\\JUPYTER PROJECTS\\ANALYTICO\\Data_Exports\\Team\\Pitching\\EXPORT_annual.csv')

# Create the target folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(EXPORT_PATH, index=False)