## Hello World

Sources:
https://github.com/jldbc/pybaseball
https://www.kaggle.com/code/weslayton/fangraphs-baseball-scraper-analysis

In [1]:
#import general libraries
import numpy as np
import pandas as pd

pd.set_option("display.precision", 2)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from scipy import stats
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

In [2]:
#install latest pybaseball
#pip install pybaseball

In [3]:
#import data sources
import pybaseball

In [4]:
from pybaseball import batting_stats

## Collect Data

### Annual data

In [5]:
df = pybaseball.team_batting(2023,2023)

In [6]:
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
       ...
       'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA',
       'xSLG', 'xwOBA', 'L-WAR'],
      dtype='object', length=319)

# Clean Data

In [7]:
print("Pre cleaning shape:", df.shape)

Pre cleaning shape: (30, 319)


In [8]:
df.head()

Unnamed: 0,teamIDfg,Season,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,GB,FB,LD,IFFB,Pitches,Balls,Strikes,IFH,BU,BUH,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,GB/FB,LD%,GB%,FB%,IFFB%,HR/FB,IFH%,BUH%,wOBA,wRAA,wRC,Bat,Fld,Rep,Pos,RAR,WAR,Dol,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,phLI,PH,WPA/LI,Clutch,FB% (Pitch),FBv,SL%,SLv,CT%,CTv,CB%,CBv,CH%,CHv,SF%,SFv,KN%,KNv,XX%,PO%,wFB,wSL,wCT,wCB,wCH,wSF,wKN,wFB/C,wSL/C,wCT/C,wCB/C,wCH/C,...,KN% (pi),SB% (pi),SI% (pi),SL% (pi),XX% (pi),vCH (pi),vCS (pi),vCU (pi),vFA (pi),vFC (pi),vFS (pi),vKN (pi),vSB (pi),vSI (pi),vSL (pi),vXX (pi),CH-X (pi),CS-X (pi),CU-X (pi),FA-X (pi),FC-X (pi),FS-X (pi),KN-X (pi),SB-X (pi),SI-X (pi),SL-X (pi),XX-X (pi),CH-Z (pi),CS-Z (pi),CU-Z (pi),FA-Z (pi),FC-Z (pi),FS-Z (pi),KN-Z (pi),SB-Z (pi),SI-Z (pi),SL-Z (pi),XX-Z (pi),wCH (pi),wCS (pi),wCU (pi),wFA (pi),wFC (pi),wFS (pi),wKN (pi),wSB (pi),wSI (pi),wSL (pi),wXX (pi),wCH/C (pi),wCS/C (pi),wCU/C (pi),wFA/C (pi),wFC/C (pi),wFS/C (pi),wKN/C (pi),wSB/C (pi),wSI/C (pi),wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi),FRM,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,LD+%,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,12,2023,TBR,28,996,2348,2641,615,365,126,11,113,395,378,235,1,585,37,17,4,41,89,18,0.262,754,699,320,66,10280,3710,6570,52,11,7,0.089,0.222,0.4,0.336,0.469,0.806,0.207,0.301,1.08,0.18,0.425,0.394,0.094,0.162,0.069,0.636,0.348,64.4,383,86.7,-0.7,81.5,-8.2,176.3,17.7,$141.7,6.0,128,10.54,-37.85,48.4,115.97,12.78,0.86,1.7,41,11.81,0.42,0.456,93.7,0.234,84.1,0.083,89.1,0.074,80.3,0.131,85.4,0.022,86.5,,,0.003,,49.6,13.8,-9.6,3.3,8.6,1.1,,1.06,0.58,-1.13,0.43,0.64,...,,,0.152,0.251,,85.8,,80.1,94.2,89.4,85.7,,,93.7,84.4,,-4.8,,3.8,-2.8,0.8,-5.0,,,-5.6,2.9,,2.3,,-7.3,8.7,4.0,0.6,,,3.6,-0.4,,9.4,,3.7,47.0,-10.5,-2.0,,,0.8,12.7,,0.69,,0.59,1.52,-1.26,-0.96,,,0.05,0.49,,0.296,0.665,0.48,0.547,0.843,0.751,0.497,18.7,-5.9,107,104,97,106,116,129,103,0.9,101,104,133,101,102,95,90,100,104,90.1,13.5,170,0.095,114.5,759,0.425,1784,0.158,0.277,,,,17.5
1,13,2023,TEX,29,926,2315,2594,631,393,143,8,87,409,392,225,5,581,26,22,5,33,36,11,0.273,710,667,373,55,10089,3730,6359,54,11,3,0.087,0.224,0.39,0.341,0.454,0.795,0.181,0.326,1.06,0.213,0.406,0.381,0.082,0.13,0.076,0.273,0.344,54.6,367,60.8,16.4,80.0,-8.2,151.3,15.2,$121.7,4.9,120,6.37,-35.46,41.83,126.94,13.31,0.83,1.18,32,6.91,0.77,0.491,93.0,0.207,84.6,0.083,88.8,0.105,80.0,0.1,85.3,0.015,87.4,,,0.003,,23.4,18.6,-1.1,4.5,22.5,1.9,,0.47,0.89,-0.14,0.42,2.24,...,,,0.127,0.224,,84.0,,80.0,93.5,89.9,88.5,,,93.6,84.9,,-2.3,,2.6,-1.9,0.8,-4.6,,,-3.0,1.6,,2.6,,-7.5,8.4,4.1,1.3,,,3.8,0.1,,20.6,,2.1,21.2,-1.4,3.5,,,-0.9,15.2,,1.97,,0.21,0.61,-0.16,2.39,,,-0.07,0.67,,0.26,0.667,0.469,0.582,0.852,0.779,0.513,18.4,4.5,111,102,98,108,112,113,111,1.07,96,101,108,95,104,102,94,92,116,90.0,13.2,164,0.093,115.1,751,0.426,1762,0.159,0.263,,,,14.9
2,16,2023,ATL,29,931,2249,2505,591,360,117,5,109,339,327,216,9,559,26,14,0,58,47,9,0.263,754,628,319,48,9871,3578,6293,52,3,2,0.086,0.223,0.39,0.333,0.465,0.797,0.202,0.302,1.2,0.188,0.443,0.369,0.076,0.174,0.069,0.667,0.344,52.0,354,40.4,-1.2,77.3,-7.6,113.6,11.4,$91.3,4.5,113,5.75,-42.91,48.67,49.77,5.14,0.98,1.35,32,4.73,1.13,0.477,94.0,0.205,84.7,0.074,89.5,0.099,80.2,0.122,85.9,0.024,86.7,,,0.005,,21.1,3.0,11.3,9.4,1.0,-4.3,,0.45,0.15,1.56,0.96,0.08,...,,0.001,0.141,0.219,,86.1,,80.2,94.4,90.0,86.9,,81.9,94.1,85.0,,-2.6,,2.9,-1.8,1.0,-5.7,,-6.5,-4.4,1.9,,2.0,,-7.7,8.4,4.5,0.9,,-4.1,4.3,-0.2,,-0.2,,11.0,8.1,13.7,-4.4,,0.3,6.6,-4.5,,-0.01,,1.2,0.25,2.11,-2.03,,5.29,0.48,-0.21,,0.3,0.695,0.497,0.597,0.815,0.749,0.498,18.7,8.0,105,96,99,103,113,125,101,0.92,104,100,141,109,97,89,86,96,114,91.2,11.9,204,0.12,118.6,775,0.455,1704,0.141,0.264,,,,10.5
3,22,2023,LAD,29,995,2251,2587,545,296,122,12,115,372,360,279,11,589,25,28,0,28,41,10,0.242,630,705,354,65,10303,3976,6327,36,1,0,0.108,0.228,0.47,0.329,0.46,0.789,0.218,0.273,0.89,0.21,0.373,0.417,0.092,0.163,0.057,0.0,0.339,44.4,356,45.5,-0.9,79.8,-7.5,132.6,13.3,$106.6,5.0,114,6.94,-41.37,48.3,86.49,9.13,0.95,1.49,70,7.55,-0.26,0.475,93.5,0.21,84.6,0.081,88.6,0.108,79.4,0.109,86.1,0.017,88.1,,,0.006,,23.7,10.6,-2.1,9.1,21.0,-3.6,,0.48,0.49,-0.25,0.82,1.88,...,,0.0,0.179,0.21,0.0,86.0,63.1,79.4,93.9,89.2,88.4,,80.2,93.2,84.3,78.5,-2.7,9.0,3.3,-1.9,0.4,-6.3,,-5.0,-3.2,2.3,0.0,2.1,-10.1,-6.2,8.2,3.7,1.2,,-4.6,4.0,-0.2,0.0,17.4,-0.3,7.6,16.4,-4.5,-1.2,,-0.5,5.7,11.1,0.0,1.62,-25.8,0.73,0.55,-0.45,-0.61,,-12.67,0.31,0.52,-4.4,0.247,0.647,0.446,0.542,0.841,0.757,0.497,18.5,0.6,97,120,101,102,112,135,91,1.02,87,114,133,104,95,100,92,95,112,89.9,15.9,179,0.106,112.7,723,0.427,1694,0.169,0.276,,,,13.2
4,15,2023,ARI,28,971,2270,2542,600,371,136,16,77,348,333,211,4,502,21,23,13,49,63,12,0.264,798,621,361,64,9915,3638,6277,49,24,7,0.083,0.197,0.42,0.33,0.44,0.77,0.176,0.305,1.29,0.203,0.448,0.349,0.103,0.124,0.061,0.292,0.333,31.1,337,21.4,6.6,78.4,-8.7,111.6,11.2,$89.7,5.8,107,7.7,-40.41,48.11,53.33,5.44,0.93,1.66,57,3.85,4.42,0.488,93.9,0.232,84.1,0.076,89.5,0.083,80.1,0.107,85.9,0.014,86.6,,,0.004,,33.5,-3.0,-8.4,5.1,2.8,2.0,,0.69,-0.13,-1.12,0.63,0.27,...,,0.001,0.164,0.244,0.0,85.6,78.3,80.0,94.2,89.7,88.0,,81.3,93.5,84.3,80.0,-3.1,4.9,2.9,-1.2,0.8,-5.9,,-6.6,-3.8,1.5,-5.0,2.4,-9.6,-6.7,8.0,3.8,1.1,,-3.3,3.5,-0.1,3.5,3.6,1.7,3.7,23.6,-4.4,-2.2,,1.7,4.4,-8.3,0.0,0.37,11.39,0.47,0.75,-0.63,-1.05,,34.79,0.27,-0.35,-0.29,0.261,0.623,0.448,0.595,0.886,0.804,0.517,18.6,-5.8,106,93,88,102,107,109,102,0.99,105,95,101,101,102,97,93,101,102,89.0,11.1,121,0.067,114.3,706,0.39,1808,0.183,0.271,,,,11.6


In [9]:
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
       ...
       'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA',
       'xSLG', 'xwOBA', 'L-WAR'],
      dtype='object', length=319)

In [10]:
print("Pre removal ", df.shape)
#df = df.drop(['#days'], axis = 1)

print("Post removal ", df.shape)

Pre removal  (30, 319)
Post removal  (30, 319)


In [11]:
df.columns

Index(['teamIDfg', 'Season', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
       ...
       'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA',
       'xSLG', 'xwOBA', 'L-WAR'],
      dtype='object', length=319)

In [12]:
# rename columns
#df = df.rename(columns = {'xyz': 'abc'})
#df.columns

In [13]:
# count nulls
counts = df.isna().sum()
percentages = round(df.isna().mean() * 100, 1)
null_values = pd.concat([counts, percentages], axis=1, keys=["count", "%"])
print(null_values)

                 count         %
teamIDfg             0   0.00000
Season               0   0.00000
Team                 0   0.00000
Age                  0   0.00000
G                    0   0.00000
AB                   0   0.00000
PA                   0   0.00000
H                    0   0.00000
1B                   0   0.00000
2B                   0   0.00000
3B                   0   0.00000
HR                   0   0.00000
R                    0   0.00000
RBI                  0   0.00000
BB                   0   0.00000
IBB                  0   0.00000
SO                   0   0.00000
HBP                  0   0.00000
SF                   0   0.00000
SH                   0   0.00000
GDP                  0   0.00000
SB                   0   0.00000
CS                   0   0.00000
AVG                  0   0.00000
GB                   0   0.00000
FB                   0   0.00000
LD                   0   0.00000
IFFB                 0   0.00000
Pitches              0   0.00000
Balls     

In [14]:
# drop nulls
print("Pre removal ", df.shape)
#df = df.dropna()
print("Post removal ", df.shape)

Pre removal  (30, 319)
Post removal  (30, 319)


In [15]:
# count duplicates
len(df['Team'])-len(df['Team'].drop_duplicates())

0

In [16]:
# remove dups

print("Pre removal ", df.shape)
#df = df.drop_duplicates(subset=['Name'])
print("Post removal ", df.shape)

Pre removal  (30, 319)
Post removal  (30, 319)


In [17]:
# remove outliers

#print("Pre removal ", df.shape)
#df = df[(np.abs(stats.zscore(df['xyz'])) < 3)]
#print("Post removal ", df.shape)

# Data Restructing

## Variable Restructing

### Level

In [18]:
# change column values
#df['Lev'] = df['Lev'].map({'Maj-AL': 'AL', 'Maj-NL': 'NL'})

In [19]:
# rename column
#df.rename(columns={'Lev': 'League'}, inplace=True)

In [20]:
# verify changes
#df[["League", "Name"]].groupby("League").count()

## Normalizations

In [21]:
#df=['Volume_norm'] = (df.Volume - df.Volume.mean()) / (df.Volume.max() - df.Volume.mix())
#df.columns

## Categorizations

In [22]:
#normalized = ['Volume_norm', 'Difficulty_norm']

## Calculations

In [23]:
#calculate wOBA on 14 day player batting

#df['wOBA'] = (0.69*df['BB'] + 0.72*df['HBP'] + 0.89*df['H'] + 1.27*df['2B'] + 1.62*df['3B'] + 2.10*df['HR']) / (df['AB'] + df['BB'] - df['IBB'] + df['SF'] + df['HBP'])

In [24]:
#df.head()

## Export Data

In [25]:
#EXPORT

df.to_csv('C:\\Users\\b7tbu\\JUPYTER PROJECTS\\ANALYTICO\\Data_Exports\\Team\\Batting\\EXPORT_annual.csv', index=False)