## Hello World

Sources:
https://github.com/jldbc/pybaseball
https://www.kaggle.com/code/weslayton/fangraphs-baseball-scraper-analysis

In [1]:
# Import general libraries
import numpy as np
import pandas as pd

pd.set_option("display.precision", 2)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from scipy import stats
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

In [2]:
# Install latest pybaseball
#pip install pybaseball

In [3]:
# Import data sources
import pybaseball

In [4]:
from pybaseball import pitching_stats_range

## Collect Data

### 14 Day Data

In [5]:
from datetime import datetime, timedelta

In [6]:
# Calculate current date and date 14 days ago
today = datetime.now().date()

two_weeks_ago = today - timedelta(days=14)

print(today)
print(two_weeks_ago)

2023-06-14
2023-05-31


In [7]:
# Get the current date
today = datetime.now().date()

# Calculate the date 14 days ago
two_weeks_ago = today - timedelta(days=14)

# Convert the dates to strings
today = today.isoformat()
two_weeks_ago = two_weeks_ago.isoformat()

In [8]:
# Call data
df = pitching_stats_range(today,two_weeks_ago)

In [9]:
df.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')

# Clean Data

## Player Batting (14 Day)

In [10]:
# DEAL with double team values in Team column

In [11]:
print("Pre cleaning shape:", df.shape)

Pre cleaning shape: (462, 41)


In [12]:
df.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,IP,H,R,ER,BB,SO,HR,HBP,ERA,AB,2B,3B,IBB,GDP,SF,SB,CS,PO,BF,Pit,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID
1,Andrew Abbott,24,4,Maj-NL,Cincinnati,2,2,2.0,,,11.2,6,0,0,7,10,0,0,0.0,40,1,0,0,1,0,0,0,0,47,210,0.61,0.17,0.07,0.27,0.33,0.03,1.114,0.2,7.7,1.43,671096
2,Albert Abreu,27,5,Maj-AL,New York,3,0,,,,2.1,3,1,1,0,1,1,0,3.86,10,0,0,0,0,0,0,0,0,10,38,0.63,0.18,0.05,0.33,0.33,0.11,1.286,0.25,3.9,,656061
3,Bryan Abreu,26,1,Maj-AL,Houston,7,0,,,,7.1,8,1,1,2,9,0,0,1.23,28,2,0,0,1,0,0,1,0,30,118,0.61,0.16,0.14,0.37,0.26,0.11,1.364,0.421,11.0,4.5,650556
4,Jason Adam,31,3,Maj-AL,Tampa Bay,6,0,1.0,,3.0,6.0,3,1,1,5,7,0,3,1.5,19,1,0,0,2,0,4,0,0,27,114,0.61,0.14,0.18,0.58,0.17,0.0,1.333,0.25,10.5,1.4,592094
5,Austin Adams,32,2,Maj-NL,Arizona,6,0,,,,4.1,3,0,0,1,5,0,1,0.0,16,0,0,0,0,0,0,0,0,18,86,0.63,0.13,0.13,0.36,0.18,0.09,0.923,0.273,10.4,5.0,613534


In [13]:
df.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')

In [14]:
print("Pre removal ", df.shape)
#df = df.drop(['#days'], axis = 1)
print("Post removal ", df.shape)

Pre removal  (462, 41)
Post removal  (462, 41)


In [15]:
df.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')

In [16]:
# rename columns
#df = df.rename(columns = {'xyz': 'abc'})
#df.columns

In [17]:
# count nulls
counts = df.isna().sum()
percentages = round(df.isna().mean() * 100, 1)
null_values = pd.concat([counts, percentages], axis=1, keys=["count", "%"])
print(null_values)

       count        %
Name       0  0.00000
Age        0  0.00000
#days      0  0.00000
Lev        0  0.00000
Tm         0  0.00000
G          0  0.00000
GS         0  0.00000
W        311 67.30000
L        311 67.30000
SV       408 88.30000
IP         0  0.00000
H          0  0.00000
R          0  0.00000
ER         0  0.00000
BB         0  0.00000
SO         0  0.00000
HR         0  0.00000
HBP        0  0.00000
ERA        0  0.00000
AB         0  0.00000
2B         0  0.00000
3B         0  0.00000
IBB        0  0.00000
GDP        0  0.00000
SF         0  0.00000
SB         0  0.00000
CS         0  0.00000
PO         0  0.00000
BF         0  0.00000
Pit        0  0.00000
Str        0  0.00000
StL        0  0.00000
StS        0  0.00000
GB/FB      0  0.00000
LD         0  0.00000
PU         0  0.00000
WHIP       2  0.40000
BAbip      0  0.00000
SO9        2  0.40000
SO/W      78 16.90000
mlbID      0  0.00000


In [18]:
# drop nulls
print("Pre removal ", df.shape)
#df = df.dropna()
print("Post removal ", df.shape)

Pre removal  (462, 41)
Post removal  (462, 41)


In [19]:
# count duplicates
len(df['Name'])-len(df['Name'].drop_duplicates())

1

In [20]:
# remove dups

print("Pre removal ", df.shape)
df = df.drop_duplicates(subset=['Name'])
print("Post removal ", df.shape)

Pre removal  (462, 41)
Post removal  (461, 41)


In [21]:
# remove outliers

#print("Pre removal ", df.shape)
#df = df[(np.abs(stats.zscore(df['xyz'])) < 3)]
#print("Post removal ", df.shape)

# Data Restructing

## Variable Restructing

### Level

In [22]:
# change column values
df['Lev'] = df['Lev'].map({'Maj-AL': 'AL', 'Maj-NL': 'NL'})

In [23]:
# rename column
df.rename(columns={'Lev': 'League'}, inplace=True)

In [24]:
# verify changes
df[["League", "Name"]].groupby("League").count()

Unnamed: 0_level_0,Name
League,Unnamed: 1_level_1
AL,232
NL,228


## Normalizations

In [25]:
#df=['Volume_norm'] = (df.Volume - df.Volume.mean()) / (df.Volume.max() - df.Volume.mix())
#df.columns

## Categorizations

In [26]:
#normalized = ['Volume_norm', 'Difficulty_norm']

## Calculations

In [27]:
#calculate wOBA on 14 day player batting

#df['wOBA'] = (0.69*df['BB'] + 0.72*df['HBP'] + 0.89*df['H'] + 1.27*df['2B'] + 1.62*df['3B'] + 2.10*df['HR']) / (df['AB'] + df['BB'] - df['IBB'] + df['SF'] + df['HBP'])

In [28]:
df.head()

Unnamed: 0,Name,Age,#days,League,Tm,G,GS,W,L,SV,IP,H,R,ER,BB,SO,HR,HBP,ERA,AB,2B,3B,IBB,GDP,SF,SB,CS,PO,BF,Pit,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID
1,Andrew Abbott,24,4,NL,Cincinnati,2,2,2.0,,,11.2,6,0,0,7,10,0,0,0.0,40,1,0,0,1,0,0,0,0,47,210,0.61,0.17,0.07,0.27,0.33,0.03,1.114,0.2,7.7,1.43,671096
2,Albert Abreu,27,5,AL,New York,3,0,,,,2.1,3,1,1,0,1,1,0,3.86,10,0,0,0,0,0,0,0,0,10,38,0.63,0.18,0.05,0.33,0.33,0.11,1.286,0.25,3.9,,656061
3,Bryan Abreu,26,1,AL,Houston,7,0,,,,7.1,8,1,1,2,9,0,0,1.23,28,2,0,0,1,0,0,1,0,30,118,0.61,0.16,0.14,0.37,0.26,0.11,1.364,0.421,11.0,4.5,650556
4,Jason Adam,31,3,AL,Tampa Bay,6,0,1.0,,3.0,6.0,3,1,1,5,7,0,3,1.5,19,1,0,0,2,0,4,0,0,27,114,0.61,0.14,0.18,0.58,0.17,0.0,1.333,0.25,10.5,1.4,592094
5,Austin Adams,32,2,NL,Arizona,6,0,,,,4.1,3,0,0,1,5,0,1,0.0,16,0,0,0,0,0,0,0,0,18,86,0.63,0.13,0.13,0.36,0.18,0.09,0.923,0.273,10.4,5.0,613534


## Export Data

In [29]:
#EXPORT

df.to_csv('C:\\Users\\b7tbu\\JUPYTER PROJECTS\\ANALYTICO\\Data_Exports\\Player\\Pitching\\EXPORT_14day.csv', index=False)