## Hello World

Sources:
https://github.com/jldbc/pybaseball
https://www.kaggle.com/code/weslayton/fangraphs-baseball-scraper-analysis

In [1]:
#import general libraries
import numpy as np
import pandas as pd

pd.set_option("display.precision", 2)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from scipy import stats
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

In [2]:
#install latest pybaseball
!pip install --upgrade pybaseball



In [3]:
#import data sources
import pybaseball

In [4]:
from pybaseball import batting_stats_range

## Collect Data

### 14 Day Data

In [5]:
from datetime import datetime, timedelta

In [6]:
# calculate current date and date 14 days ago
today = datetime.now().date()

two_weeks_ago = today - timedelta(days=14)

print(today)
print(two_weeks_ago)

2023-06-14
2023-05-31


In [7]:
# Get the current date
today = datetime.now().date()

# Calculate the date 14 days ago
two_weeks_ago = today - timedelta(days=14)

# Convert the dates to strings
today = today.isoformat()
two_weeks_ago = two_weeks_ago.isoformat()

In [8]:
# Call data
df = batting_stats_range(today,two_weeks_ago)

In [9]:
df.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS', 'mlbID'],
      dtype='object')

# Clean Data

## Player Batting (14 Day)

In [10]:
# deal with double team values in Team column

In [11]:
print("Pre cleaning shape:", df.shape)

Pre cleaning shape: (437, 28)


In [12]:
df.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlbID
1,CJ Abrams,22,1,Maj-NL,Washington,8,31,31,2,5,2,0,1,1,0,0,6,0,0,0,2,0,0,0.161,0.161,0.323,0.484,682928
2,Jos\xc3\xa9 Abreu,36,1,Maj-AL,Houston,12,52,48,3,13,2,0,2,10,2,0,14,1,0,1,1,0,1,0.271,0.308,0.438,0.745,547989
3,Ronald Acuna Jr.,25,2,Maj-NL,Atlanta,11,52,51,8,18,6,0,2,10,1,0,6,0,0,0,2,6,1,0.353,0.365,0.588,0.954,660670
4,Willy Adames,27,1,Maj-NL,Milwaukee,6,28,27,4,6,1,0,1,1,1,0,10,0,0,0,0,0,0,0.222,0.25,0.37,0.62,642715
5,Riley Adams,27,4,Maj-NL,Washington,3,11,11,0,2,0,0,0,0,0,0,4,0,0,0,0,0,0,0.182,0.182,0.182,0.364,656180


In [13]:
df.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS', 'mlbID'],
      dtype='object')

In [14]:
print("Pre removal ", df.shape)
df = df.drop(['#days'], axis = 1)
df = df.drop(['BA'], axis = 1)
df = df.drop(['OBP'], axis = 1)
df = df.drop(['SLG'], axis = 1)
df = df.drop(['OPS'], axis = 1)
df = df.drop(['GDP'], axis = 1)
df = df.drop(['RBI'], axis = 1)

print("Post removal ", df.shape)

Pre removal  (437, 28)
Post removal  (437, 21)


In [15]:
df.columns

Index(['Name', 'Age', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR',
       'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'SB', 'CS', 'mlbID'],
      dtype='object')

In [16]:
# rename columns
#df = df.rename(columns = {'xyz': 'abc'})
#df.columns

In [17]:
# count nulls
counts = df.isna().sum()
percentages = round(df.isna().mean() * 100, 1)
null_values = pd.concat([counts, percentages], axis=1, keys=["count", "%"])
print(null_values)

       count       %
Name       0 0.00000
Age        0 0.00000
Lev        0 0.00000
Tm         0 0.00000
G          0 0.00000
PA         0 0.00000
AB         0 0.00000
R          0 0.00000
H          0 0.00000
2B         0 0.00000
3B         0 0.00000
HR         0 0.00000
BB         0 0.00000
IBB        0 0.00000
SO         0 0.00000
HBP        0 0.00000
SH         0 0.00000
SF         0 0.00000
SB         0 0.00000
CS         0 0.00000
mlbID      0 0.00000


In [18]:
# drop nulls
print("Pre removal ", df.shape)
df = df.dropna()
print("Post removal ", df.shape)

Pre removal  (437, 21)
Post removal  (437, 21)


In [19]:
# count duplicates
len(df['Name'])-len(df['Name'].drop_duplicates())

0

In [20]:
# remove dups

print("Pre removal ", df.shape)
df = df.drop_duplicates(subset=['Name'])
print("Post removal ", df.shape)

Pre removal  (437, 21)
Post removal  (437, 21)


In [21]:
# remove outliers

#print("Pre removal ", df.shape)
#df = df[(np.abs(stats.zscore(df['xyz'])) < 3)]
#print("Post removal ", df.shape)

# Data Restructing

## Variable Restructing

### Level

In [22]:
# change column values
df['Lev'] = df['Lev'].map({'Maj-AL': 'AL', 'Maj-NL': 'NL'})

In [23]:
# rename column
df.rename(columns={'Lev': 'League'}, inplace=True)

In [24]:
# verify changes
df[["League", "Name"]].groupby("League").count()

Unnamed: 0_level_0,Name
League,Unnamed: 1_level_1
AL,222
NL,215


## Normalizations

In [25]:
#df=['Volume_norm'] = (df.Volume - df.Volume.mean()) / (df.Volume.max() - df.Volume.mix())
#df.columns

## Categorizations

In [26]:
#normalized = ['Volume_norm', 'Difficulty_norm']

## Calculations

In [27]:
#calculate wOBA on 14 day player batting

df['wOBA'] = (0.69*df['BB'] + 0.72*df['HBP'] + 0.89*df['H'] + 1.27*df['2B'] + 1.62*df['3B'] + 2.10*df['HR']) / (df['AB'] + df['BB'] - df['IBB'] + df['SF'] + df['HBP'])

In [28]:
df.head()

Unnamed: 0,Name,Age,League,Tm,G,PA,AB,R,H,2B,3B,HR,BB,IBB,SO,HBP,SH,SF,SB,CS,mlbID,wOBA
1,CJ Abrams,22,NL,Washington,8,31,31,2,5,2,0,1,0,0,6,0,0,0,0,0,682928,0.29323
2,Jos\xc3\xa9 Abreu,36,AL,Houston,12,52,48,3,13,2,0,2,2,0,14,1,0,1,0,1,547989,0.3925
3,Ronald Acuna Jr.,25,NL,Atlanta,11,52,51,8,18,6,0,2,1,0,6,0,0,0,6,1,660670,0.54865
4,Willy Adames,27,NL,Milwaukee,6,28,27,4,6,1,0,1,1,0,10,0,0,0,0,0,642715,0.33571
5,Riley Adams,27,NL,Washington,3,11,11,0,2,0,0,0,0,0,4,0,0,0,0,0,656180,0.16182


## Export Data

In [29]:
#EXPORT

df.to_csv('C:\\Users\\b7tbu\\JUPYTER PROJECTS\\ANALYTICO\\Data_Exports\\Player\\Batting\\EXPORT_14day.csv', index=False)