## Baseball Stats

In [None]:
import os
import sqlite3 as sql

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from scipy import stats

#from fangraphs import fangraph
import bb_stat_utils

%matplotlib inline


In [None]:
# Run this cell to generate the SQLite DB.
# This may take a while (~30 mins)
#fangraph.generate_sqllite_tables()

In [None]:
# Point to the SQLite DB.
# Set the BASEBALL_DB_PATH environment variable to use a copy stored elsewhere;
# otherwise the original development location is used.
db_path = os.environ.get(
    'BASEBALL_DB_PATH',
    r'/home/zach/dev/demos/baseball_stats/2021_Baseball-final.db',
)

# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as a confusing "no such table" error.
# Fail here instead, with the offending path in the message.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f'SQLite database not found: {db_path}')
conn = sql.connect(db_path)

# Sanitizing Data
The batting and pitching data are individual game-level stats. 
Here we cast all basic stats to integers, as they are all stored as strings in the database.
Note that the cast fails and is ignored (i.e. the column is left unchanged) for non-integer data such as batting average, ERA, slugging percentage, etc.
These statistics, when of interest, are rederived later from the raw data. 
We also turn the date string into a much more useful datetime object. 

## A Note on Innings Pitched
By convention, baseball records fractional innings in thirds: .1 means 1/3 of an inning and .2 means 2/3. This is similar to a base-3 system applied only to the digit after the decimal point, so 3.1 innings is 3 1/3 innings in base 10. 


In [None]:
import os
# Show whether the database file is present on disk (True/False is the cell output).
# NOTE(review): sqlite3.connect() creates the file when it is missing, so once the
# connection cell has run this can report True even for a mistyped path whose
# directory exists -- confirm whether this check is meant to run before connecting.
os.path.exists(db_path)

In [None]:
# Clean Batting Data
# Read every per-game batting row, then rebuild the frame column by column,
# casting each column to int where possible. errors='ignore' returns the column
# unchanged when the cast fails, so non-integer columns pass through untouched.
batting_df = pd.DataFrame({
    column_name: values.astype('int', errors='ignore')
    for column_name, values in pd.read_sql('SELECT * FROM bat', con=conn).items()
})

# Parse the 'Date' strings (YYYY-MM-DD) into real timestamps.
batting_df['Datetime'] = pd.to_datetime(batting_df['Date'], format="%Y-%m-%d")

In [None]:
# Clean Position Data (Games Played at Each Position)
# Make a 0/1 indicator column for each position so that summing a column counts
# the games played at that position. Each position is prefixed with 'f' to
# indicate it is a fielding stat and to avoid conflating Singles (1B) with the
# first base position.
#
# 'Pos' may list several positions for one game joined by '-' (e.g. 'SS-RF'),
# so every listed position is credited for that game. Matching 'Pos' by exact
# equality would credit such a game to no position at all.
positions_by_game = batting_df['Pos'].str.split('-')

# Collect the distinct positions in order of first appearance.
unique_pos = []
for dashed_pos in batting_df['Pos'].dropna().unique():
    for pos in dashed_pos.split('-'):
        pos_col = 'f' + pos
        if pos_col not in unique_pos:
            unique_pos.append(pos_col)

for pos_col in unique_pos:
    # pos_col[1:] strips the 'f' prefix to recover the raw position label.
    # Rows with a missing 'Pos' are not lists after the split and get 0.
    batting_df[pos_col] = positions_by_game.apply(
        lambda played, position=pos_col[1:]: int(isinstance(played, list) and position in played)
    )

# Add Aggregate Positions (Like RF/CF/LF) being interchangeable
# Use max rather than sum so a game split between outfield spots (e.g. 'RF-CF')
# still counts as a single game in the outfield.
batting_df['fOF'] = batting_df[['fRF', 'fCF', 'fLF']].max(axis=1)
unique_pos.append('fOF')

In [None]:
# Sanity Check (Tatis Played Games at SS, RF, CF, DH)
# Select the rows whose player name contains 'tatis-jr', then total each
# position indicator column; the totals are the cell output.
is_tatis = batting_df['name'].str.contains('tatis-jr')
batting_df.loc[is_tatis, unique_pos].sum()

In [None]:
# Aggregate Batting Data for a Full Season
# One row per (id, name) with every remaining column summed over the player's games.
# 'Datetime' is excluded first: a sum of timestamps is meaningless, and
# pandas >= 2.0 raises TypeError on it instead of silently dropping the column
# as older versions did. The pitching aggregate avoids the same problem by
# summing before its 'Datetime' column is added.
batting_df_sum = batting_df.drop(columns='Datetime').groupby(['id', 'name']).sum()

In [None]:
# Clean Pitching Data
# Read every per-game pitching row, then rebuild the frame column by column,
# casting each column to int where possible. errors='ignore' returns the column
# unchanged when the cast fails.
pitching_df = pd.DataFrame({
    column_name: values.astype('int', errors='ignore')
    for column_name, values in pd.read_sql('SELECT * FROM pit', con=conn).items()
})

# Run every innings-pitched value through the project's basis-conversion helper.
pitching_df['IP'] = pitching_df['IP'].apply(bb_stat_utils.IP_basis_conversion)

# Season totals per pitcher name. This runs before 'Datetime' is added, so the
# timestamp column is never part of the sum.
pitching_df_sum = pitching_df.groupby('name').sum()

# Parse the 'Date' strings (YYYY-MM-DD) into real timestamps.
pitching_df['Datetime'] = pd.to_datetime(pitching_df['Date'], format="%Y-%m-%d")

In [None]:
# Calculate 'advanced', non-integer metrics for the full season from the
# aggregated season totals.
# The return values are not used here: these functions are applied in place,
# i.e. each is expected to add its derived columns to the frame it is given.
# NOTE(review): presumably calc_SLUG_TOT provides the 'SLUG_TOT' / 'OB_TOT'
# columns and calc_WHIP_TOT the 'WH_TOT' column used by the TOPSIS cells below
# -- confirm in bb_stat_utils.
bb_stat_utils.calc_SLUG_TOT(batting_df_sum)
bb_stat_utils.calc_WHIP_TOT(pitching_df_sum)

# TOPSIS

In [None]:
# Rank hitters position by position with TOPSIS and show, for each position,
# the top of the ranking plus diagnostic plots of the score columns.
cats = ['HR', 'RBI', 'OB_TOT', 'SB', 'SLUG_TOT']
# Every batting category gets power 1.0.
# NOTE(review): presumably the sign marks whether higher is better (the pitching
# cell uses -1.0 for ER and WH_TOT) -- confirm in bb_stat_utils.topsis.
cats_power = {key: 1.0 for key in cats}
report_cols = cats + ['distance_from_ideals', 'improvement', 'pick_value']

# Only players with strictly more than this many games at a position are ranked there.
min_games_played_at_pos = 20
for pos in unique_pos:
    mask = batting_df_sum[pos] > min_games_played_at_pos
    sub_df = batting_df_sum.loc[mask, :].copy()
    if sub_df.empty:
        # Nobody qualifies at this position; nothing to rank or plot.
        continue

    score_df = bb_stat_utils.topsis(df=sub_df, cats=cats, cats_power=cats_power, csv_name=None)
    # Return value unused; 'improvement' and 'pick_value' are read from score_df
    # below, so this presumably adds them in place -- confirm in bb_stat_utils.
    bb_stat_utils.determine_pick_value(score_df)

    # Top ten rows for this position. The closing tag must be '</b>': the
    # previous '<\b>' was a backspace escape, which left the <b> element unclosed.
    display(HTML(f'<h1>{pos}</h1><b>' + score_df.iloc[:10][report_cols].to_html() + '</b>'))

    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(16, 9))
    fig.suptitle(pos)
    ax[0].plot(score_df['improvement'].values)
    ax[0].set_ylabel('% Further Than Previous')
    ax[1].plot(score_df['distance_from_ideals'].values)
    ax[1].set_ylabel('Distance From Ideal')
    # NOTE(review): this is a histogram of 'improvement' but it is labelled
    # 'Pick Value' -- confirm whether 'pick_value' was meant to be plotted here.
    ax[2].hist(score_df['improvement'].values, bins=20)
    ax[2].set_ylabel('Pick Value')
    for axis in ax:
        axis.grid()

    # Render now so each figure appears directly under its own table; without
    # this the inline backend emits every figure only after the whole cell finishes.
    plt.show()
        

In [None]:
# Rank pitchers with TOPSIS over the season totals and show the top five.
# ER and WH_TOT carry power -1.0; every other category carries 1.0.
cats_power = {
    'ER': -1.0,
    'QS_STAND': 1.0,
    'SV': 1.0,
    'WH_TOT': -1.0,
    'SO': 1.0,
    'IP': 1.0,
}
# The category list is the dict's keys, in the order written above.
cats = list(cats_power)

score_df = bb_stat_utils.topsis(df=pitching_df_sum, cats=cats, cats_power=cats_power, csv_name=None)
bb_stat_utils.determine_pick_value(score_df)

top_pitchers = score_df.iloc[:5][cats + ['distance_from_ideals']]
display(HTML('<h1>Pitching</h1>' + top_pitchers.to_html()))
