## Baseball Stats

In [1]:
import os
import sqlite3 as sql

import pandas as pd
import scipy

import bb_stat_utils
from fangraphs import fangraph

In [None]:
# Run this cell to (re)generate the SQLite DB from scratch.
# This may take a while (~30 mins).
# NOTE(review): where the generated database file is written is not visible
# here — confirm it matches the path this notebook connects to.
fangraph.generate_sqllite_tables()

In [4]:
# Point to the SQLite DB. Set the BASEBALL_DB_PATH environment variable to use
# a database stored elsewhere; the original location is kept as the default.
db_path = os.environ.get('BASEBALL_DB_PATH', '/home/zach/dev/demos/2021_Baseball-final.db')

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as a "no such table" error, so fail
# early with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f"SQLite database not found at {db_path!r}; run the generation cell "
        "or set BASEBALL_DB_PATH."
    )
conn = sql.connect(db_path)

# Sanitizing Data
The batting and pitching data are individual game-level stats. 
Here we cast all basic stats as integers, as they are all strings in the database.
Note that the cast fails and is ignored (i.e. the column is left unchanged) for non-integer data, such as batting average, ERA, slugging percentage, etc.
These statistics, when of interest, are rederived later, from the raw data. 
We also turn the date string into a much more useful datetime object. 

## A Note on Innings Pitched
For some reason, baseball stores fractional innings as .1 = 1/3 and .2 = 2/3. This is similar to a base-3 system, but only for the digit after the decimal point, so 3.1 innings is 3 1/3 innings in base 10. 


In [5]:
# Clean Batting Data
# Load the per-game batting rows, then try to cast every column to int.
# astype() with a per-column mapping and errors='ignore' leaves any column
# that cannot be converted (e.g. rate stats, names, dates) exactly as loaded.
batting_df = pd.read_sql('SELECT * FROM bat', con=conn)
batting_df = batting_df.astype(
    {column: 'int' for column in batting_df.columns},
    errors='ignore',
)

# Per-player totals. Taken before the Datetime column is added so the
# aggregate only sees the columns that came from the database.
batting_df_sum = batting_df.groupby('name').sum()

# Parse the ISO-formatted date strings into real datetimes.
batting_df['Datetime'] = pd.to_datetime(batting_df['Date'], format="%Y-%m-%d")

In [6]:
# Clean Pitching Data
# Load the per-game pitching rows, then try to cast every column to int.
# Columns that cannot be converted are left exactly as loaded.
pitching_df = pd.read_sql('SELECT * FROM pit', con=conn)
pitching_df = pitching_df.astype(
    {column: 'int' for column in pitching_df.columns},
    errors='ignore',
)

# Rewrite innings pitched with the project helper — presumably converting the
# ".1 = 1/3 inning" notation into true fractional innings; confirm in
# bb_stat_utils.
pitching_df['IP'] = pitching_df['IP'].apply(bb_stat_utils.IP_basis_conversion)

# Per-player totals, taken before the Datetime column is added.
pitching_df_sum = pitching_df.groupby('name').sum()

# Parse the ISO-formatted date strings into real datetimes.
pitching_df['Datetime'] = pd.to_datetime(pitching_df['Date'], format="%Y-%m-%d")

In [7]:
# Calculate 'advanced', non-integer metrics for the full season.
# Both helpers receive the per-player season totals and their results are not
# assigned to anything, so they presumably add the derived columns to the
# frames in place — TODO confirm against bb_stat_utils.
bb_stat_utils.calc_SLUG_TOT(batting_df_sum)
bb_stat_utils.calc_WHIP_TOT(pitching_df_sum)

# TOPSIS

In [None]:
# Batting ranking: every category carries the same weight of +1.0.
# NOTE(review): the season metrics are computed on batting_df_sum /
# pitching_df_sum earlier in the notebook, yet the game-level frames are
# passed to topsis here — confirm which frame topsis expects.
cats = ['HR', 'RBI', 'OB_TOT', 'SB', 'SLUG_TOT']
cats_power = dict.fromkeys(cats, 1.0)
bb_stat_utils.topsis(
    df=batting_df,
    cats=cats,
    cats_power=cats_power,
    csv_name='bat_rank_bbref.csv',
)

# Pitching ranking: ER and WH_TOT carry a weight of -1.0 (presumably marking
# them as lower-is-better), everything else +1.0. The category list is taken
# from the weight mapping so the two cannot drift apart.
cats_power = {
    'ER': -1.0,
    'QS_STAND': 1.0,
    'SV': 1.0,
    'WH_TOT': -1.0,
    'SO': 1.0,
    'IP': 1.0,
}
cats = list(cats_power)
bb_stat_utils.topsis(df=pitching_df, cats=cats, cats_power=cats_power, csv_name='pitch_rank_bbref.csv')
