## Baseball Stats

In [1]:
import os
import sqlite3 as sql
import logging

import pandas as pd
import numpy as np

from scrapers import fangraph
import bb_stat_utils

logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO)
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Point to the SQLLTIE Db
db_path = r'/home/zach/dev/demos/2021_Baseball_PURE.db'
if not os.path.exists(db_path):
    #This may take a while (~30 mins)
    fangraphscraper = fangraph.FangraphScraper(db_path)
    fangraphscraper.generate_sqllite_tables()
conn = sql.connect(db_path)

### Spot Check Batting Data Quality 
According to [this baseball reference link](https://www.baseball-reference.com/players/t/tatisfe02.shtml) Tatis had the following select stats in 2021 Regular Season

| Stat  |Value |
| ------| -----|
| AB    | 478|
| HR    | 42     |
| RBI   | 97 |

Running the code below, our data set agrees


In [3]:
tatis_df = pd.read_sql("SELECT * FROM bat WHERE name LIKE '%tatis%'", con=conn)
print(tatis_df.columns)
tatis_df[['HR', 'AB', 'RBI']].sum()


Index(['index', 'Team', 'Opp', 'BO', 'Pos', 'G', 'AB', 'PA', 'H', '1B', '2B',
       '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP',
       'SB', 'CS', 'datetime', 'name', 'id', 'fDH', 'fC', 'f1B', 'F2B', 'fSS',
       'f3B', 'fPH', 'fLF', 'fCF', 'fRF', 'fOF'],
      dtype='object')


HR      42
AB     478
RBI     97
dtype: int64

### Spot Check Pitching Data Quality 
According to [this baseball reference link](https://www.baseball-reference.com/players/d/darviyu01.shtml) Yu Darvish had the following select stats in 2021 Regular Season

| Stat  |Value |
| ------| -----|
| GS    | 30|
| IP    | 166.1     | 
| H   | 138 |
|ERA | 4.22|

Running the code below, our data set agrees, Note that 0.1 is equivalent to 0.333 for innings pitched, and that ERA is a derived parameter. (ER/(IP/9))


In [4]:
darvish_df = pd.read_sql("SELECT * FROM pit WHERE name LIKE '%darvish%'", con=conn)

darvish_df = darvish_df[['ER', 'IP', 'H']].sum()
darvish_df['ERA'] = darvish_df['ER'] / ( darvish_df['IP'] / 9)
darvish_df


ER      78.000000
IP     166.333333
H      138.000000
ERA      4.220441
dtype: float64

Nice, our bespoke data set agrees with an indepedent data set on baseball reference! Now we can do use to draft the ultimate FBB team!

# TOPSIS - Drafting the ideal FBB Team
We can use the [topsis alogrithm](https://en.wikipedia.org/wiki/TOPSIS) (often used in design probems) to decide who is the ideal candidate for a given Baseball position wheile drafting our team.

TOPSIS relies on comparing numerical, what appears uncompareable. For example TOPSIS allows us to compare a player with 32 HRs and 11 SB to a player with 15 HR and 21 SB. 

As shown in the example above, Roto scoring weights all categories, equally, so our TOPSIS algorithm will use equal weights for all categories. As discussed below this does not imply that stolen bases and home runs are equally as common. 

TOPSIS compares all entries (players in this case) and determines which player have the lowest distance from ideal. 

##### Catagorical Rarity - Modification to TOPSIS
The TOPSIS algortithm presented here has actually been slightly modified from the text book definition to account for scarcity, for example RBIs being more common than HR. To see this look into the `topsis` function itself and note the second normalization step not typically present in textbook definitions

## First Cut - No Knowledge of Team Make Up
The analysis below shows the rankings and relative values of all players for the given position (or in pitching just all pitchers)

The offsensive categories are standard in FBB and are the catagories used in the author's league. (OBP- On Base Percentage and SLUG -Slugging Percentage are derived metrics)



In [15]:
cats = ['HR', 'RBI', 'SB','OBP', 'SLG']  
unique_pos = ['fDH', 'fC', 'f1B', 'F2B', 'fSS', 'f3B', 'fPH', 'fLF', 'fCF', 'fRF']

cats_power = {key: 1.0 for key in cats} # This should be negative if a lower value is desired (ie offesnive strikeouts)

# pos = 'fC'
# s = """SELECT
# name,
# id,
# SUM(HR),
# SUM(RBI),
# SUM(SB),
# SUM(1B),
# SUM(2B),
# SUM(3B)
# FROM bat
# WHERE '{}' > 1
# GROUP BY name, id;
# """.format(*cats[:3] + [pos])
# print(s)
s = """SELECT
SUM()
FROM bat
WHERE '{fC}' > 1
GROUP BY name, id;
"""
pos_df = pd.read_sql(s, con=conn)
pos_df
# bb_stat_utils.calc_SLG_OBP(pos_df)
# pos_df[cats + ['name', 'id', pos]]
# Index(['index', 'Team', 'Opp', 'BO', 'Pos', 'G', 'AB', 'PA', 'H', '1B', '2B',
#        '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP',
#        'SB', 'CS', 'datetime', 'name', 'id', 'fDH', 'fC', 'f1B', 'F2B', 'fSS',
#        'f3B', 'fPH', 'fLF', 'fCF', 'fRF', 'fOF'],

# min_games_played_at_pos = 20
# for pos in unique_pos:
#     mask = (batting_df_sum[pos] > min_games_played_at_pos)
#     sub_df = batting_df_sum.loc[mask,:].copy()
#     if len(sub_df) > 0:
#         score_df = bb_stat_utils.topsis(df=sub_df, cats=cats, cats_power=cats_power,  csv_name=None)
#         bb_stat_utils.determine_pick_value(score_df)
#         display(HTML(f'<h1>{pos}</h1><b>' + score_df.iloc[:10][cats + ['distance_from_ideals', 'improvement', 'pick_value']].to_html()+'<\b>'))
        
#         fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16,9))
#         fig.suptitle(pos)
#         ax[0].plot(score_df['distance_from_ideals'].values)
#         ax[0].set_ylabel('Distance From Ideal')
#         ax[1].hist(score_df['improvement'].values, bins=20)
#         ax[1].set_ylabel('Pick Value')
#         ax[0].grid()
#         ax[1].grid()        

DatabaseError: Execution failed on sql 'schema bat': near "schema": syntax error

In [None]:
cats = ['ER', 'QS_STAND', 'SV', 'WH_TOT', 'SO', 'IP']
cats_power = {'ER': -1.0, 'QS_STAND': 1.0, 'SV': 1.0, 'WH_TOT': -1.0, 'SO': 1.0, 'IP' : 1.0}
score_df = bb_stat_utils.topsis(df=pitching_df_sum, cats=cats, cats_power=cats_power, csv_name=None)
bb_stat_utils.determine_pick_value(score_df)

display(HTML(f'<h1>Pitching</h1>' + score_df.iloc[:5][cats + ['distance_from_ideals']].to_html()))
