In [24]:
##displays all outputs of each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd

In [25]:
##reads in batting csv file
batting = pd.read_csv('C:/Users/Andrew Fish/Documents/Baseball Data/LahmanData2023/Batting.csv')
##reads in pitching csv file
pitching = pd.read_csv('C:/Users/Andrew Fish/Documents/Baseball Data/LahmanData2023/Pitching.csv')
##reads in people csv file has all bio information of players
master = pd.read_csv('C:/Users/Andrew Fish/Documents/Baseball Data/LahmanData2023/People.csv')

##setting index to playerID for easy search through
master.index = master['playerID']

##function with purpose of retrieving full name from master with a given playerID
def get_player_name(playerID):
    firstname = master.nameFirst[playerID]
    lastname = master.nameLast[playerID]
    fullname = str(firstname) + " " + str(lastname)
    return fullname

In [26]:
##purpose of function is to output the batter rankings filtered by year and player
##year must be inputed as a single integer or a list of integers and player an optional string parameter
def get_batter_ranks(years, player = None):
    ##will use OBP, HR, R, RBI, SB as those are the hitting statistics used in my rotisserie fantasy baseball league
    ##adding obp since all the others are in the batting data frame already
    batting['OBP'] = np.round(((batting['H'] + batting['BB'] + batting['HBP']) / 
                      (batting['AB'] + batting['BB'] + batting['SF'] + batting ['HBP'])), 3)

    batter_names = []
    ##loop to get full names from the master list using function created in 2nd cell
    for i in range(len(batting['playerID'])):
        name = get_player_name(batting.playerID[i])
        batter_names.append(name)
    batting['Name'] = batter_names
    
    ##creates data frame with desired stats
    batter_ranks = pd.DataFrame({'Season' : batting['yearID'],
                                 'Team' : batting['teamID'],
                                 'OBP' : batting['OBP'],
                                 'HR' : batting['HR'],
                                 'R' : batting['R'],
                                 'RBI' : batting['RBI'],
                                 'SB' : batting['SB']})
    ##creates index as player name and then fills na to 0
    batter_ranks.index = batting['Name']
    batter_ranks = batter_ranks.fillna(0)
    ##filters for desired season want to do this before creating rankings so only ranking desired years
    batter_ranks = batter_ranks[np.in1d(batter_ranks['Season'], years)]

    ##adding ranks for each stat
    batter_ranks['OBP_rank'] = batter_ranks.OBP.rank(ascending = False)
    batter_ranks['HR_rank'] = batter_ranks.HR.rank(ascending = False)
    batter_ranks['R_rank'] = batter_ranks.R.rank(ascending = False)
    batter_ranks['RBI_rank'] = batter_ranks.RBI.rank(ascending = False)
    batter_ranks['SB_rank'] = batter_ranks.SB.rank(ascending = False)
    ##sums all the individual ranks then it ranks them
    batter_ranks['Overall_rank'] = (batter_ranks.OBP_rank + batter_ranks.HR_rank + batter_ranks.R_rank +
                                    batter_ranks.RBI_rank + batter_ranks.SB_rank).rank(method = 'first')
    ##sorts by overall rank
    batter_ranks = batter_ranks.sort_values(by = 'Overall_rank')

    ##if statment to deal with optional player parameter
    if player is None:
        ##if none returns the first 20 elements of batter_ranks with the filtered years
        return batter_ranks.head(20)
    else:
        ##if there is a player returns the elements with that player as the index
        return batter_ranks.loc[player]

In [27]:
get_batter_ranks([2023])

Unnamed: 0_level_0,Season,Team,OBP,HR,R,RBI,SB,OBP_rank,HR_rank,R_rank,RBI_rank,SB_rank,Overall_rank
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Ronald Acuna,2023,ATL,0.416,41,149,106.0,73.0,16.0,5.0,1.0,8.5,1.0,1.0
Freddie Freeman,2023,LAN,0.41,29,131,102.0,23.0,20.0,30.5,2.0,15.0,34.0,2.0
Shohei Ohtani,2023,LAA,0.412,44,102,95.0,20.0,18.0,4.0,14.0,32.0,45.5,3.0
Mookie Betts,2023,LAN,0.408,39,126,107.0,14.0,23.0,7.5,4.0,6.5,74.0,4.0
Kyle Tucker,2023,HOU,0.369,29,97,112.0,30.0,59.5,30.5,20.0,3.0,16.5,5.0
Juan Soto,2023,SDN,0.41,35,97,109.0,12.0,20.0,14.0,20.0,4.5,96.0,6.0
Corbin Carroll,2023,ARI,0.362,25,116,76.0,54.0,81.5,49.0,7.0,66.0,3.0,7.0
Cody Bellinger,2023,CHN,0.356,26,95,97.0,20.0,96.5,40.5,24.5,24.0,45.5,8.0
Francisco Lindor,2023,NYN,0.336,31,108,98.0,31.0,169.5,22.5,9.0,20.0,13.5,9.0
Julio Rodriguez,2023,SEA,0.333,32,102,103.0,37.0,182.0,20.0,14.0,13.0,8.0,10.0


In [28]:
##will do the same for pitching this will be split into starters (sp) and closers (cp)
##sp will have wins while cp will have saves otherwise the categories are the same
def get_sp_ranks(years, player = None):
    ##will use W, K, S, ERA, and WHIP as those are the pitching statistics for my rotisserie fanasty baseball leauge
    ##adding WHIP since that isn't one of the stats tracked. Will have to divide IPouts by 3
    pitching['WHIP'] = np.round(((pitching['BB'] + pitching['H']) / (pitching['IPouts'] / 3)), 2)
    
    pitcher_names = []
    ##loops through pitching to find player name from player id
    for i in range(len(pitching['playerID'])):
        name = get_player_name(pitching.playerID[i])
        pitcher_names.append(name)
    pitching['Name'] = pitcher_names

    ##data frame for starting pitching
    sp_ranks = pd.DataFrame({'Season' : pitching['yearID'],
                             'Team' : pitching['teamID'],
                             'W' : pitching['W'],
                             'K' : pitching['SO'],
                             'ERA' : pitching['ERA'],
                             'WHIP' : pitching['WHIP']})
    ##index as player name and fill na to 0
    sp_ranks.index = pitching['Name']
    sp_ranks = sp_ranks.fillna(0)
    ##filtering before rankings so the rankings change based on the years
    sp_ranks = sp_ranks[np.in1d(sp_ranks['Season'], years)]

    ##adding ranks for each stat
    sp_ranks['W_rank'] = sp_ranks.W.rank(ascending = False)
    sp_ranks['K_rank'] = sp_ranks.K.rank(ascending = False)
    ##no ascending = False for ERA and WHIP bc lower is better
    sp_ranks['ERA_rank'] = sp_ranks.ERA.rank()
    sp_ranks['WHIP_rank'] = sp_ranks.WHIP.rank()
    ##creating overall rank by summing ranks then ranking by the sum
    sp_ranks['Overall_rank'] = (sp_ranks.W_rank + sp_ranks.K_rank +
                                     sp_ranks.ERA_rank + sp_ranks.WHIP_rank).rank(method = 'first')
    ##sort by overall rank
    sp_ranks = sp_ranks.sort_values(by = 'Overall_rank')

    ##if statment to deal with optional player parameter
    if player is None:
        ##if none returns the first 20 elements of batter_ranks with the filtered years
        return sp_ranks.head(20)
    else:
        ##if there is a player returns the elements with that player as the index
        return sp_ranks.loc[player]

In [29]:
get_sp_ranks([2022, 2023], 'Joe Ryan')

Unnamed: 0_level_0,Season,Team,W,K,ERA,WHIP,W_rank,K_rank,ERA_rank,WHIP_rank,Overall_rank
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Joe Ryan,2022,MIN,13,151,3.55,1.1,38.5,89.0,646.5,383.0,80.0
Joe Ryan,2023,MIN,11,197,4.51,1.17,81.0,29.5,1063.0,529.0,219.0


In [32]:
##closing pitcher
def get_cp_ranks(years, player = None):
    
    cp_ranks = pd.DataFrame({'Season' : pitching['yearID'],
                             'Team' : pitching['teamID'],
                             'S' : pitching['SV'],
                             'K' : pitching['SO'],
                             'ERA' : pitching['ERA'],
                             'WHIP' : pitching['WHIP']})
    ##index as player id and fill na to 0
    cp_ranks.index = pitching['Name']
    cp_ranks = cp_ranks.fillna(0)
    ##filter for years
    cp_ranks = cp_ranks[np.in1d(cp_ranks['Season'], years)]
    
    ##adding ranks for each stat
    cp_ranks['S_rank'] = cp_ranks.S.rank(ascending = False)
    cp_ranks['K_rank'] = cp_ranks.K.rank(ascending = False)
    ##no ascending = False for ERA and WHIP bc lower is better
    cp_ranks['ERA_rank'] = cp_ranks.ERA.rank()
    cp_ranks['WHIP_rank'] = cp_ranks.WHIP.rank()
    ##creating overall rank by summing ranks then ranking by the sum
    cp_ranks['Overall_rank'] = (cp_ranks.S_rank + cp_ranks.K_rank +
                                     cp_ranks.ERA_rank + cp_ranks.WHIP_rank).rank(method = 'first')
    ##sort by overall rank
    cp_ranks = cp_ranks.sort_values(by = 'Overall_rank')

     ##if statment to deal with optional player parameter
    if player is None:
        ##if none returns the first 20 elements of batter_ranks with the filtered years
        return cp_ranks.head(20)
    else:
        ##if there is a player returns the elements with that player as the index
        return cp_ranks.loc[player]

In [33]:
get_cp_ranks([2021, 2022, 2023], 'Emmanuel Clase')

Unnamed: 0_level_0,Season,Team,S,K,ERA,WHIP,S_rank,K_rank,ERA_rank,WHIP_rank,Overall_rank
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Emmanuel Clase,2022,CLE,42,77,1.36,0.73,2.0,517.5,241.5,90.5,5.0
Emmanuel Clase,2021,CLE,24,74,1.29,0.96,41.0,555.0,234.5,247.0,12.0
Emmanuel Clase,2023,CLE,44,64,3.22,1.16,1.0,699.5,810.5,765.5,150.0
