In [1]:
import pandas as pd
import re
import requests

from pybaseball import batting_stats_bref

pd.set_option('display.max_columns', None)

In [3]:
## Download data from baseball-reference 

data2020 = batting_stats_bref(2020)
data2020['year'] = 2020

data2019 = batting_stats_bref(2019)
data2019['year'] = 2019

data2018 = batting_stats_bref(2018)
data2018['year'] = 2018

In [173]:
## Add Singles column and split walks into intentional and unintentional

data2020['1B'] = data2020.apply(lambda row: row['H'] - row['2B'] - row['3B'] - row['HR'], axis=1)
data2020['UBB'] = data2020.apply(lambda row: row['BB'] - row['IBB'], axis=1)

data2019['1B'] = data2019.apply(lambda row: row['H'] - row['2B'] - row['3B'] - row['HR'], axis=1)
data2019['UBB'] = data2019.apply(lambda row: row['BB'] - row['IBB'], axis=1)

data2018['1B'] = data2018.apply(lambda row: row['H'] - row['2B'] - row['3B'] - row['HR'], axis=1)
data2018['UBB'] = data2018.apply(lambda row: row['BB'] - row['IBB'], axis=1)


Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,year,1B,UBB
1,José Abreu,33,90,MLB-AL,Chicago,60,262,240,43,76,15,0,19,60,18,1,59,3,0,1,10,0,0,0.317,0.37,0.617,0.987,2020,42,17
2,Ronald Acuna Jr.,22,91,MLB-NL,Atlanta,46,202,160,46,40,11,0,14,29,38,2,60,4,0,0,3,8,1,0.25,0.406,0.581,0.987,2020,15,36
3,Willy Adames,24,91,MLB-AL,Tampa Bay,54,205,185,29,48,15,1,8,23,20,0,74,0,0,0,4,2,1,0.259,0.332,0.481,0.813,2020,24,20
4,Austin Adams,29,90,MLB-NL,San Diego,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,2020,0,0
5,Matt Adams,31,119,MLB-NL,Atlanta,16,51,49,4,9,2,0,2,9,2,0,18,0,0,0,3,0,0,0.184,0.216,0.347,0.563,2020,5,2


In [175]:
## League statistics imported from file donwloaded directly from Fangraphs
## There could be a way to do this automatically and not have to rely on a direct download

league_agg_stats = pd.read_csv('fangraphs_leaguestats.csv')
league_agg_stats['UBB'] = league_agg_stats.apply(lambda row: row['BB'] - row['IBB'], axis=1)
league_agg_stats.head()


Unnamed: 0,Season,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,UBB
0,2018,71590,165432,185139,41018,26322,8264,847,5585,21630,20606,15686,929,41207,1922,1235,823,3457,2474,958,0.248,14757.0
1,2019,71684,166651,186516,42039,25947,8531,785,6776,23467,22471,15895,753,42823,1984,1150,776,3463,2280,832,0.252,15142.0
2,2020,26721,59030,66506,14439,9071,2823,241,2304,8344,7978,6092,202,15586,821,402,126,1237,885,292,0.245,5890.0


In [178]:
## List of relevant columns

cols1 = ['PA', 'AB', 'R', '1B', '2B', '3B', 'HR', 'RBI', 'UBB', 'IBB', 'SO', 'HBP', 'SF', 'SB']

## Convert aggregate league stats to per plate appearance rates

league_avgs_per_pa = league_agg_stats[cols1].div(league_agg_stats.PA, axis=0)

## Grab league rates for 2020
## Copy this row n times, n corresponds to number of players with at least one plate appearance in 2020

league_avgs_per_pa_2020 = league_avgs_per_pa.iloc[[2]]
n = data2020.shape[0]
league_avgs_per_pa_2020 = pd.concat([league_avgs_per_pa_2020] * n)


Unnamed: 0,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
2,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
2,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
2,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
2,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
2,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307


In [181]:
## Create dataframe of player categorical data for players who played in 2020

cols2 = ['Name', 'Age', '#days', 'Lev', 'Tm', 'year']
player_info_2020 = data2020[cols2].reset_index(drop=True)

## Combine the player info table with the league rates table
## Multiply this table so there are three rows for each player

player_info_with_league_rates_2020 = pd.concat([player_info_2020[['Name', 'year']], league_avgs_per_pa_2020.reset_index(drop=True)], axis=1)
player_info_with_league_rates_2020 = pd.concat([player_info_with_league_rates_2020] * 3)

## Combine player statistical data from 2020 to the table of league per PA rates
## There should now be four rows per player

cols3 = ['Name', 'year'] + cols1
stats_with_league_avgs = pd.concat([player_info_with_league_rates_2020.reset_index(drop=True), data2020[cols3].reset_index(drop=True)], axis=0)
stats_with_league_avgs = stats_with_league_avgs.sort_values(by=['Name', 'PA'], ascending=False).reset_index(drop=True)



Unnamed: 0,Name,year,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
0,Zack Collins,2020,18.0,16.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0
1,Zack Collins,2020,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
2,Zack Collins,2020,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
3,Zack Collins,2020,1.0,0.887589,0.125462,0.136394,0.042447,0.003624,0.034643,0.119959,0.088563,0.003037,0.234355,0.012345,0.006045,0.013307
4,Zach McKinstry,2020,7.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0


In [185]:
## This loop goes through the previous table and projects rest of season stats 
## for each player who accrued at least one PA in the 2020 season
## This is necessary because the 2020 season lasted only 60 games, 
## and we need to translate 2020 stats to the same 162 game scale for accurate 2021 forecasting
## The method used is explained by Tom Tango in a blog post linked below
## http://tangotiger.com/index.php/site/article/how-to-handle-the-2020-season-for-forecasting-the-2021-season

numrows = stats_with_league_avgs.shape[0]

for i in range(2, numrows, 4):

    plate_appearances = stats_with_league_avgs.iloc[i - 2, 2]
    ros_plate_appearances = plate_appearances * 1.45
    stats_with_league_avgs.iloc[i, 2] = ros_plate_appearances
    regression_coeff = plate_appearances / (plate_appearances + 300)
    
    for j in range(4, 16):
    
        player_actual_per_pa_avg = stats_with_league_avgs.iloc[i - 2, j] / plate_appearances
        league_avg_metric = stats_with_league_avgs.iloc[i - 1, j]
        stats_with_league_avgs.iloc[i, j] = ((((player_actual_per_pa_avg - league_avg_metric) * regression_coeff) + league_avg_metric) * ros_plate_appearances)

    at_bats = ros_plate_appearances - stats_with_league_avgs.iloc[i, 10] - stats_with_league_avgs.iloc[i, 11] - stats_with_league_avgs.iloc[i, 13] - stats_with_league_avgs.iloc[i, 14]
    stats_with_league_avgs.iloc[i, 3] = at_bats
    
for i in range(3, numrows, 4):
    for j in range(2, 16):
        
        actual_stat = stats_with_league_avgs.iloc[i - 3, j]
        projected_ros_stat = stats_with_league_avgs.iloc[i - 1, j]
        
        stats_with_league_avgs.iloc[i, j] = actual_stat + projected_ros_stat

full_season_projections_2020 = stats_with_league_avgs.iloc[3::4,:].reset_index(drop=True).round(0)


Unnamed: 0,Name,year,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
0,Zack Collins,2020,44.0,39.0,4.0,3.0,2.0,0.0,1.0,3.0,4.0,0.0,11.0,0.0,0.0,0.0
1,Zach McKinstry,2020,17.0,16.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,5.0,0.0,0.0,0.0
2,Yuli Gurriel,2020,564.0,512.0,68.0,75.0,28.0,2.0,16.0,58.0,36.0,1.0,88.0,6.0,9.0,3.0
3,Yu Chang,2020,32.0,28.0,3.0,5.0,1.0,0.0,1.0,3.0,4.0,0.0,8.0,0.0,0.0,0.0
4,Yoshi Tsutsugo,2020,453.0,392.0,63.0,49.0,15.0,2.0,18.0,57.0,54.0,2.0,117.0,4.0,3.0,2.0


In [105]:
## Moving on to 2021 projections
## Only projecting players who made an appearance in 2020
## ***TODO: Will have to figure out in the future how to remove pitcher's batting stats and vice versa
## as well as dealing with players who opted out of the 2020 season

players_2020 = list(full_season_projections_2020.Name)
data2019 = data2019[data2019['Name'].isin(players_2020)]
data2018 = data2018[data2018['Name'].isin(players_2020)]


In [187]:
## Combine our full season 2020 projections with batter stats from 2018-19

proj_df = pd.concat([full_season_projections_2020.reset_index(drop=True), data2019[cols3].reset_index(drop=True), data2018[cols3].reset_index(drop=True)], axis=0)
proj_df = proj_df.sort_values(by=['Name', 'year'], ascending=True).reset_index(drop=True)


Unnamed: 0,Name,year,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
0,AJ Pollock,2018,460.0,413.0,61.0,59.0,21.0,5.0,21.0,65.0,29.0,2.0,100.0,8.0,7.0,13.0
1,AJ Pollock,2019,342.0,308.0,49.0,51.0,15.0,1.0,15.0,47.0,22.0,1.0,74.0,7.0,4.0,5.0
2,AJ Pollock,2020,514.0,472.0,70.0,71.0,22.0,1.0,32.0,76.0,33.0,2.0,114.0,2.0,4.0,6.0
3,Aaron Hicks,2018,581.0,480.0,90.0,71.0,18.0,3.0,27.0,79.0,89.0,1.0,111.0,3.0,6.0,11.0
4,Aaron Hicks,2019,255.0,221.0,41.0,30.0,10.0,0.0,12.0,36.0,31.0,0.0,72.0,0.0,3.0,1.0
5,Aaron Hicks,2020,517.0,430.0,67.0,56.0,24.0,4.0,16.0,55.0,80.0,2.0,103.0,4.0,1.0,9.0
6,Aaron Judge,2018,498.0,413.0,77.0,66.0,22.0,0.0,27.0,67.0,73.0,3.0,152.0,4.0,5.0,6.0
7,Aaron Judge,2019,447.0,378.0,75.0,57.0,18.0,1.0,27.0,55.0,60.0,4.0,141.0,3.0,1.0,3.0
8,Aaron Judge,2020,279.0,248.0,47.0,36.0,9.0,0.0,17.0,45.0,25.0,0.0,73.0,4.0,1.0,2.0
9,Aaron Whitefield,2020,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [188]:
## Could repurpose the index list for past seasons as well

league_avgs_per_pa.index = ['2018', '2019', '2020']

## Table with weights for past season data
## Weighting scheme is 5/4/3 for counting stats and 5/1/0 for playing time
## Methodology follows the Marcels projection system developed by Tom Tango and explained here
## http://tangotiger.net/marcel/
## and here
## http://www.tangotiger.net/archives/stud0346.shtml

weight_df = pd.DataFrame({'PA': [3.0, 4, 5], 'AB': [3.0, 4, 5], 'R': [3.0, 4, 5], '1B': [3.0, 4, 5], '2B': [3.0, 4, 5], '3B': [3.0, 4, 5],
                     'HR': [3.0, 4, 5], 'RBI': [3.0, 4, 5], 'UBB': [3.0, 4, 5], 'IBB': [3.0, 4, 5], 'SO': [3.0, 4, 5], 'HBP': [3.0, 4, 5],
                    'SF': [3.0, 4, 5], 'SB': [3.0, 4, 5]}, index=['2018', '2019', '2020'])

playing_time_weight_df = pd.DataFrame({'PA': [0.0, 0.1, 0.55]}, index=['2018', '2019', '2020'])



In [193]:
all_years_list = list(weight_df.index)
all_weighted_seasons = []
all_league_weighted_seasons = []
all_playing_time_weighted_seasons = []

## Could make this a function

for name in players_2020:
    
    ## This section applies weighting scheme to prior 3 seasons of data for each player
    
    df = proj_df[proj_df['Name'] == name]
    
    player_pa = df['PA']
    
    partial_df = df.iloc[:, 1:]
    partial_df = partial_df.set_index(partial_df.columns[0])
    
    years_of_data = partial_df.shape[0]
    years_list = all_years_list[(3 - years_of_data):]
    
    adj_weight_df = weight_df.iloc[weight_df.index.isin(years_list)]
    
    weighted_df = pd.DataFrame(partial_df.values*adj_weight_df.values, columns=partial_df.columns, index=partial_df.index)
    
    partial_df = df.iloc[:, :2]
    weighted_season = pd.concat([partial_df.reset_index(drop=True), weighted_df.reset_index(drop=True)], axis=1)
    weighted_season['weights'] = 'player'
    weighted_season = weighted_season.drop(['year'], axis=1)
    
    all_weighted_seasons.append(weighted_season)
    
    ## This section finds expected rates for each player based on yearly league average rates
    
    player_pa_df = pd.concat([player_pa] * league_avgs_per_pa.shape[1], axis=1)
    
    adj_league_avgs_per_pa = league_avgs_per_pa.iloc[league_avgs_per_pa.index.isin(years_list)]

    expected_stats_by_player_pa = pd.DataFrame(adj_league_avgs_per_pa.values*player_pa_df.values, columns=adj_league_avgs_per_pa.columns, index=adj_league_avgs_per_pa.index)
    expected_stats_by_player_pa_weighted_df = pd.DataFrame(expected_stats_by_player_pa.values*adj_weight_df.values, columns=expected_stats_by_player_pa.columns, index=expected_stats_by_player_pa.index)
    
    expected_stats_by_player = pd.concat([partial_df.reset_index(drop=True), expected_stats_by_player_pa_weighted_df.reset_index(drop=True)], axis=1)
    
    expected_stats_by_player['weights'] = 'league'
    expected_stats_by_player = expected_stats_by_player.drop(['year'], axis=1)
    
    all_league_weighted_seasons.append(expected_stats_by_player)
    
    ## Projecting Playing Time Section
    
    partial_df = df.iloc[:, 1:3]
    partial_df = partial_df.set_index(partial_df.columns[0])

    years_of_data = partial_df.shape[0]
    years_list = all_years_list[(3 - years_of_data):]

    adj_pt_weight_df = playing_time_weight_df.iloc[playing_time_weight_df.index.isin(years_list)]
    
    pt_weighted_df = pd.DataFrame(partial_df.values*adj_pt_weight_df.values, columns=partial_df.columns, index=partial_df.index)
    
    partial_df = df.iloc[:, :2]
    all_player_years_pa = pd.concat([partial_df.reset_index(drop=True), pt_weighted_df.reset_index(drop=True)], axis=1)

    all_player_years_pa = all_player_years_pa.drop(['year'], axis=1)
    
    all_playing_time_weighted_seasons.append(all_player_years_pa)
    

In [194]:
all_player_weighted_seasons = pd.concat(all_weighted_seasons, axis=0)
all_league_weighted_seasons = pd.concat(all_league_weighted_seasons, axis=0)
all_pt_weighted_seasons = pd.concat(all_playing_time_weighted_seasons, axis=0)

## Sum expected stats across all seasons by player
## Prorate expected stats to 1200 plate appearances
## This will be the regression component used in the player projections

sum_league_weighted_seasons = all_league_weighted_seasons.groupby(['Name', 'weights']).sum()
sum_league_weighted_seasons = sum_league_weighted_seasons.div(sum_league_weighted_seasons.PA, axis=0) * 1200
sum_league_weighted_seasons = sum_league_weighted_seasons.reset_index()


Unnamed: 0,Name,weights,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
0,AJ Pollock,league,1200.0,1068.78776,147.976778,166.312228,52.634553,4.82527,40.698478,141.414622,101.240354,4.570159,276.087533,13.675207,7.485834,15.651596
1,Aaron Hicks,league,1200.0,1068.791977,147.260478,166.555831,52.54635,4.854391,40.207087,140.683083,101.123565,4.648209,275.528471,13.655013,7.526068,15.742484
2,Aaron Judge,league,1200.0,1070.103117,147.40927,167.136247,53.285968,4.981481,40.629451,140.869617,99.495871,4.862661,274.525714,13.277732,7.549044,15.493103
3,Aaron Whitefield,league,1200.0,1065.106908,150.554837,163.672451,50.936758,4.34848,41.572189,143.950922,106.276125,3.644784,281.225754,14.813701,7.253481,15.968484
4,Abraham Almonte,league,1200.0,1070.754686,144.506294,168.428661,53.277318,5.163884,38.792864,137.921605,98.223691,5.290526,271.718904,13.011415,7.727257,15.749978


In [195]:
## Combine expected stats/1200 PA (regression component) table with weighted 3 year stats table

all_forecast_data = pd.concat([all_player_weighted_seasons, sum_league_weighted_seasons], axis=0)
all_forecast_data = all_forecast_data.sort_values(by=['Name', 'weights']).groupby(['Name', 'weights']).sum().round().reset_index()


Unnamed: 0,Name,weights,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
0,AJ Pollock,league,1200.0,1069.0,148.0,166.0,53.0,5.0,41.0,141.0,101.0,5.0,276.0,14.0,7.0,16.0
1,AJ Pollock,player,5318.0,4831.0,729.0,736.0,233.0,24.0,283.0,763.0,340.0,20.0,1166.0,62.0,57.0,89.0
2,Aaron Hicks,league,1200.0,1069.0,147.0,167.0,53.0,5.0,40.0,141.0,101.0,5.0,276.0,14.0,8.0,16.0
3,Aaron Hicks,player,5348.0,4474.0,769.0,613.0,214.0,29.0,209.0,656.0,791.0,13.0,1136.0,29.0,35.0,82.0
4,Aaron Judge,league,1200.0,1070.0,147.0,167.0,53.0,5.0,41.0,141.0,99.0,5.0,275.0,13.0,8.0,15.0


In [196]:
## Combine stats for each player

sum_all_forecast_data = all_forecast_data.groupby(['Name']).sum()


Unnamed: 0_level_0,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AJ Pollock,6518.0,5900.0,877.0,902.0,286.0,29.0,324.0,904.0,441.0,25.0,1442.0,76.0,64.0,105.0
Aaron Hicks,6548.0,5543.0,916.0,780.0,267.0,34.0,249.0,797.0,892.0,18.0,1412.0,43.0,43.0,98.0
Aaron Judge,5877.0,5061.0,913.0,773.0,236.0,9.0,315.0,787.0,683.0,30.0,1660.0,57.0,32.0,55.0
Aaron Whitefield,1210.0,1075.0,151.0,164.0,51.0,4.0,42.0,144.0,106.0,4.0,286.0,15.0,7.0,16.0
Abraham Almonte,1965.0,1737.0,244.0,258.0,73.0,15.0,57.0,191.0,191.0,5.0,452.0,13.0,11.0,27.0


In [197]:
## Convert to projected per plate appearance rates

projected_rates_all_players_no_age = sum_all_forecast_data.div(sum_all_forecast_data.PA, axis=0)


Unnamed: 0_level_0,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AJ Pollock,1.0,0.905186,0.13455,0.138386,0.043878,0.004449,0.049708,0.138693,0.067659,0.003836,0.221234,0.01166,0.009819,0.016109
Aaron Hicks,1.0,0.846518,0.13989,0.11912,0.040776,0.005192,0.038027,0.121717,0.136225,0.002749,0.215638,0.006567,0.006567,0.014966
Aaron Judge,1.0,0.861154,0.155351,0.13153,0.040157,0.001531,0.053599,0.133912,0.116216,0.005105,0.282457,0.009699,0.005445,0.009359
Aaron Whitefield,1.0,0.88843,0.124793,0.135537,0.042149,0.003306,0.034711,0.119008,0.087603,0.003306,0.236364,0.012397,0.005785,0.013223
Abraham Almonte,1.0,0.883969,0.124173,0.131298,0.03715,0.007634,0.029008,0.097201,0.097201,0.002545,0.230025,0.006616,0.005598,0.01374


In [204]:
## Now apply an age adjustment to the projected rates

counting_stat_rates = projected_rates_all_players_no_age.iloc[:, 2:]

# Data frame of player info from most recent season in data set

player_age_df = player_info_2020.iloc[:,:2]
player_age_df['age_factor'] = player_age_df.apply(lambda row: (29 - (row['Age'] + 1)) * 0.006 if (29 - (row['Age'] + 1)) > 0 else (29 - (row['Age'] + 1)) * 0.003, axis=1)
player_age_df = player_age_df.set_index(partial_df.columns[0]).drop(['Age'], axis=1)

player_age_mult_df = pd.concat([player_age_df] * counting_stat_rates.shape[1], axis=1)
player_age_mult_df = player_age_mult_df + 1

age_weighted_rates = pd.DataFrame(non_playing_time_rates.values*player_age_mult_df.values, columns=non_playing_time_rates.columns, index=non_playing_time_rates.index)
age_weighted_rates.head()

Unnamed: 0_level_0,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AJ Pollock,0.132532,0.13631,0.04322,0.004382,0.048963,0.136612,0.066644,0.003778,0.217915,0.011485,0.009672,0.015868
Aaron Hicks,0.144926,0.123409,0.042244,0.005379,0.039396,0.126098,0.141129,0.002848,0.223401,0.006803,0.006803,0.015505
Aaron Judge,0.15908,0.134686,0.04112,0.001568,0.054885,0.137126,0.119005,0.005227,0.289236,0.009932,0.005576,0.009583
Aaron Whitefield,0.124419,0.135131,0.042022,0.003296,0.034607,0.118651,0.08734,0.003296,0.235655,0.01236,0.005768,0.013183
Abraham Almonte,0.123055,0.130116,0.036816,0.007565,0.028747,0.096326,0.096326,0.002522,0.227955,0.006556,0.005548,0.013617


In [211]:
## Create table of 2021 playing time projections

playing_time_projection = all_pt_weighted_seasons.groupby(['Name']).sum() + 200
playing_time_projection.head()


Unnamed: 0_level_0,PA
Name,Unnamed: 1_level_1
AJ Pollock,516.9
Aaron Hicks,509.85
Aaron Judge,398.15
Aaron Whitefield,201.1
Abraham Almonte,221.4


In [210]:
## Multiply projected per PA rates by projected season PA

full_season_projections_with_playing_time = age_weighted_rates.mul(playing_time_projection.PA, axis=0).round()


Unnamed: 0_level_0,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AJ Pollock,69.0,70.0,22.0,2.0,25.0,71.0,34.0,2.0,113.0,6.0,5.0,8.0
Aaron Hicks,74.0,63.0,22.0,3.0,20.0,64.0,72.0,1.0,114.0,3.0,3.0,8.0
Aaron Judge,63.0,54.0,16.0,1.0,22.0,55.0,47.0,2.0,115.0,4.0,2.0,4.0
Aaron Whitefield,25.0,27.0,8.0,1.0,7.0,24.0,18.0,1.0,47.0,2.0,1.0,3.0
Abraham Almonte,27.0,29.0,8.0,2.0,6.0,21.0,21.0,1.0,50.0,1.0,1.0,3.0


In [233]:
## Merge full season projections with playing time projections and add at bats column

season_projections = pd.concat([playing_time_projection.round(), full_season_projections_with_playing_time], axis=1)

season_projections['AB'] = season_projections.apply(lambda row: row['PA'] - row['UBB'] - row['IBB'] - row['HBP'] - row['SF'], axis=1)


In [238]:
## Add total projected DraftKings points and then per plate appearance projection

def draft_king_batters(single, double, triple, HR, RBI, R, BB, HBP, SB):
    
    return (3 * single) + (5 * double) + (8 * triple) + (10 * HR) + (2 * RBI) + (2 * R) + (2 * BB) + (2 * HBP) + (5 * SB)



season_projections['DKp'] = season_projections.apply(lambda batter: draft_king_batters(batter['1B'],
                                                                                batter['2B'],
                                                                                batter['3B'],
                                                                                batter['HR'],
                                                                                batter['RBI'],
                                                                                batter['R'],
                                                                                (batter['UBB'] + batter['IBB']),
                                                                                batter['HBP'],
                                                                                batter['SB']),axis=1)

## Re arrange columns
season_projections = season_projections[['PA', 'AB', 'R', '1B', '2B', '3B', 'HR', 'RBI', 'UBB', 'IBB', 'SO', 'HBP', 'SF', 'SB', 'DKp']]
season_projections.head()


Unnamed: 0_level_0,PA,AB,R,1B,2B,3B,HR,RBI,UBB,IBB,SO,HBP,SF,SB,DKp
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AJ Pollock,517.0,470.0,69.0,70.0,22.0,2.0,25.0,71.0,34.0,2.0,113.0,6.0,5.0,8.0,990.0
Aaron Hicks,510.0,431.0,74.0,63.0,22.0,3.0,20.0,64.0,72.0,1.0,114.0,3.0,3.0,8.0,991.0
Aaron Judge,398.0,343.0,63.0,54.0,16.0,1.0,22.0,55.0,47.0,2.0,115.0,4.0,2.0,4.0,832.0
Aaron Whitefield,201.0,179.0,25.0,27.0,8.0,1.0,7.0,24.0,18.0,1.0,47.0,2.0,1.0,3.0,354.0
Abraham Almonte,221.0,197.0,27.0,29.0,8.0,2.0,6.0,21.0,21.0,1.0,50.0,1.0,1.0,3.0,360.0


In [224]:
## Convert Draft Kings point projection to a per plate appearance projection

DK_points_per_pa_proj = season_projections.div(season_projections.PA, axis=0)[['DKp']].reset_index().round(3)


Unnamed: 0,Name,DKp
0,AJ Pollock,1.915
1,Aaron Hicks,1.943
2,Aaron Judge,2.09
3,Aaron Whitefield,1.761
4,Abraham Almonte,1.629


In [239]:
### ***TODOs:
## Make this a function that can be used for backtesting previous seasons rather than just 2021
## Projections for pitchers
