In [159]:
import pandas as pd
import os
import numpy as np
import math
from scipy.stats import pearsonr
import requests
from datetime import date
from datetime import datetime
from statistics import mean
from pulp import *
from collections import Counter
import warnings
import requests, zipfile
from io import BytesIO
warnings.filterwarnings('ignore')

from projection_file_functions import *

In [160]:
# Request date formatted with "%Y-%b-%d" (year, abbreviated month name, day).
# Functions below slice the first four characters of this string to get the
# season year and also append the whole string to a download-folder path.
today = date.today()
d = today.strftime("%Y-%b-%d")

In [242]:
def get_pitcher_salaries(date, api_key=None):
    """Fetch DFS slate salaries for pitchers on ``date``.

    Parameters
    ----------
    date : str
        Date string interpolated into the ``DfsSlatesByDate`` URL.
    api_key : str, optional
        sportsdata.io subscription key. When omitted, the
        ``SPORTSDATA_API_KEY`` environment variable is used.

    Returns
    -------
    (pandas.DataFrame, list)
        One row per slate player whose ``OperatorPosition`` is SP, RP or P,
        left-joined with that slate's metadata on ``SlateID``; and the list of
        unique ``PlayerID`` values in that frame.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the API returns no slates for ``date``.
    """
    # FIXME(security): the literal fallback below is the key that was committed
    # in this notebook. It is kept only so existing runs do not break; rotate it
    # and delete the fallback once SPORTSDATA_API_KEY is configured.
    key = api_key or os.environ.get('SPORTSDATA_API_KEY') or '6fcab751d8594ce9909283dcdc522d24'

    response = requests.get(
        f'https://api.sportsdata.io/api/mlb/fantasy/json/DfsSlatesByDate/{date}',
        headers={'Ocp-Apim-Subscription-Key': key},
        timeout=30,
    )
    # Fail loudly on 401/403/5xx instead of trying to normalize an error body.
    response.raise_for_status()
    games = response.json()
    if not games:
        raise ValueError(f'No DFS slates returned for {date}')

    df_slates = pd.json_normalize(games)
    df_slates = df_slates[['SlateID', 'Operator', 'OperatorSlateID', 'OperatorName', 'NumberOfGames', 'OperatorGameType', 'SalaryCap']]

    df_player_sal = pd.json_normalize(games, record_path=['DfsSlatePlayers'])
    is_pitcher = df_player_sal['OperatorPosition'].isin(['SP', 'RP', 'P'])
    df_player_sal = df_player_sal[is_pitcher].reset_index(drop=True)
    df_player_sal = df_player_sal.merge(df_slates, how='left', on='SlateID')

    players = list(df_player_sal.PlayerID.unique())

    return df_player_sal, players

In [243]:
# Pitcher salary rows for the request date, plus the unique PlayerIDs among them.
sals, players = get_pitcher_salaries(d)

In [244]:
def get_current_season_game_logs_pitchers(date):
    """Load the current season's regular-season game logs for pitchers.

    Parameters
    ----------
    date : str
        Request date; its first four characters are the season year and the
        whole string is appended to the download-folder name.

    Returns
    -------
    pandas.DataFrame
        Rows with ``SeasonType == 1`` and ``PositionCategory == 'P'``, sorted by
        ``PlayerID`` then ``Day``, with the pitching columns renamed to
        ``W``/``ER``/``BB``/``SO``/``HR``/``H`` and an added ``H-HR`` column.

    Side effects
    ------------
    Changes the process working directory to the download folder and leaves it
    there. ``get_league_stats`` in this file reads ``PlayerGame.{season}.csv``
    by relative path, so the directory change is kept as-is.
    """
    season = date[:4]
    os.chdir(os.getcwd() + '/daily-downloads/Fantasy.2019-2022' + date)

    game_stats = pd.read_csv(f'PlayerGame.{season}.csv')

    # Regular season only, pitchers only.
    keep = (game_stats['SeasonType'] == 1) & (game_stats['PositionCategory'] == 'P')

    # NOTE: a "games strictly before the request date" filter existed here as
    # commented-out code for backtesting; it is still not applied.
    data = (
        game_stats.loc[keep]
        .sort_values(['PlayerID', 'Day'], ascending=True)
        .reset_index(drop=True)
        .rename(columns={
            'Wins': 'W',
            'PitchingEarnedRuns': 'ER',
            'PitchingWalks': 'BB',
            'PitchingStrikeouts': 'SO',
            'PitchingHomeRuns': 'HR',
            'PitchingHits': 'H',
        })
    )

    # Hits that were not home runs; column arithmetic instead of a per-row lambda.
    data['H-HR'] = data['H'] - data['HR']

    return data

In [245]:
# Current-season pitcher game logs; this call also changes the working directory.
game_logs = get_current_season_game_logs_pitchers(d)

In [165]:
def get_league_stats(date):
    """Return regular-season game rows for every non-pitcher in the season file.

    ``PlayerGame.{season}.csv`` is read from the current working directory,
    where ``season`` is the first four characters of ``date``.
    """
    season = date[:4]

    # Only the request date's season file is read.
    all_games = pd.read_csv(f'PlayerGame.{season}.csv')

    # Regular season only (SeasonType == 1).
    regular_season = all_games[all_games['SeasonType'] == 1].reset_index(drop=True)

    # Everyone whose position category is not pitcher, i.e. batters.
    batters = regular_season[regular_season['PositionCategory'] != 'P'].reset_index(drop=True)

    # A "games before the request date" filter was sketched here for
    # backtesting but is not applied.
    return batters

In [166]:
# Regular-season rows for non-pitchers, read from the current working directory.
league_stats = get_league_stats(d)

In [167]:
# League-wide totals used later for the HBP rate and the FIP constant.
# Event counts are summed over the batter rows in league_stats.
league_hbp = league_stats['HitByPitch'].sum()
league_hr = league_stats['HomeRuns'].sum()
league_bb = league_stats['Walks'].sum()
league_so = league_stats['Strikeouts'].sum()

# Innings and earned runs are summed over the pitcher game logs.
league_innings = game_logs['InningsPitchedDecimal'].sum()
league_ER = game_logs['ER'].sum()

In [177]:
def get_marcels_pitchers(date):
    """Load the Marcel baseline pitcher projections for the request season.

    Parameters
    ----------
    date : str
        Request date; its first four characters select
        ``marcel_pitchers_{season}.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with an added ``TotalOutsPitched`` column (``IP * 3``)
        and the ``rel`` column renamed to ``Reliability``.

    Side effects
    ------------
    Moves the working directory two levels up and then into
    ``BaselineProjections``, and leaves it there.
    """
    season = date[:4]

    os.chdir('../..')
    os.chdir(os.getcwd() + '/BaselineProjections')
    marcels = pd.read_csv(f'marcel_pitchers_{season}.csv')

    # Column arithmetic instead of a per-row lambda.
    marcels['TotalOutsPitched'] = marcels['IP'] * 3
    marcels = marcels.rename(columns={'rel': 'Reliability'})

    return marcels

In [178]:
# Marcel baseline projections for pitchers, with TotalOutsPitched and Reliability columns.
marcels = get_marcels_pitchers(d)

In [180]:
# Attach each pitcher's Marcel reliability to the game logs. The join key is the
# player name, so Marcel rows are de-duplicated by Name first: a repeated name
# would otherwise duplicate that pitcher's game-log rows and inflate the sums.
df = marcels[['Name', 'Reliability']].drop_duplicates(subset='Name', keep='first')

# Drop any Reliability column left by a previous run of this cell so the merge
# does not produce Reliability_x / Reliability_y on re-execution.
game_logs = game_logs.drop(columns=['Reliability'], errors='ignore').merge(df, how='left', on='Name')

# Keep only pitchers that appear on the request date's slates.
game_logs = game_logs.loc[game_logs['PlayerID'].isin(players)]

# Season-to-date totals per pitcher.
sum_data = game_logs[['PlayerID', 'Started', 'Games', 'W', 'TotalOutsPitched', 'ER', 'BB', 'SO', 'H', 'HR', 'H-HR']].reset_index(drop=True).groupby(['PlayerID']).sum()

# PlayerID -> reliability, using 0 when the pitcher has no Marcel row.
# The last game-log row per player wins, as it did in the row-by-row loop.
reliability_dict = (
    game_logs.drop_duplicates(subset='PlayerID', keep='last')
    .set_index('PlayerID')['Reliability']
    .fillna(0)
    .to_dict()
)

In [181]:
def create_stabilization_dict_pitchers(sum_data, reliability_dict):
    """Build per-pitcher season totals plus stabilization weights.

    Parameters
    ----------
    sum_data : pandas.DataFrame
        Season totals indexed by PlayerID with at least the columns
        ``TotalOutsPitched``, ``ER``, ``BB``, ``SO``, ``H``, ``HR``, ``H-HR``.
    reliability_dict : dict
        PlayerID -> reliability. Players missing from the dict are treated as
        reliability 0.

    Returns
    -------
    dict
        ``{player_id: {...}}`` holding the seven totals above and four weights
        ``SO_s``, ``BB_s``, ``H-HR_s`` and ``HR_s``. Each weight has the form
        ``n / (n + k * rel_fact)``, where ``n`` is an opportunity count derived
        from the totals, ``k`` is a per-stat constant and
        ``rel_fact = 2.2 ** reliability / 2``. Empty ``sum_data`` yields ``{}``.
    """
    player_dict = {}

    # No games logged yet (e.g. first day of the season): nothing to stabilize.
    if sum_data.shape[0] == 0:
        return player_dict

    raw_stats = ['TotalOutsPitched', 'ER', 'BB', 'SO', 'H', 'HR', 'H-HR']

    for player in list(sum_data.index.values):
        totals = {stat: sum_data.loc[player, stat] for stat in raw_stats}

        outs = totals['TotalOutsPitched']
        walks = totals['BB']
        strikeouts = totals['SO']
        hits = totals['H']
        home_runs = totals['HR']

        # Batters faced is estimated as outs + hits + walks.
        pa_est = outs + hits + walks

        rel = reliability_dict.get(player, 0)
        rel_fact = (2.2 ** rel) / 2

        # Opportunities for HR exclude walks and strikeouts; opportunities for
        # non-HR hits additionally exclude the home runs themselves.
        hr_chances = pa_est - walks - strikeouts
        hit_chances = pa_est - walks - home_runs - strikeouts

        totals['SO_s'] = pa_est / (pa_est + (126 * rel_fact))
        totals['BB_s'] = pa_est / (pa_est + (303 * rel_fact))
        totals['H-HR_s'] = hit_chances / (hit_chances + (3729 * rel_fact))
        totals['HR_s'] = hr_chances / (hr_chances + (1271 * rel_fact))

        player_dict[player] = totals

    return player_dict

In [182]:
# Season totals and stabilization weights for every pitcher in sum_data.
player_stabilization_dict = create_stabilization_dict_pitchers(sum_data, reliability_dict)

In [183]:
def create_per_pa_marcels_rates_pitchers(game_logs_file, marcels_file):
    """Convert Marcel projections into per-out rates keyed by PlayerID.

    Parameters
    ----------
    game_logs_file : pandas.DataFrame
        Game logs with ``PlayerID`` and ``Name`` columns; used only to map
        Marcel names to PlayerIDs.
    marcels_file : pandas.DataFrame
        Marcel projections with a ``Name`` column. Every column from position 8
        onward is divided by ``TotalOutsPitched``, so those columns must be
        numeric and include ``H``, ``HR`` and ``TotalOutsPitched``.

    Returns
    -------
    (dict, list)
        ``{player_id: {stat: rate_per_out, ...}}`` including a derived ``H-HR``
        rate, and the list of PlayerIDs that have a Marcel projection.

    Notes
    -----
    Despite the function name, rates are per out, not per plate appearance.
    Names are the join key, so when several Marcel rows resolve to the same
    PlayerID only the first is kept; ``to_dict('index')`` raises on a
    non-unique index.
    """
    data_ID = game_logs_file[['PlayerID', 'Name']].drop_duplicates()
    marcels = marcels_file.merge(data_ID, how='left', on='Name')

    # Drop Marcel rows with no matching player in the game logs.
    marcels = marcels[marcels['PlayerID'].notna()].set_index('PlayerID')
    marcels = marcels[~marcels.index.duplicated(keep='first')]

    # The first eight columns are dropped by position; the remaining columns
    # are the ones converted to rates.
    rel_columns = marcels.columns.to_list()[8:]
    marcels = marcels[rel_columns]
    marcels = marcels.div(marcels.TotalOutsPitched, axis=0)
    marcels['H-HR'] = marcels['H'] - marcels['HR']

    marcel_players = marcels.index.to_list()
    marcels_dict = marcels.to_dict('index')

    return marcels_dict, marcel_players

In [184]:
# Marcel rates per out keyed by PlayerID, and the PlayerIDs that have a Marcel projection.
marcels_dict, marcel_players = create_per_pa_marcels_rates_pitchers(game_logs, marcels)

In [185]:
# EffectivePosition is the part of OperatorPosition before the first '/'
# (the whole value when there is no '/').
new = sals['OperatorPosition'].str.split('/', n = 1, expand = True)
sals['EffectivePosition'] = new[0]

In [186]:
def get_average_stats_by_position(date, game_logs, pos_group):
    """Blend current-season totals with prior-season rates, by position group.

    For ``pos_group == 'hitters'`` the result is indexed by fielding position
    and holds per-plate-appearance rates; for ``'pitchers'`` it is indexed by
    the ``Started`` flag (0/1) and holds per-out rates. Any other ``pos_group``
    returns ``None``.

    Prior-season data comes from ``get_prior_season_stats(date)``, which is not
    defined in this part of the file.
    """

    if pos_group == 'hitters':

        ## Average all stats by position

        # Current-season counting stats summed per fielding position.
        average_stats_by_position = game_logs[['Position', 'PA', 'S', 'D', 'T', 'HR', 'BB', 'HP', 'SB', 'CS']].reset_index(drop=True).groupby(['Position']).sum()

        ## DH and PH stats get lumped into 1B position
        average_stats_by_position.loc['1B'] = average_stats_by_position.loc[['1B', 'DH', 'PH']].sum()

        ## DFS sites only use the OF position, so LF, RF, and CF get lumped in together
        average_stats_by_position.loc['OF'] = average_stats_by_position.loc[['LF', 'RF', 'CF']].sum()
        average_stats_by_position = average_stats_by_position.drop(['DH', 'PH', 'PR', 'LF', 'RF', 'CF'])

        # Prior season: regular season only, non-pitchers only.
        prior_season_stats = get_prior_season_stats(date)
        prior_season_stats = prior_season_stats.loc[prior_season_stats.SeasonType == 1].reset_index(drop=True)
        prior_season_stats = prior_season_stats.loc[prior_season_stats.PositionCategory != 'P'].reset_index(drop=True)
        # Plate appearances = at-bats + walks + hit-by-pitch + sacrifices.
        prior_season_stats['PlateAppearances'] = prior_season_stats.apply(lambda row: row['AtBats'] + row['Walks'] + row['HitByPitch'] + row['Sacrifices'], axis=1)
        prior_season_stats.rename(columns = {'Runs': 'R', 'Singles': 'S', 'Doubles': 'D', 'Triples': 'T', 'HomeRuns': 'HR', 'AtBats': 'AB', 'Walks':'BB', 'RunsBattedIn': 'RBI', 'PlateAppearances': 'PA', 'Hits': 'H', 'HitByPitch': 'HP', 'StolenBases': 'SB', 'CaughtStealing': 'CS', 'Strikeouts': 'SO'},  
                inplace = True)
        
        # Prior-season totals per position; only DH (not PH) is folded into 1B here.
        prior_season_league_stats = prior_season_stats[['Position', 'PA', 'S', 'D', 'T', 'HR', 'BB', 'HP', 'SB', 'CS']].reset_index(drop=True).groupby(['Position']).sum()
        prior_season_league_stats.loc['1B'] = prior_season_league_stats.loc[['1B', 'DH']].sum()
        prior_season_league_stats.loc['OF'] = prior_season_league_stats.loc[['LF', 'RF', 'CF']].sum()
        prior_season_league_stats = prior_season_league_stats.drop(['DH', 'LF', 'RF', 'CF'])
        # Convert prior-season totals to per-PA rates (the PA column becomes 1).
        prior_season_league_stats = prior_season_league_stats.div(prior_season_league_stats.PA, axis=0)

        positions = ['1B', '2B', '3B', 'SS', 'C', 'OF']

        for pos in positions:
            # Prior-season rates are weighted as 2000 plate appearances and
            # added to the current-season totals, then turned back into rates.
            total = prior_season_league_stats.loc[pos] * 2000 + average_stats_by_position.loc[pos]
            new_row = total.divide(total.PA)
            average_stats_by_position.loc[pos] = new_row

        return average_stats_by_position

    elif pos_group == 'pitchers':

        # Current-season totals grouped by the per-game Started flag.
        average_stats_by_position = game_logs[['Started', 'W', 'TotalOutsPitched', 'ER', 'BB', 'SO', 'HR', 'H', 'H-HR']].reset_index(drop=True).groupby(['Started']).sum()

        # Prior season: regular season only, pitchers only.
        prior_season_stats = get_prior_season_stats(date)
        prior_season_stats = prior_season_stats.loc[prior_season_stats.SeasonType == 1].reset_index(drop=True)
        prior_season_stats = prior_season_stats.loc[prior_season_stats.PositionCategory == 'P'].reset_index(drop=True)
        # 1 when the row has more starts than non-start appearances, else 0.
        prior_season_stats['isPrimaryStarter'] = prior_season_stats.apply(lambda row: 1 if row['Started'] > (row['Games'] - row['Started']) else 0, axis=1)
        prior_season_stats.rename(columns = {'Wins': 'W', 'PitchingEarnedRuns': 'ER', 'PitchingWalks': 'BB', 'PitchingStrikeouts': 'SO', 'PitchingHomeRuns': 'HR', 'PitchingHits': 'H'}, inplace = True)
        prior_season_stats['H-HR'] = prior_season_stats.apply(lambda row: row['H'] - row['HR'], axis=1)

        # Prior-season totals per role, converted to per-out rates.
        prior_season_league_stats = prior_season_stats[['isPrimaryStarter', 'W', 'TotalOutsPitched', 'ER', 'BB', 'SO', 'HR', 'H', 'H-HR']].reset_index(drop=True).groupby(['isPrimaryStarter']).sum()
        prior_season_league_stats = prior_season_league_stats.div(prior_season_league_stats.TotalOutsPitched, axis=0)

        positions = [0, 1]

        for pos in positions:
            # Prior-season rates are weighted as 15000 outs and added to the
            # current-season totals, then turned back into per-out rates.
            total = prior_season_league_stats.loc[pos] * 15000 + average_stats_by_position.loc[pos]
            new_row = total.divide(total.TotalOutsPitched)
            average_stats_by_position.loc[pos] = new_row

        return average_stats_by_position

    else:

        # Unknown position group: implicit None.
        return

In [188]:
# Per-out baseline rates by role (index 0 = not started, 1 = started).
# NOTE(review): game_logs was narrowed to the slate's players in the reliability-merge
# cell above, so these "position averages" cover only those pitchers — confirm intended.
average_stats_by_position = get_average_stats_by_position(d, game_logs, 'pitchers')

In [253]:
# Show the blended per-out rates for each role.
average_stats_by_position

Unnamed: 0_level_0,W,TotalOutsPitched,ER,BB,SO,HR,H,H-HR
Started,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.019336,1.0,0.149535,0.139484,0.339646,0.039978,0.292766,0.252788
1,0.018146,1.0,0.157473,0.113813,0.316293,0.045696,0.313871,0.268175


In [189]:
def _blend_pitcher_rates(season_totals, baseline_rates):
    """Blend one pitcher's season totals with baseline per-out rates.

    Each stat is ``(actual * w + expected * (1 - w)) / outs``, where ``w`` is the
    stat's stabilization weight and ``expected = baseline_rate * outs``. With
    zero outs pitched the baseline rates are returned unchanged.
    """
    stat_and_weight = (('SO', 'SO_s'), ('BB', 'BB_s'), ('HR', 'HR_s'), ('H-HR', 'H-HR_s'))

    outs = season_totals['TotalOutsPitched']
    if outs == 0:
        return {stat: baseline_rates[stat] for stat, _ in stat_and_weight}

    blended = {}
    for stat, weight_key in stat_and_weight:
        weight = season_totals[weight_key]
        expected = baseline_rates[stat] * outs
        actual = season_totals[stat]
        blended[stat] = ((actual * weight) + (expected * (1 - weight))) / outs
    return blended


def create_blended_projections_pitchers(players, marcel_players, player_dict, marcels_dict, sum_data, average_stats_by_position):
    """Create per-out rate projections for every pitcher on the slate.

    Parameters
    ----------
    players : iterable
        PlayerIDs to project.
    marcel_players : iterable
        PlayerIDs that have a Marcel (pre-season) projection.
    player_dict : dict
        PlayerID -> season totals and stabilization weights.
    marcels_dict : dict
        PlayerID -> Marcel per-out rates.
    sum_data : pandas.DataFrame
        Season totals indexed by PlayerID, with ``Started`` and ``Games``.
    average_stats_by_position : pandas.DataFrame
        Per-out baseline rates indexed by role (0 = reliever, 1 = starter).

    Returns
    -------
    dict
        PlayerID -> ``{'SO', 'BB', 'HR', 'H-HR'}`` per-out rates. Pitchers with
        neither a Marcel projection nor any games get the full role-0 baseline
        row, which carries extra keys.

    The baseline depends on what is available:
      1. Marcel + games  -> blend season totals with Marcel rates.
      2. Marcel only     -> Marcel rates.
      3. Games only      -> blend season totals with the role average; the role
                            is starter when starts exceed relief appearances.
      4. Neither         -> role-0 average.
    """
    rate_stats = ['SO', 'BB', 'HR', 'H-HR']
    marcel_ids = set(marcel_players)  # O(1) membership instead of scanning a list

    def role_baseline(eff_pos):
        # Role averages re-normalised per out (a no-op when already per out).
        proj_by_position = average_stats_by_position.loc[eff_pos]
        return proj_by_position.divide(proj_by_position.TotalOutsPitched).to_dict()

    player_projs_dict = {}

    for player in players:
        season_totals = player_dict.get(player)

        if player in marcel_ids:
            baseline = marcels_dict[player]
            if season_totals is None:
                # Marcel projection but no games this season.
                player_projs_dict[player] = {stat: baseline[stat] for stat in rate_stats}
            else:
                player_projs_dict[player] = _blend_pitcher_rates(season_totals, baseline)

        elif season_totals is not None:
            # No Marcel projection, but games played: the role average is the baseline.
            starts = sum_data.loc[player, 'Started']
            games = sum_data.loc[player, 'Games']
            eff_pos = 1 if starts > (games - starts) else 0
            player_projs_dict[player] = _blend_pitcher_rates(season_totals, role_baseline(eff_pos))

        else:
            # Nothing known about this pitcher: use the role-0 baseline.
            player_projs_dict[player] = role_baseline(0)

    return player_projs_dict

In [190]:
# Per-out SO/BB/HR/H-HR rate projections for every PlayerID in `players`.
blended_projections_dict = create_blended_projections_pitchers(players, marcel_players, player_stabilization_dict, marcels_dict, sum_data, average_stats_by_position)


In [191]:
# Projected per-out rates for each salary row, rounded to 3 decimals; NaN when
# the player has no blended projection. `np.nan` is used because the `np.NaN`
# alias no longer exists in NumPy 2.0.
for _stat in ['SO', 'BB', 'HR', 'H-HR']:
    sals[f'p{_stat}/Out'] = sals['PlayerID'].map(
        lambda player_id, stat=_stat: round(blended_projections_dict[player_id][stat], 3)
        if player_id in blended_projections_dict else np.nan
    )

# Hit-by-pitch is not projected per pitcher: every row gets the league rate
# (league HBP divided by league outs, i.e. innings * 3).
sals['pHBP/Out'] = round(league_hbp / (league_innings * 3), 3)

# League ERA and the additive constant that aligns league FIP with league ERA.
league_ERA = (9 / league_innings) * league_ER
FIP_constant = league_ERA - (((13 * league_hr) + (3 * (league_bb + league_hbp)) - (2 * league_so)) / league_innings)


In [194]:
def get_prior_season_game_logs(date):
    """Load the previous season's regular-season starts by pitchers.

    Parameters
    ----------
    date : str
        Request date; the prior season is its first four characters minus one,
        and the whole string is appended to the download-folder name.

    Returns
    -------
    pandas.DataFrame
        Rows with ``SeasonType == 1``, ``PositionCategory == 'P'`` and
        ``Started == 1``, plus an ``ER/out`` column (earned runs per out,
        0 when no outs were recorded).

    Side effects
    ------------
    Changes the working directory: one level up, into the download folder to
    read the file, then one level up again, ending in ``daily-downloads``. The
    starting directory therefore matters; see the in-code comment.
    """
    season = str(int(date[:4]) - 1)

    # based on current file structure and the order of events in the master_projections formulas
    os.chdir('..')
    os.chdir(os.getcwd() + '/daily-downloads/Fantasy.2019-2022' + date)
    game_stats_prior = pd.read_csv(f'PlayerGame.{season}.csv')

    # Regular season only, pitchers only, starts only.
    keep = (
        (game_stats_prior['SeasonType'] == 1)
        & (game_stats_prior['PositionCategory'] == 'P')
        & (game_stats_prior['Started'] == 1)
    )
    game_stats_prior = game_stats_prior.loc[keep].reset_index(drop=True)

    # Earned runs per out; rows without a positive out count get 0 rather than
    # inf/NaN.
    outs = game_stats_prior['TotalOutsPitched']
    game_stats_prior['ER/out'] = (game_stats_prior['PitchingEarnedRuns'] / outs).where(outs > 0, 0)

    os.chdir('..')

    return game_stats_prior

In [196]:
# Prior-season starts. This notebook never assigned `game_stats_prior` before
# using it, so a fresh-kernel run failed with NameError; load it here.
game_stats_prior = get_prior_season_game_logs(d)

# Outs recorded per start: sum / mean / std per pitcher and league-wide, prior season.
prior_year_ind_pitcher_dist = game_stats_prior.groupby('PlayerID').TotalOutsPitched.agg(['sum', 'mean', 'std']).fillna(0)
prior_year_league_innings_dist = game_stats_prior.TotalOutsPitched.agg(['sum', 'mean', 'std'])

# Same distributions for the current season's starts.
current_year_starts = game_logs.loc[game_logs.Started == 1].reset_index(drop=True)
current_year_ind_pitcher_dist = current_year_starts.groupby('PlayerID').TotalOutsPitched.agg(['sum', 'mean', 'std']).fillna(0)
current_year_league_innings_dist = current_year_starts.TotalOutsPitched.agg(['sum', 'mean', 'std'])
current_year_outs = current_year_league_innings_dist['sum']

# League mean/std of outs per start: current season weighted by its total outs,
# prior season weighted as 10000 outs.
weighted_league_innings_dist_mean = ((current_year_league_innings_dist['mean'] * current_year_outs) + (prior_year_league_innings_dist['mean'] * 10000)) / (current_year_outs + 10000)
weighted_league_innings_dist_std = ((current_year_league_innings_dist['std'] * current_year_outs) + (prior_year_league_innings_dist['std'] * 10000)) / (current_year_outs + 10000)

# Outs per start recorded against each opponent this season.
current_year_starts_vs_team = game_logs.loc[game_logs.Started == 1].groupby('OpponentID').TotalOutsPitched.agg(['sum', 'mean', 'std']).fillna(0)


In [200]:
def get_vegas_lines(date, player_salaries_df, api_key=None):
    """Attach game and betting-line information to each salary row.

    Parameters
    ----------
    date : str
        Date string interpolated into the ``DfsSlatesByDate`` URL.
    player_salaries_df : pandas.DataFrame
        Salary rows containing at least ``SlateGameID`` and ``TeamID``.
    api_key : str, optional
        sportsdata.io subscription key. When omitted, the
        ``SPORTSDATA_API_KEY`` environment variable is used.

    Returns
    -------
    (pandas.DataFrame, list)
        The salary rows left-joined with their slate game, plus derived columns
        ``HomeOrAway``, ``PlayerTeamMoneyLine``, ``PlayerTeamPointSpread``,
        ``PlayerTeamVegasWinProb`` and ``PlayerTeamTotal``; and the unique,
        non-missing probable-pitcher IDs across all slate games.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the API returns no slates for ``date``.
    """
    # FIXME(security): the literal fallback below is the key that was committed
    # in this notebook. It is kept only so existing runs do not break; rotate it
    # and delete the fallback once SPORTSDATA_API_KEY is configured.
    key = api_key or os.environ.get('SPORTSDATA_API_KEY') or '6fcab751d8594ce9909283dcdc522d24'

    response = requests.get(
        f'https://api.sportsdata.io/api/mlb/fantasy/json/DfsSlatesByDate/{date}',
        headers={'Ocp-Apim-Subscription-Key': key},
        timeout=30,
    )
    response.raise_for_status()
    games = response.json()
    if not games:
        raise ValueError(f'No DFS slates returned for {date}')

    df_games = pd.json_normalize(games, record_path=['DfsSlateGames'])
    df_games = df_games[['SlateGameID', 'GameID', 'OperatorGameID', 'Game.Season', 'Game.Day', 'Game.AwayTeam', 'Game.HomeTeam', 'Game.AwayTeamID', 'Game.HomeTeamID', 'Game.StadiumID', 'Game.AwayTeamProbablePitcherID', 'Game.HomeTeamProbablePitcherID', 'Game.PointSpread', 'Game.OverUnder', 'Game.AwayTeamMoneyLine', 'Game.HomeTeamMoneyLine']]
    result_df = player_salaries_df.merge(df_games, how='left', on=['SlateGameID'])

    # Unique probable starters, home and away. dropna() removes both NaN and
    # None (the old `str(x) != 'nan'` test let None through), and unique()
    # keeps a stable first-seen order.
    probable_ids = pd.concat([
        df_games['Game.AwayTeamProbablePitcherID'],
        df_games['Game.HomeTeamProbablePitcherID'],
    ])
    starting_pitchers = probable_ids.dropna().unique().tolist()

    # A row is AWAY only when its team is the game's away team; everything else
    # (including rows with no matched game) is labelled HOME.
    result_df['HomeOrAway'] = result_df.apply(lambda row: 'AWAY' if row['Game.AwayTeamID'] == row['TeamID'] else 'HOME', axis=1)
    result_df['PlayerTeamMoneyLine'] = result_df.apply(lambda row: row['Game.AwayTeamMoneyLine'] if row['HomeOrAway'] == 'AWAY' else row['Game.HomeTeamMoneyLine'], axis=1)
    # The spread sign follows the money line: negative when the team's line is negative.
    result_df['PlayerTeamPointSpread'] = result_df.apply(lambda row: abs(row['Game.PointSpread']) * -1 if row['PlayerTeamMoneyLine'] < 0 else abs(row['Game.PointSpread']), axis=1)
    # American odds -> implied win probability.
    result_df['PlayerTeamVegasWinProb'] = result_df.apply(lambda row: 100 / (100 + row['PlayerTeamMoneyLine']) if row['PlayerTeamMoneyLine'] > 0 else row['PlayerTeamMoneyLine'] / (row['PlayerTeamMoneyLine'] - 100), axis=1)
    # Half the over/under, shifted by half the spread scaled by 100 / (|money line| + 100).
    result_df['PlayerTeamTotal'] = result_df.apply(lambda row: round((row['Game.OverUnder'] / 2) - ((row['PlayerTeamPointSpread'] * (100 / (abs(row['PlayerTeamMoneyLine']) + 100))) / 2), 2), axis=1)

    return result_df, starting_pitchers

In [202]:
# Salary rows joined with game and betting-line data, plus the probable starters' PlayerIDs.
sals_with_vegas_lines, starting_pitchers = get_vegas_lines(d, sals)

In [208]:
def adjust_for_park_factors_pitchers(sals_with_vegas_lines):
    """Scale each pitcher's per-out rates by the park factors of the game's stadium.

    Moves the working directory one level up, reads ``ParkFactors.csv`` there,
    left-joins it on ``Game.StadiumID`` and adds ``pK``, ``pBB``, ``pHR`` and
    ``pH`` columns. Factors are divided by 100 before being applied.
    """
    os.chdir('..')
    factors = pd.read_csv('ParkFactors.csv')
    adjusted = sals_with_vegas_lines.merge(factors, how='left', on=['Game.StadiumID'])

    # (output column, per-out rate column, park-factor column)
    single_factor_columns = (
        ('pK', 'pSO/Out', 'SO'),
        ('pBB', 'pBB/Out', 'BB'),
        ('pHR', 'pHR/Out', 'HR'),
    )
    for target, rate_column, factor_column in single_factor_columns:
        adjusted[target] = adjusted[rate_column] * adjusted[factor_column] / 100

    # Non-HR hits use a weighted mix of the 1B, 2B and HR factors.
    hit_factor = adjusted['1B'] * 0.65 + adjusted['2B'] * 0.2 + adjusted['HR'] * 0.15
    adjusted['pH'] = adjusted['pH-HR/Out'] * hit_factor / 100

    return adjusted



In [209]:
# Add park-factor columns and park-adjusted pK/pBB/pHR/pH.
# NOTE(review): the name is rebound to the function's own output, and the function also
# moves the working directory up one level, so re-running this cell repeats both.
sals_with_vegas_lines = adjust_for_park_factors_pitchers(sals_with_vegas_lines)

In [220]:
def _starter_outs_profile(starter, current_dist, prior_dist, league_mean, league_var):
    """Return ``(mean_outs, var_outs, sample_outs)`` for one starter.

    Current- and prior-season per-start distributions are combined weighted by
    the outs each contributes. With only one season available, that season is
    regressed toward the league values with a weight of 100 outs. With no
    history, or history totalling zero outs, the league values are returned
    with a sample size of 0.
    """
    cur = current_dist.loc[starter] if starter in current_dist.index else None
    pri = prior_dist.loc[starter] if starter in prior_dist.index else None

    if cur is not None and pri is not None:
        cur_outs, pri_outs = cur['sum'], pri['sum']
        total = cur_outs + pri_outs
        if total == 0:
            # Starts on record but no outs: avoid 0/0 and use the league prior.
            return league_mean, league_var, 0
        mean_outs = ((cur['mean'] * cur_outs) + (pri['mean'] * pri_outs)) / total
        var_outs = (((cur['std'] ** 2) * cur_outs) + ((pri['std'] ** 2) * pri_outs)) / total
        return mean_outs, var_outs, total

    single = cur if cur is not None else pri
    if single is not None:
        outs = single['sum']
        mean_outs = ((single['mean'] * outs) + (league_mean * 100)) / (outs + 100)
        var_outs = (((single['std'] ** 2) * outs) + (league_var * 100)) / (outs + 100)
        return mean_outs, var_outs, outs

    return league_mean, league_var, 0


def generate_starting_pitcher_projections(list_of_starters, result_df, prior_year_ind_pitcher_dist, prior_year_league_innings_dist, current_year_ind_pitcher_dist, current_year_league_innings_dist, current_year_outs, weighted_league_innings_dist_mean, weighted_league_innings_dist_std, current_year_starts_vs_team, FIP_constant, seed=None):
    """Simulate each probable starter's outing and summarise the projections.

    For every starter found in ``result_df``, a mean and standard deviation of
    outs recorded is built from three sources (opponent, the starter's own
    history, league), 1000 outings are drawn from a normal distribution, and
    the per-out rates are applied to each draw.

    Parameters
    ----------
    list_of_starters : iterable
        PlayerIDs of probable starters. IDs absent from ``result_df`` are skipped.
    result_df : pandas.DataFrame
        Salary rows with ``PlayerID``, ``TeamID``, ``Game.HomeTeamID``,
        ``Game.AwayTeamID``, ``PlayerTeamVegasWinProb`` and the per-out rate
        columns ``pSO/Out``, ``pBB/Out``, ``pHR/Out``, ``pH-HR/Out``,
        ``pHBP/Out``. The first row per player is used.
    prior_year_ind_pitcher_dist, current_year_ind_pitcher_dist : pandas.DataFrame
        Per-pitcher ``sum``/``mean``/``std`` of outs per start.
    prior_year_league_innings_dist, current_year_league_innings_dist, current_year_outs
        Accepted for interface compatibility; not used.
    weighted_league_innings_dist_mean, weighted_league_innings_dist_std : float
        League mean and standard deviation of outs per start.
    current_year_starts_vs_team : pandas.DataFrame
        Per-opponent ``sum``/``mean``/``std`` of outs per start.
    FIP_constant : float
        Additive constant in the FIP formula.
    seed : int, optional
        When given, the simulation uses its own seeded random generator and is
        reproducible. When omitted, the global NumPy random state is used.

    Returns
    -------
    dict
        PlayerID -> ``{'pIP', 'pK', 'pBB', 'pHR', 'pH', 'pHBP', 'pQS', 'pER',
        'pW'}``, each rounded to two decimals.
    """
    rng = np.random.RandomState(seed) if seed is not None else np.random

    league_mean = weighted_league_innings_dist_mean
    league_var = weighted_league_innings_dist_std ** 2

    all_starters = {}

    for starter in list_of_starters:
        starter_rows = result_df.loc[result_df.PlayerID == starter]
        if starter_rows.empty:
            # Probable starter with no salary row on any slate.
            continue
        info = starter_rows.iloc[0]

        # The opponent is whichever side of the game the starter's team is not.
        if info['TeamID'] == info['Game.HomeTeamID']:
            opponent_id = info['Game.AwayTeamID']
        else:
            opponent_id = info['Game.HomeTeamID']

        starter_team_w_pct = info['PlayerTeamVegasWinProb']
        if pd.isna(starter_team_w_pct):
            starter_team_w_pct = 0.5

        # Opponent component: how deep starters have gone against this team.
        # Its outs are down-weighted by a factor of 10.
        if opponent_id in current_year_starts_vs_team.index:
            vs_team = current_year_starts_vs_team.loc[opponent_id]
            mean_vs_team = vs_team['mean']
            var_vs_team = vs_team['std'] ** 2
            weighted_outs = vs_team['sum'] / 10
        else:
            mean_vs_team = league_mean
            var_vs_team = league_var
            weighted_outs = 0

        # Starter component, from the pitcher's own history.
        mean_of_starter, var_of_starter, total_outs = _starter_outs_profile(
            starter, current_year_ind_pitcher_dist, prior_year_ind_pitcher_dist, league_mean, league_var)

        # The league component always carries a weight of 100 outs.
        total_weight = weighted_outs + total_outs + 100
        combined_mean = ((mean_vs_team * weighted_outs) + (mean_of_starter * total_outs) + (league_mean * 100)) / total_weight
        combined_var = (
            (((weighted_outs / total_weight) ** 2) * var_vs_team)
            + (((total_outs / total_weight) ** 2) * var_of_starter)
            + (((100 / total_weight) ** 2) * league_var)
        )
        combined_std = np.sqrt(combined_var)

        # 1000 simulated outings, measured in outs. Draws are not truncated at zero.
        sim_outs = rng.normal(combined_mean, combined_std, 1000)
        sim_ip = sim_outs / 3

        sim_ks = sim_outs * info['pSO/Out']
        sim_bbs = sim_outs * info['pBB/Out']
        sim_hrs = sim_outs * info['pHR/Out']
        sim_hits = sim_outs * info['pH-HR/Out']
        sim_hbp = sim_outs * info['pHBP/Out']

        # Expected earned runs = FIP / 9 * IP, with
        # FIP = (13*HR + 3*(BB + HBP) - 2*K) / IP + FIP_constant. Multiplying
        # through by IP avoids dividing by a simulated IP of zero.
        fip_total_er = ((13 * sim_hrs) + (3 * (sim_bbs + sim_hbp)) - (2 * sim_ks)) / 9 + (FIP_constant * sim_ip) / 9
        sim_er = rng.normal(fip_total_er, 1)

        quality_start = (sim_ip >= 6) & (sim_er <= 3)
        over_5 = sim_ip >= 5

        all_starters[starter] = {
            'pIP': round(float(np.mean(sim_ip)), 2),
            'pK': round(float(np.mean(sim_ks)), 2),
            'pBB': round(float(np.mean(sim_bbs)), 2),
            'pHR': round(float(np.mean(sim_hrs)), 2),
            'pH': round(float(np.mean(sim_hrs) + np.mean(sim_hits)), 2),
            'pHBP': round(float(np.mean(sim_hbp)), 2),
            'pQS': round(float(np.mean(quality_start)), 2),
            'pER': round(float(np.mean(sim_er)), 2),
            # A win needs at least five innings and the team winning.
            'pW': round(float(np.mean(over_5) * starter_team_w_pct), 2),
        }

    return all_starters

In [221]:
# Simulated per-start projections keyed by PlayerID for the probable starters.
all_starters_projections_dict = generate_starting_pitcher_projections(starting_pitchers, sals_with_vegas_lines, prior_year_ind_pitcher_dist, prior_year_league_innings_dist, current_year_ind_pitcher_dist, current_year_league_innings_dist, current_year_outs, weighted_league_innings_dist_mean, weighted_league_innings_dist_std, current_year_starts_vs_team, FIP_constant)


In [257]:
def generate_projection_df_pitchers(sals_df, all_starters_dict, FIP_constant):
    """Fill in projected stat lines and fantasy points for every salary row.

    Rows whose ``PlayerID`` is in ``all_starters_dict`` take the simulated
    starter values. All other rows are projected as a one-inning appearance
    (three outs at the row's per-out rates, no win, no quality start).
    ``sals_df`` is modified in place; the returned frame is a column subset of it.
    """

    def starter_value_or(stat, fallback):
        # Row function: the starter's simulated value if available, else fallback(row).
        def pick(row):
            player_id = row['PlayerID']
            if player_id in all_starters_dict:
                return all_starters_dict[player_id][stat]
            return fallback(row)
        return pick

    def one_inning_er(row):
        # FIP over a single inning, converted to earned runs for that inning.
        fip_numerator = (13 * row['pHR']) + (3 * (row['pBB'] + row['pHBP'])) - (2 * row['pK'])
        return round(((fip_numerator / 1) + FIP_constant) / 9, 2)

    # Order matters: pER's fallback reads pHR, pBB, pHBP and pK written above it.
    fallbacks = [
        ('pIP', lambda row: 1),
        ('pW', lambda row: 0),
        ('pQS', lambda row: 0),
        ('pK', lambda row: row['pSO/Out'] * 3),
        ('pBB', lambda row: row['pBB/Out'] * 3),
        ('pHR', lambda row: row['pHR/Out'] * 3),
        ('pH', lambda row: (row['pH-HR/Out'] + row['pHR/Out']) * 3),
        ('pHBP', lambda row: row['pHBP/Out'] * 3),
        ('pER', one_inning_er),
    ]
    for column, fallback in fallbacks:
        sals_df[column] = sals_df.apply(starter_value_or(column, fallback), axis=1)

    def batters_faced(row):
        return 3 * row['pIP'] + row['pBB'] + row['pH'] + row['pHBP']

    def draftkings_points(row):
        return round(row['pIP'] * 2.25 + row['pK'] * 2 + row['pW'] * 4 + row['pER'] * -2 + row['pH'] * -0.6 + row['pBB'] * -0.6 + row['pHBP'] * -0.6, 2)

    def fanduel_points(row):
        return round(row['pW'] * 6 + row['pQS'] * 4 + row['pER'] * -3 + row['pK'] * 3 + row['pIP'] * 3, 2)

    sals_df['pBF'] = sals_df.apply(batters_faced, axis=1)
    sals_df['DraftKingsPoints'] = sals_df.apply(draftkings_points, axis=1)
    sals_df['FanDuelPoints'] = sals_df.apply(fanduel_points, axis=1)

    output_columns = ['PlayerID','SlateID', 'Operator', 'OperatorPlayerID', 'TeamID', 'OperatorSalary','OperatorGameType', 'SalaryCap', 'OperatorPlayerName', 'OperatorPosition', 'OperatorRosterSlots', 'pIP', 'pW', 'pQS', 'pK', 'pBB', 'pHR', 'pH', 'pHBP', 'pER', 'pBF', 'DraftKingsPoints', 'FanDuelPoints']
    projection_df = sals_df[output_columns].reset_index(drop=True)

    return projection_df

In [258]:
# Final projection table: stat lines plus DraftKings and FanDuel points per salary row.
# NOTE(review): generate_projection_df_pitchers rewrites the pK/pBB/pHR/pH columns that
# adjust_for_park_factors_pitchers wrote, so the park adjustment does not reach these
# projections — confirm intended.
projection_df = generate_projection_df_pitchers(sals_with_vegas_lines, all_starters_projections_dict, FIP_constant)


In [262]:
# One row per projected starter. Non-starters are assigned pIP == 1 by
# generate_projection_df_pitchers, so `pIP > 1` keeps only simulated starters;
# duplicates arise because a pitcher appears once per slate.
p = (
    projection_df[['PlayerID', 'TeamID', 'OperatorPlayerName', 'pIP', 'pK', 'pBB', 'pHR', 'pH', 'pHBP', 'pBF']]
    .loc[lambda frame: frame['pIP'] > 1]
    .drop_duplicates(subset=None, keep='first')
    .reset_index(drop=True)
)
p

Unnamed: 0,PlayerID,TeamID,OperatorPlayerName,pIP,pK,pBB,pHR,pH,pHBP,pBF
0,10000432,30.0,Justin Verlander,5.03,5.48,1.39,0.77,3.97,0.24,20.69
1,10009500,19.0,Bruce Zimmermann,4.68,4.71,1.63,0.79,4.64,0.22,20.53
2,10010729,17.0,Tarik Skubal,5.02,5.35,1.43,0.96,4.57,0.24,21.3
3,10005575,16.0,Michael Kopech,4.47,5.27,1.58,0.56,3.74,0.21,18.94
4,10005970,5.0,Brad Keller,5.16,4.33,1.97,0.56,4.97,0.25,22.67
5,10010317,20.0,Bailey Ober,4.81,4.51,1.3,0.81,4.59,0.23,20.55
6,10000986,29.0,Jameson Taillon,4.95,4.65,1.37,0.8,4.54,0.24,21.0
7,10007369,13.0,Chris Flexen,5.61,4.14,1.55,0.62,5.57,0.27,24.22
8,10005998,23.0,Austin Gomber,4.91,4.73,1.93,0.69,4.28,0.24,21.18
9,10001253,28.0,Martín Pérez,4.68,4.03,1.77,0.65,4.85,0.22,20.88


In [267]:
# Write the starter projections to the current working directory.
# NOTE(review): the file name hardcodes "apr28" while the request date `d` is computed
# from today's date, so runs on other days overwrite this file — confirm intended.
p.to_csv('pitchers-apr28.csv', index=False)