In [1]:
# for data
from pyNBA.Data.constants import ROTO_NAME_TO_NBA_NAME, DB_TEAM_TO_NBA_TEAM, BAD_CONTEST_SUBSTRINGS, OWNERSHIP_NAME_TO_NBA_NAME
from pyNBA.Data.data import QueryData
from pyNBA.Models.helpers import CleanData
import pandas as pd
import numpy as np
from numpy.random import normal

# for models
from pyNBA.DFS.rules import FPCalculator
from pyNBA.DFS.constants import Site
from pyNBA.Models.fantasy_points import FPModel
from pyNBA.Models.variance import VarianceModel
from pyNBA.Models.ownership import OwnershipModel
import pulp
from pydfs_lineup_optimizer.solvers.pulp_solver import PuLPSolver

# for backtesting
import time
from pydfs_lineup_optimizer import get_optimizer, Sport
from pydfs_lineup_optimizer import Player
from numpy.random import normal
from numpy.random import choice
import scipy.stats as st

# for plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
from datetime import datetime
import seaborn as sns; sns.set()

# misc
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Entry points into the project's data layer and its cleaning helpers.
query_data = QueryData()
clean_data = CleanData()

# pull boxscore data
boxscores = query_data.query_boxscore_data()

# we are only interested in the boxscores of healthy and non-resting players
boxscores = clean_data.drop_rows_player_injured(boxscores)
boxscores = clean_data.drop_rows_player_rest(boxscores)

# historical DFS salary data; NAME is the salary feed's PLAYER value mapped through
# clean_data.roto_name_to_nba_name so it can be joined against the boxscores
salary_data = query_data.query_salary_data()
salary_data = salary_data.rename(columns={'POSITION': 'DFS_POSITION'})
salary_data['NAME'] = salary_data['PLAYER'].apply(clean_data.roto_name_to_nba_name)

# historical DFS contest data. I am only interested in predicting ownership in Classic GPP competitions.
contest_data = query_data.query_contest_data()
contest_data['MAXROI'] = contest_data['TOPPRIZE']/contest_data['ENTRYFEE']

# na=False: a contest with a missing name is treated as "not bad" instead of putting NaN into the
# boolean mask, which would break the `~` negation below.
is_bad_contest = contest_data['CONTESTNAME'].str.lower().str.contains(
    '|'.join(BAD_CONTEST_SUBSTRINGS), na=False
)
contest_data = contest_data.loc[
    (contest_data['SLATETYPE'] == 'Classic') & (contest_data['CASHLINE'] > 200) & (contest_data['MAXROI'] > 2) &
    (~is_bad_contest)
].dropna(subset=['CASHLINE'])

# contest payout structure
contest_info = query_data.query_contest_info_data()

# historical player ownership; names without an entry in OWNERSHIP_NAME_TO_NBA_NAME are kept as-is
ownership_data = query_data.query_ownership_data()
ownership_data['NAME'] = ownership_data['PLAYERNAME'].apply(
    lambda x: OWNERSHIP_NAME_TO_NBA_NAME.get(x, x)
    )

# historical vegas odds data
odds_data = query_data.query_odds_data()

# historical quarterly boxscore data
quarterly_boxscore_data = query_data.query_quarterly_boxscore_data()

/Users/brandonshimiaie/Projects/pyNBA/sqlite/db/nba.db
2.6.0


In [3]:
# DraftKings scoring rules, used to calculate fantasy points
DKFPCalculator = FPCalculator(Site.DRAFTKINGS)

# total rebounds = defensive rebounds + offensive rebounds
boxscores['REB'] = boxscores['DREB'].add(boxscores['OREB'])

# distinct seasons, in order of first appearance in the boxscores.
# NOTE(review): the train/test split cell treats the LAST entry as the test season, which
# presumes the boxscores arrive in chronological order — TODO confirm.
season_list = [*boxscores['SEASON'].unique()]
    
# optimize Draftkings NBA lineups
class CustomPuLPSolver(PuLPSolver):
    """PuLPSolver subclass whose LP_SOLVER is PuLP's GLPK command-line solver, created with msg=0."""
    # Instantiated once, when the class body is executed.
    # presumably this needs the GLPK executable to be installed and on PATH — TODO confirm
    LP_SOLVER = pulp.GLPK_CMD(msg=0)
# DraftKings basketball optimizer that solves through the custom solver class above
optimizer = get_optimizer(Site.DRAFTKINGS, Sport.BASKETBALL, solver=CustomPuLPSolver)

In [4]:
# Hold out the last entry of season_list as the test season; every other season is training data.
test_season_index = len(season_list) - 1
fp_train_data = boxscores.loc[boxscores['SEASON'].isin(season_list[0:test_season_index])]
# Same split expressed as "not the test season" (unlike the isin() above, rows with a missing
# SEASON would be kept here).
ownership_train_data = boxscores.loc[~(boxscores['SEASON'] == season_list[test_season_index])]
test_data = boxscores.loc[boxscores['SEASON'] == season_list[test_season_index]]

# The time.ctime() prints bracket each stage so its wall-clock cost shows up in the cell output.
print(time.ctime())

# projected fantasy points
fp_model = FPModel(fp_train_data, test_data, Site.DRAFTKINGS)
fp_model.train_model(quarterly_boxscore_data, odds_data)
# predict() returns the prediction frame together with the name of its projection column
predictions, fp_hat_column = fp_model.predict()
# rows without a projection cannot be used downstream
predictions = predictions.dropna(subset=[fp_hat_column])

print(time.ctime())

# actual fantasy points
predictions['DKFP'] = predictions.apply(
    lambda x: DKFPCalculator.calculate_fantasy_points(
        x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'], x['BLK'], x['STL'], x['FG3M']
    ), 
    axis=1
)

print(time.ctime())

# fantasy point variance
variance_model = VarianceModel(predictions)
predictions, variance_column = variance_model.predict('DKFP', fp_hat_column)
# Express the variance model's output as a fraction of the projection.
# NOTE(review): presumably variance_column is in fantasy-point units (a predicted residual size) — confirm.
pct_variance_column = 'RESIDUAL_PCT'
predictions[pct_variance_column] = predictions[variance_column]/predictions[fp_hat_column]
# A zero projection makes the ratio +/-inf; turn those into NaN so they receive the default below.
predictions[pct_variance_column] = predictions[pct_variance_column].replace([np.inf, -np.inf], np.nan)
# Missing ratios default to 0.25 ...
predictions[pct_variance_column] = predictions[pct_variance_column].fillna(0.25)
# ... and every ratio is floored at 0.05.
predictions[pct_variance_column] = predictions[pct_variance_column].clip(lower=0.05)
print(time.ctime())

# projected ownership
ownership_model = OwnershipModel(ownership_train_data, test_data, Site.DRAFTKINGS)
ownership_model.create_features(salary_data, contest_data, ownership_data, odds_data)
ownership_model.train_model()
# predict() returns the ownership frame together with the name of its prediction column
ownership_predictions, ownership_column = ownership_model.predict()


# Attach DraftKings salary/position to each projection; players without a DraftKings salary row
# for that date are dropped.
dk_salary_data = salary_data.loc[salary_data['SITE'] == Site.DRAFTKINGS]
predictions = predictions.merge(dk_salary_data, on=['DATE', 'NAME'], how='left')
predictions = predictions.dropna(subset=['SALARY', 'DFS_POSITION'])

# Attach projection, relative variance and salary to every ownership prediction row.
ownership_predictions = ownership_predictions.merge(
    predictions[['DATE', 'NAME', fp_hat_column, pct_variance_column, 'SALARY']],
    on=['DATE', 'NAME'], how='left'
)

print(time.ctime())

Thu Jul  2 14:59:17 2020
Thu Jul  2 15:43:23 2020
Thu Jul  2 15:43:25 2020
Thu Jul  2 15:43:25 2020
[0]	validation_0-mae:0.40375
Will train until validation_0-mae hasn't improved in 25 rounds.
[1]	validation_0-mae:0.38428
[2]	validation_0-mae:0.36580
[3]	validation_0-mae:0.34830
[4]	validation_0-mae:0.33170
[5]	validation_0-mae:0.31595
[6]	validation_0-mae:0.30104
[7]	validation_0-mae:0.28691
[8]	validation_0-mae:0.27351
[9]	validation_0-mae:0.26084
[10]	validation_0-mae:0.24884
[11]	validation_0-mae:0.23747
[12]	validation_0-mae:0.22668
[13]	validation_0-mae:0.21650
[14]	validation_0-mae:0.20686
[15]	validation_0-mae:0.19773
[16]	validation_0-mae:0.18910
[17]	validation_0-mae:0.18093
[18]	validation_0-mae:0.17319
[19]	validation_0-mae:0.16587
[20]	validation_0-mae:0.15892
[21]	validation_0-mae:0.15236
[22]	validation_0-mae:0.14617
[23]	validation_0-mae:0.14028
[24]	validation_0-mae:0.13475
[25]	validation_0-mae:0.12948
[26]	validation_0-mae:0.12449
[27]	validation_0-mae:0.11979
[28]	v

[263]	validation_0-mae:0.02887
[264]	validation_0-mae:0.02882
[265]	validation_0-mae:0.02881
[266]	validation_0-mae:0.02878
[267]	validation_0-mae:0.02876
[268]	validation_0-mae:0.02876
[269]	validation_0-mae:0.02874
[270]	validation_0-mae:0.02869
[271]	validation_0-mae:0.02864
[272]	validation_0-mae:0.02862
[273]	validation_0-mae:0.02858
[274]	validation_0-mae:0.02854
[275]	validation_0-mae:0.02851
[276]	validation_0-mae:0.02847
[277]	validation_0-mae:0.02844
[278]	validation_0-mae:0.02842
[279]	validation_0-mae:0.02836
[280]	validation_0-mae:0.02833
[281]	validation_0-mae:0.02832
[282]	validation_0-mae:0.02828
[283]	validation_0-mae:0.02825
[284]	validation_0-mae:0.02820
[285]	validation_0-mae:0.02817
[286]	validation_0-mae:0.02814
[287]	validation_0-mae:0.02810
[288]	validation_0-mae:0.02807
[289]	validation_0-mae:0.02803
[290]	validation_0-mae:0.02800
[291]	validation_0-mae:0.02796
[292]	validation_0-mae:0.02792
[293]	validation_0-mae:0.02786
[294]	validation_0-mae:0.02782
[295]	va

[528]	validation_0-mae:0.02273
[529]	validation_0-mae:0.02271
[530]	validation_0-mae:0.02271
[531]	validation_0-mae:0.02268
[532]	validation_0-mae:0.02266
[533]	validation_0-mae:0.02263
[534]	validation_0-mae:0.02261
[535]	validation_0-mae:0.02258
[536]	validation_0-mae:0.02258
[537]	validation_0-mae:0.02255
[538]	validation_0-mae:0.02253
[539]	validation_0-mae:0.02252
[540]	validation_0-mae:0.02249
[541]	validation_0-mae:0.02247
[542]	validation_0-mae:0.02247
[543]	validation_0-mae:0.02244
[544]	validation_0-mae:0.02242
[545]	validation_0-mae:0.02241
[546]	validation_0-mae:0.02240
[547]	validation_0-mae:0.02239
[548]	validation_0-mae:0.02238
[549]	validation_0-mae:0.02236
[550]	validation_0-mae:0.02233
[551]	validation_0-mae:0.02232
[552]	validation_0-mae:0.02230
[553]	validation_0-mae:0.02228
[554]	validation_0-mae:0.02226
[555]	validation_0-mae:0.02224
[556]	validation_0-mae:0.02222
[557]	validation_0-mae:0.02221
[558]	validation_0-mae:0.02220
[559]	validation_0-mae:0.02218
[560]	va

[793]	validation_0-mae:0.01931
[794]	validation_0-mae:0.01930
[795]	validation_0-mae:0.01928
[796]	validation_0-mae:0.01928
[797]	validation_0-mae:0.01927
[798]	validation_0-mae:0.01927
[799]	validation_0-mae:0.01926
[800]	validation_0-mae:0.01924
[801]	validation_0-mae:0.01924
[802]	validation_0-mae:0.01921
[803]	validation_0-mae:0.01920
[804]	validation_0-mae:0.01920
[805]	validation_0-mae:0.01919
[806]	validation_0-mae:0.01918
[807]	validation_0-mae:0.01917
[808]	validation_0-mae:0.01916
[809]	validation_0-mae:0.01915
[810]	validation_0-mae:0.01914
[811]	validation_0-mae:0.01913
[812]	validation_0-mae:0.01913
[813]	validation_0-mae:0.01911
[814]	validation_0-mae:0.01910
[815]	validation_0-mae:0.01910
[816]	validation_0-mae:0.01909
[817]	validation_0-mae:0.01906
[818]	validation_0-mae:0.01906
[819]	validation_0-mae:0.01905
[820]	validation_0-mae:0.01904
[821]	validation_0-mae:0.01903
[822]	validation_0-mae:0.01903
[823]	validation_0-mae:0.01901
[824]	validation_0-mae:0.01901
[825]	va

[1056]	validation_0-mae:0.01734
[1057]	validation_0-mae:0.01733
[1058]	validation_0-mae:0.01732
[1059]	validation_0-mae:0.01731
[1060]	validation_0-mae:0.01731
[1061]	validation_0-mae:0.01730
[1062]	validation_0-mae:0.01730
[1063]	validation_0-mae:0.01729
[1064]	validation_0-mae:0.01728
[1065]	validation_0-mae:0.01727
[1066]	validation_0-mae:0.01726
[1067]	validation_0-mae:0.01725
[1068]	validation_0-mae:0.01725
[1069]	validation_0-mae:0.01725
[1070]	validation_0-mae:0.01724
[1071]	validation_0-mae:0.01724
[1072]	validation_0-mae:0.01723
[1073]	validation_0-mae:0.01723
[1074]	validation_0-mae:0.01722
[1075]	validation_0-mae:0.01722
[1076]	validation_0-mae:0.01722
[1077]	validation_0-mae:0.01722
[1078]	validation_0-mae:0.01721
[1079]	validation_0-mae:0.01720
[1080]	validation_0-mae:0.01719
[1081]	validation_0-mae:0.01719
[1082]	validation_0-mae:0.01718
[1083]	validation_0-mae:0.01718
[1084]	validation_0-mae:0.01717
[1085]	validation_0-mae:0.01716
[1086]	validation_0-mae:0.01716
[1087]	v

[1313]	validation_0-mae:0.01612
[1314]	validation_0-mae:0.01612
[1315]	validation_0-mae:0.01611
[1316]	validation_0-mae:0.01611
[1317]	validation_0-mae:0.01611
[1318]	validation_0-mae:0.01610
[1319]	validation_0-mae:0.01610
[1320]	validation_0-mae:0.01610
[1321]	validation_0-mae:0.01609
[1322]	validation_0-mae:0.01609
[1323]	validation_0-mae:0.01608
[1324]	validation_0-mae:0.01608
[1325]	validation_0-mae:0.01607
[1326]	validation_0-mae:0.01607
[1327]	validation_0-mae:0.01607
[1328]	validation_0-mae:0.01606
[1329]	validation_0-mae:0.01606
[1330]	validation_0-mae:0.01606
[1331]	validation_0-mae:0.01605
[1332]	validation_0-mae:0.01605
[1333]	validation_0-mae:0.01605
[1334]	validation_0-mae:0.01605
[1335]	validation_0-mae:0.01605
[1336]	validation_0-mae:0.01605
[1337]	validation_0-mae:0.01604
[1338]	validation_0-mae:0.01604
[1339]	validation_0-mae:0.01603
[1340]	validation_0-mae:0.01603
[1341]	validation_0-mae:0.01603
[1342]	validation_0-mae:0.01603
[1343]	validation_0-mae:0.01603
[1344]	v

In [79]:
def calc_p(mu, var, x, base_variance=400):
    """Return the upper-tail probability P(S > x) for S ~ Normal(mu, base_variance + var).

    Parameters
    ----------
    mu : float or array-like
        Mean of the score distribution.
    var : float or array-like
        Variance of the score distribution, before ``base_variance`` is added.
    x : float
        Score threshold to exceed.
    base_variance : float, optional
        Constant variance added to ``var`` before taking the square root. Defaults to 400,
        the value that used to be hard-coded.
        NOTE(review): presumably this models uncertainty in the threshold itself — confirm.

    Returns
    -------
    float or numpy.ndarray
        Probability in [0, 1].
    """
    z = (x - mu)/np.sqrt(base_variance + var)
    # Survival function rather than 1 - cdf(z): the subtraction rounds to exactly 0.0 once
    # cdf(z) is within floating-point epsilon of 1, whereas sf() keeps the tail's precision.
    return st.norm.sf(z)

def get_contest_prize(contest_info, score):
    """Return the largest prize whose MINPOINTS threshold is at or below ``score``.

    Parameters
    ----------
    contest_info : pandas.DataFrame
        Payout rows of a single contest; must have 'MINPOINTS' and 'PRIZE' columns.
    score : float
        Score achieved by the lineup.

    Returns
    -------
    number
        The best prize reached, or 0 when the score reaches no payout tier (including the
        case where ``contest_info`` has no rows).
    """
    reached_prizes = contest_info.loc[contest_info['MINPOINTS'] <= score, 'PRIZE']
    prize = reached_prizes.max()
    # max() of an empty (or all-NaN) selection is NaN; returning it would turn any running
    # profit/loss total it is added to into NaN for good.
    return 0 if pd.isna(prize) else prize

In [None]:
# --- backtest configuration ---
max_entry_fee_per_contest = 1000  # cap on the total entry fees spent in a single contest
pnl = 0                           # running profit/loss over every lineup entered so far

MAX_CONTEST_ENTRY_FEE = 250       # contests with a higher entry fee are not considered
OPTIMIZER_MAX_DEVIATION = 4       # passed to optimizer.set_max_deviation
OPTIMIZER_MIN_SALARY_CAP = 49000  # passed to optimizer.set_min_salary_cap
OPTIMIZER_MAX_EXPOSURE = 0.5      # passed to optimizer.optimize(max_exposure=...)


def _build_optimizer_players(board):
    """Turn every row of ``board`` into a pydfs_lineup_optimizer ``Player``.

    Reads the notebook-level ``fp_hat_column`` and ``pct_variance_column`` names.
    """
    optimizer_players = []
    for _, row in board.iterrows():
        name_parts = row['NAME'].split()
        first_name = name_parts[0]
        # only the second token is used as last name; any further tokens are dropped
        last_name = name_parts[1] if len(name_parts) > 1 else ''
        optimizer_players.append(
            Player(
                row['PLAYERID'], first_name, last_name, row['DFS_POSITION'].split('_'), row['TEAM'],
                row['SALARY'], row[fp_hat_column],
                # NOTE(review): the relative variance is halved before being handed over — reason not shown here
                standard_deviation=row[pct_variance_column]/2
            )
        )
    return optimizer_players


def _summarize_lineup(lineup, contest_board):
    """Return one record (ids, projected mean, ownership-weighted variance, realised score) for a lineup.

    Reads the notebook-level ``fp_hat_column``, ``pct_variance_column`` and ``ownership_column`` names.
    Only players present in ``contest_board`` contribute to the sums.
    """
    player_ids = [lineup_player.id for lineup_player in lineup.lineup]
    lineup_players = contest_board.loc[contest_board['PLAYERID'].isin(player_ids)]

    # per-player variance = (relative variance * projection)^2, weighted by (1 - projected ownership)
    player_variance = (lineup_players[pct_variance_column]*lineup_players[fp_hat_column])**2
    partial_variance = (1 - lineup_players[ownership_column])*player_variance

    return {
        'PLAYERIDS': player_ids,
        'MEAN_FP': lineup_players[fp_hat_column].sum(),
        'VARIANCE': partial_variance.sum(),
        'REALSCORE': lineup_players['DKFP'].sum(),
    }


# contests played within the date range of the test season
season_contests = contest_data.loc[
    (contest_data['DATE'] >= test_data['DATE'].min()) & (contest_data['DATE'] <= test_data['DATE'].max())
]

# iterate through each slate in the season
for (date, _), slate_contest_data in season_contests.groupby(['DATE', 'SLATEID']):
    print(time.ctime(), date)

    # get players in slate (slate-level fields are read from the slate's first contest)
    first_contest = slate_contest_data.iloc[0]
    slate_teams = [DB_TEAM_TO_NBA_TEAM.get(team, team) for team in first_contest['TEAMS'].split('_')]
    board = predictions.loc[
        (predictions['DATE'] == date) &
        (predictions['TEAM'].isin(slate_teams))
    ]

    # skip the slate unless the projected teams match the slate's teams exactly and there are
    # two teams per game
    game_count = first_contest['GAMECOUNT']
    sym_dif = set(board['TEAM'].unique()).symmetric_difference(slate_teams)
    if sym_dif or len(slate_teams) != game_count*2:
        print('ERROR')
        continue

    # iterate through each contest in the slate
    affordable_contests = slate_contest_data.loc[slate_contest_data['ENTRYFEE'] <= MAX_CONTEST_ENTRY_FEE]
    for _, contest in affordable_contests.iterrows():

        contest_total_entries = contest['TOTALENTRIES']
        contest_id = contest['CONTESTID']

        contest_ownership_predictions = ownership_predictions.loc[
            ownership_predictions['CONTESTID'] == contest_id
        ]

        # score a lineup has to beat.
        # NOTE(review): TOPSCORE appears to be the contest's realised top score, i.e. information that
        # is not available before the contest locks — confirm this look-ahead is intended.
        mean_max_score = contest['TOPSCORE']

        # generate lineups for contest
        optimizer.load_players(_build_optimizer_players(board))
        optimizer.set_max_deviation(OPTIMIZER_MAX_DEVIATION)
        optimizer.set_min_salary_cap(OPTIMIZER_MIN_SALARY_CAP)

        max_contest_entries = int(contest['MAXENTRIES'])
        n = int(1.5*max_contest_entries + 10)
        lineups = optimizer.optimize(n=n, randomness=True, max_exposure=OPTIMIZER_MAX_EXPOSURE)

        try:
            contest_board = contest_ownership_predictions[['DATE', 'NAME', ownership_column]].merge(
                board, on=['DATE', 'NAME'], how='left'
            )
            # `lineups` is consumed here, so optimizer failures surface inside this try block
            lineup_records = [_summarize_lineup(lineup, contest_board) for lineup in lineups]
        except Exception as exc:
            # Exception (not a bare except) so that interrupting the kernel still works
            print('COULD NOT GENERATE LINEUPS: {!r}'.format(exc))
            continue

        if not lineup_records:
            print('COULD NOT GENERATE LINEUPS')
            continue

        # build the frame once instead of appending row by row
        lineup_df = pd.DataFrame(lineup_records, columns=['PLAYERIDS', 'MEAN_FP', 'VARIANCE', 'REALSCORE'])

        # isolate +ev lineups
        lineup_df['PVALUE'] = calc_p(lineup_df['MEAN_FP'], lineup_df['VARIANCE'], mean_max_score)

        # a lineup is kept only if its chance of beating the target exceeds 1 / (number of entries)
        min_win_probability = float(1/contest['TOTALENTRIES'])
        if contest['ENTRYFEE'] > 0:
            contest_entries = min(max_contest_entries, int(max_entry_fee_per_contest/contest['ENTRYFEE']))
        else:
            contest_entries = max_contest_entries
        plus_ev_lineups = (
            lineup_df.loc[lineup_df['PVALUE'] > min_win_probability]
            .sort_values(by='PVALUE', ascending=False)
            .head(int(contest_entries))
        )

        # get lineup results; assign() returns a new frame, so the contest_info slice is not written to
        entered_contest_info = contest_info.loc[contest_info['CONTESTID'] == contest_id]
        entered_contest_info = entered_contest_info.assign(
            MINPOINTS=entered_contest_info['MINPOINTS'].fillna(0)
        )

        if plus_ev_lineups.empty:
            print('Not Entering Contest: {} {} {}'.format(str(game_count), str(contest_total_entries), str(max_contest_entries)))

        for _, entry in plus_ev_lineups.iterrows():
            display(contest_board.loc[contest_board['PLAYERID'].isin(entry['PLAYERIDS']), ['NAME', fp_hat_column, pct_variance_column, ownership_column]])
            prize = get_contest_prize(entered_contest_info, entry['REALSCORE'])
            pnl += (prize - contest['ENTRYFEE'])

        print(round(pnl, 2))
                
            