In [1]:
!pip install basketball_reference_web_scraper
import sklearn
import numpy as np
import pandas as pd
import array as array

import csv
import requests
from bs4 import BeautifulSoup
from basketball_reference_web_scraper import client
import basketball_reference_web_scraper.data as br_data

Collecting basketball_reference_web_scraper
  Downloading basketball_reference_web_scraper-4.11.0-py3-none-any.whl (24 kB)
Collecting idna==2.7
  Downloading idna-2.7-py2.py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 2.1 MB/s eta 0:00:01
[?25hCollecting certifi==2018.10.15
  Downloading certifi-2018.10.15-py2.py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 5.2 MB/s eta 0:00:01
[?25hCollecting urllib3==1.24.3
  Downloading urllib3-1.24.3-py2.py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 4.3 MB/s eta 0:00:01
[?25hCollecting lxml==4.5.1
  Downloading lxml-4.5.1-cp38-cp38-macosx_10_9_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 2.3 MB/s eta 0:00:01
[?25hCollecting requests==2.20.0
  Downloading requests-2.20.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 4.7 MB/s eta 0:00:011
Collecting pytz==2018.6
  Downloading pytz-2018.6-py2.py3-none-any.wh

ModuleNotFoundError: No module named 'bs4'

We first create our target variable: a total-fantasy-points (TFP) metric for the 2019-20 season, indexed by each player's slug.

In [None]:
target_vec = []
player_data_2020 = client.players_season_totals(season_end_year=2020)

# Counting stats that add to the fantasy total, and the ones that subtract from it.
# Makes and attempts are both listed, so every attempt costs a point and every make
# earns one back.
plus_stats = ("assists", "blocks", "defensive_rebounds", "offensive_rebounds", "points", "steals",
              "made_field_goals", "made_free_throws", "made_three_point_field_goals")
minus_stats = ("turnovers", "attempted_field_goals", "attempted_free_throws",
               "attempted_three_point_field_goals")

# A slug can occur in several rows (presumably one per team for traded players -- confirm
# against the scraper); the per-row totals are summed into a single entry per slug.
player_map = {}
for season_row in player_data_2020:
  row_total = sum(season_row[s] for s in plus_stats) - sum(season_row[s] for s in minus_stats)
  player_map[season_row["slug"]] = player_map.get(season_row["slug"], 0) + row_total

# One-column frame: index = player slug, "TFP" = accumulated fantasy total.
target_df = pd.DataFrame(list(player_map.values()), index = player_map.keys(), columns = ["TFP"])
target_df

Unnamed: 0,TFP
adamsst01,1156
adebaba01,1740
aldrila01,1040
alexaky01,3
alexani01,144
...,...
youngtr01,1233
zelleco01,789
zellety01,3
zizican01,134


Below we build a feature set using advanced stats from the 2018-19 season. The aim is a proof of concept for the model: use the 2018-19 stats to predict the 2019-20 numbers. Because of the COVID-19 pandemic, the numbers from the two seasons will be on markedly different scales, but for draft purposes we care more about the relative ordering of players than about absolute output. There are ways to extend the numbers to create a more accurate and precise fantasy outlook, but I'm mainly just toying around for the purposes of the draft; the end goal is to plug in 2019-20 advanced stats and predict a relative fantasy outlook for the 2020-21 season.

In [None]:
# Feature columns, in the order they appear in featX. Positions 0-2 (age, games, minutes)
# are merged specially below; every later column is treated as a rate and averaged.
feature_columns = ["age", "games_played", "minutes_played", "player_efficiency_rating",
                   "true_shooting_percentage", "three_point_attempt_rate", "free_throw_attempt_rate",
                   "offensive_rebound_percentage", "defensive_rebound_percentage", "total_rebound_percentage",
                   "assist_percentage", "steal_percentage", "block_percentage", "turnover_percentage",
                   "usage_percentage", "offensive_win_shares", "defensive_win_shares", "win_shares",
                   "win_shares_per_48_minutes", "offensive_box_plus_minus", "defensive_box_plus_minus",
                   "box_plus_minus", "value_over_replacement_player"]

stats_map = {}
adv_season_totals_2019 = client.players_advanced_season_totals(2019)
for p in adv_season_totals_2019:
  cur_vec = [p[col] for col in feature_columns]
  former = stats_map.get(p["slug"])
  if former is None:
    # First row seen for this player: take it as-is.
    stats_map[p["slug"]] = cur_vec
    continue

  # The slug was already seen (presumably a player with rows for several teams -- confirm).
  # Games and minutes are summed; age comes from the latest row; every other stat becomes
  # the games-played-weighted mean of the accumulated entry and the new row.
  cur_games, former_games = cur_vec[1], former[1]
  total_games = float(cur_games + former_games)
  merged = [p["age"], cur_games + former_games, cur_vec[2] + former[2]]
  merged.extend((cur_games * cur_vec[c] + former_games * former[c]) / total_games
                for c in range(3, len(cur_vec)))
  stats_map[p["slug"]] = merged

# One row per player slug, columns named from the single feature_columns list.
featX = pd.DataFrame(data=list(stats_map.values()), index=list(stats_map.keys()),
                     columns=feature_columns)
featX

Unnamed: 0,age,games_played,minutes_played,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
abrinal01,25,31,588,6.300000,0.507000,0.809000,0.083000,0.900000,7.800000,4.200000,4.300000,1.300000,0.900000,7.900000,12.200000,0.100000,0.600000,0.600000,0.053000,-3.700000,0.400000,-3.300000,-0.2
acyqu01,28,10,123,2.900000,0.379000,0.833000,0.556000,2.700000,20.100000,11.300000,8.200000,0.400000,2.700000,15.200000,9.200000,-0.100000,0.000000,-0.100000,-0.022000,-7.600000,-0.500000,-8.100000,-0.2
adamsja01,22,34,428,7.600000,0.474000,0.673000,0.082000,2.600000,12.300000,7.400000,19.800000,1.500000,1.000000,19.700000,13.500000,-0.100000,0.200000,0.100000,0.011000,-3.800000,-0.500000,-4.300000,-0.2
adamsst01,25,80,2669,18.500000,0.591000,0.002000,0.361000,14.700000,14.800000,14.700000,6.600000,2.000000,2.400000,12.600000,16.400000,5.100000,4.000000,9.100000,0.163000,0.700000,0.400000,1.100000,2.1
adebaba01,21,82,1913,17.900000,0.623000,0.031000,0.465000,9.200000,24.000000,16.600000,14.200000,1.800000,3.000000,17.100000,15.800000,3.400000,3.400000,6.800000,0.171000,-0.400000,2.200000,1.800000,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
youngtr01,20,81,2503,17.000000,0.539000,0.384000,0.330000,2.600000,10.200000,6.300000,40.500000,1.300000,0.500000,17.600000,28.400000,2.500000,0.700000,3.300000,0.062000,2.300000,-2.800000,-0.600000,0.9
zelleco01,26,49,1243,17.200000,0.611000,0.064000,0.409000,9.400000,19.700000,14.500000,12.100000,1.500000,2.900000,13.200000,16.300000,2.600000,1.200000,3.900000,0.150000,-0.600000,0.700000,0.100000,0.7
zellety01,29,6,93,13.433333,0.426667,0.166667,0.428667,14.300000,21.366667,17.766667,7.666667,0.400000,2.333333,6.666667,17.000000,0.133333,0.066667,0.200000,0.123000,-4.433333,-1.800000,-6.266667,0.0
zizican01,22,59,1082,16.200000,0.590000,0.000000,0.399000,10.600000,22.800000,16.400000,7.700000,0.600000,1.800000,13.600000,18.200000,1.700000,0.300000,2.000000,0.087000,-1.100000,-2.100000,-3.200000,-0.3


Let's drop any players that don't appear in both datasets, and then make a train/test split.

In [None]:
# Keep only the players present in BOTH frames and put the two frames in the same row
# order. train_test_split pairs the rows of X and y by position, not by index label, so
# merely dropping the unmatched slugs from each frame is not enough: if the two seasons
# list players in different orders, features would be paired with the wrong targets.
shared_slugs = featX.index.intersection(target_df.index)
featX = featX.loc[shared_slugs]
target_df = target_df.loc[shared_slugs]

# Guard against silent misalignment before the frames reach the model.
assert featX.index.equals(target_df.index), "features and target are not row-aligned"

In [None]:
# Sanity check: both frames should report the same number of rows after filtering.
for shape_label, frame in (("feature set shape: ", featX), ("target df shape: ", target_df)):
  print(shape_label, frame.shape)

feature set shape:  (400, 23)
target df shape:  (400, 1)


Perfect. Now all that's left is the split. We'll just use sklearn to make a straight-up split. Honestly, an alphabetical split could work here because it's relatively arbitrary, but we'll go with a built-in random method to be sure.

In [None]:
from sklearn import model_selection, linear_model, metrics

# Fixed seed so the split -- and therefore the coefficients and scores reported in the
# cells below -- is identical on every Restart & Run All.
SPLIT_SEED = 42
xTrain, xTest, yTrain, yTest = model_selection.train_test_split(
    featX, target_df, random_state=SPLIT_SEED)
yTest

Unnamed: 0,TFP
bacondw01,139
dellama01,207
jordade01,992
cookqu01,125
baynear01,442
...,...
powelno01,625
chrisma01,808
brogdma01,857
thomptr01,1011


In [None]:
# Fit a linear regression on the training split (fit() returns the estimator itself),
# then print the learned coefficients followed by the column names they line up with.
model = linear_model.LinearRegression().fit(xTrain, yTrain)
print(model.coef_)
print(featX.columns)

[[-1.26317790e+01 -3.50973599e+00  2.14411391e-01  4.18025398e+01
  -9.07641990e+02 -1.21830776e+01  1.42968670e+02 -1.01581650e+02
  -8.34442644e+01  1.68143541e+02 -8.93513042e-01 -2.29352298e+01
   9.92220033e+00  5.39566618e+00  3.87460684e+00  3.73919295e+02
   4.31541772e+02 -3.43256114e+02  7.95702837e+02 -9.59462728e+01
  -9.61976744e+01  6.96062372e+01 -2.79716700e+01]]
Index(['age', 'games_played', 'minutes_played', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player'],
      dt

In [None]:
# Score the held-out split. The first number is the mean ABSOLUTE error (same units as
# TFP), so it is labelled MAE; it was previously printed under the label "MSE".
yHat = model.predict(xTest)
print("MAE: ", metrics.mean_absolute_error(yTest, yHat))
print("R^2: ", metrics.r2_score(yTest, yHat))

MSE:  218.98734825541237
R^2:  0.5888033503078975


Now let's generalize the feature set creation into a method so that we can build a feature set for prediction.

In [None]:
def generate_feature_set(year):
  """Build a per-player advanced-stats feature frame for one season.

  Rows are fetched with ``client.players_advanced_season_totals(year)``. When a slug
  occurs in more than one row (presumably a player with rows for several teams --
  confirm against the scraper), the rows are folded into one entry: games and minutes
  are summed, age is taken from the latest row, and every other stat is the
  games-played-weighted mean of the rows.

  Args:
    year: passed straight to ``client.players_advanced_season_totals``.

  Returns:
    A ``pd.DataFrame`` indexed by player slug with the 23 feature columns listed in
    ``feature_columns``. If the client returns no rows, the frame is empty but still
    has those columns.
  """
  # Column order defines the layout of each feature vector. Positions 0-2 (age, games,
  # minutes) are merged specially; positions 3+ are averaged.
  feature_columns = ["age", "games_played", "minutes_played", "player_efficiency_rating",
                     "true_shooting_percentage", "three_point_attempt_rate", "free_throw_attempt_rate",
                     "offensive_rebound_percentage", "defensive_rebound_percentage", "total_rebound_percentage",
                     "assist_percentage", "steal_percentage", "block_percentage", "turnover_percentage",
                     "usage_percentage", "offensive_win_shares", "defensive_win_shares", "win_shares",
                     "win_shares_per_48_minutes", "offensive_box_plus_minus", "defensive_box_plus_minus",
                     "box_plus_minus", "value_over_replacement_player"]

  stats_map = {}
  for p in client.players_advanced_season_totals(year):
    cur_vec = [p[col] for col in feature_columns]
    slug = p["slug"]
    former = stats_map.get(slug)
    if former is None:
      # First row seen for this player: take it as-is.
      stats_map[slug] = cur_vec
      continue

    # Fold the new row into the accumulated entry, weighting each side by its games.
    cur_games, former_games = cur_vec[1], former[1]
    total_games = float(cur_games + former_games)
    merged = [p["age"], cur_games + former_games, cur_vec[2] + former[2]]
    merged.extend((cur_games * cur_vec[c] + former_games * former[c]) / total_games
                  for c in range(3, len(cur_vec)))
    stats_map[slug] = merged

  return pd.DataFrame(data=list(stats_map.values()), index=list(stats_map.keys()),
                      columns=feature_columns)

We'll run this on the year 2020 to generate our prediction feature set for the season ending in 2021.

In [None]:
# Build the feature frame for 2020 and score every player with the fitted model; the
# predictions keep the player slug as their index.
featX_2020 = generate_feature_set(2020)
outlook_values = model.predict(featX_2020)
pred_21 = pd.DataFrame(data=outlook_values, index=featX_2020.index)
t = [pred_21.iloc[row_pos] for row_pos in range(len(pred_21.index))]
# NOTE(review): the legacy ".xls" target depends on which Excel writer engines the
# installed pandas supports -- confirm this still works, or consider ".xlsx".
pred_21.to_excel("fantasy_outlooks_2021.xls")