In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

pd.options.mode.chained_assignment = None

In [2]:
season_2014_15 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2014-15_season.csv')
season_2014_15['season'] = '2014-15'
season_2015_16 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2015-16_season.csv')
season_2015_16['season'] = '2015-16'
season_2016_17 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2016-17_season.csv')
season_2016_17['season'] = '2016-17'
season_2017_18 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2017-18_season.csv')
season_2017_18['season'] = '2017-18'
season_2018_19 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2018-19_season.csv')
season_2018_19['season'] = '2018-19'
season_2019_20 = pd.read_csv('https://raw.githubusercontent.com/andrewkoo/aml_data/master/2019-20_season.csv')
season_2019_20['season'] = '2019-20'

data = pd.concat([season_2014_15, season_2015_16])
data = pd.concat([data, season_2016_17])
data = pd.concat([data, season_2017_18])
data = pd.concat([data, season_2018_19])
data = pd.concat([data, season_2019_20])

In [3]:
train_data = data[data['season'] != '2019-20']
test_data = data[data['season'] == '2019-20']

In [4]:
def playoffs(x):
    tmp = x.split(" ")
    return tmp[1] if len(tmp) > 1 else ''

# integer representation of 'Date' column
def epoch_date(dt):
    pattern = '%Y-%m-%d'
    return int(time.mktime(time.strptime(dt, pattern)))

def draftkings_value(row):
    dk_value = 0
    dk_value += row['PTS']
    dk_value += row['3P'] * 0.5
    dk_value += row['TRB'] * 1.25
    dk_value += row['AST'] * 1.5
    dk_value += row['STL'] * 2
    dk_value += row['BLK'] * 2
    dk_value += row['TOV'] * -0.5

    # double double / triple double
    categories = ['PTS', 'TRB', 'AST', 'STL', 'BLK']
    check = (row[categories].values >= 10).sum()
    if check >= 3: 
        dk_value += 3
    elif check >= 2: 
        dk_value += 1.5
    return dk_value

def clean_data(df): 
    # split out 'Player' column to 'Name' and 'Player Key'
    df.loc[:,'Name'] = df['Player'].apply(lambda x: x.split('\\')[0])
    df.loc[:,'Player Key'] = df['Player'].apply(lambda x: x.split('\\')[1])
    df.loc[:,'Date'] = df['Date'].apply(lambda x: x.split(' ')[0])
    df.loc[:,'epoch_date'] = df['Date'].apply(epoch_date) 
    df.loc[:,'dk_value'] = df.apply(draftkings_value, axis=1)
    df.loc[:,'Playoffs'] = df['Date'].apply(playoffs) 
    df = df[df['Playoffs'] != '*']
    df = df[df['MP'] != 0]
    df = df.sort_values('epoch_date')

    df.loc[:,'HomeAway'] = df['Unnamed: 5']
    df.loc[:,'Outcome'] = df['Unnamed: 7']
    df.drop('Unnamed: 5', axis=1, inplace=True)
    df.drop('Unnamed: 7', axis=1, inplace=True)

    return df

# calculate dk value on pred columns
def draftkings_value_pred(row):
    dk_value = 0
    dk_value += row['PTS_pred']
    dk_value += row['3P_pred'] * 0.5
    dk_value += row['TRB_pred'] * 1.25
    dk_value += row['AST_pred'] * 1.5
    dk_value += row['STL_pred'] * 2
    dk_value += row['BLK_pred'] * 2
    dk_value += row['TOV_pred'] * -0.5

    # double double / triple double
    categories = ['PTS_pred', 'TRB_pred', 'AST_pred', 'STL_pred', 'BLK_pred']
    check = (row[categories].values >= 10).sum()
    if check >= 3: 
        dk_value += 3
    elif check >= 2: 
        dk_value += 1.5
    return dk_value

In [5]:
train_data = clean_data(train_data)
test_data = clean_data(test_data)

In [6]:
train_data.head()

Unnamed: 0,Rk,Player,Age,Pos,Tm,Opp,Date,GS,MP,FG,...,GmSc,BPM,season,Name,Player Key,epoch_date,dk_value,Playoffs,HomeAway,Outcome
27616,27617,Dirk Nowitzki\nowitdi01,36-131,F,DAL,SAS,2014-10-28,1,32,7,...,15.5,5.3,2014-15,Dirk Nowitzki,nowitdi01,1414468800,30.0,,@,L
27595,27596,Manu Ginóbili\ginobma01,37-092,G,SAS,DAL,2014-10-28,0,28,6,...,15.1,6.0,2014-15,Manu Ginóbili,ginobma01,1414468800,35.0,,,W
27594,27595,Jimmer Fredette\fredeji01,25-245,G,NOP,ORL,2014-10-28,0,18,0,...,-1.0,-13.6,2014-15,Jimmer Fredette,fredeji01,1414468800,7.0,,,W
27593,27594,Evan Fournier\fournev01,21-364,G-F,ORL,NOP,2014-10-28,1,32,3,...,2.4,-7.9,2014-15,Evan Fournier,fournev01,1414468800,16.75,,@,L
27592,27593,Tyreke Evans\evansty01,25-039,G-F,NOP,ORL,2014-10-28,1,35,5,...,10.0,-2.3,2014-15,Tyreke Evans,evansty01,1414468800,32.25,,,W


In [7]:
sql_columns = []
original_cols = ['MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA',
                  '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
                  'TOV', 'PF', 'PTS', 'GmSc', 'BPM']
windows = ['3', '5', '10']
for col in original_cols: 
    for w in windows: 
        sql_columns.append(col + '_' + w)

In [8]:
# for col_name in original_cols:# iterate through original column name
for window in [3, 5, 10]:
    # set rolling avg column
    # new_column_name = col_name + '_' + str(window)
    tmp = train_data[['Player Key'] + original_cols].astype({
              'Player Key' : 'object',
              'MP' : 'float64', 
              'FG' : 'float64', 
              'FGA' : 'float64', 
              'FG%' : 'float64', 
              '2P' : 'float64', 
              '2PA' : 'float64', 
              '2P%' : 'float64', 
              '3P' : 'float64', 
              '3PA' : 'float64',
              '3P%' : 'float64', 
              'FT' : 'float64', 
              'FTA' : 'float64', 
              'FT%' : 'float64', 
              'ORB' : 'float64', 
              'DRB' : 'float64', 
              'TRB' : 'float64', 
              'AST' : 'float64', 
              'STL' : 'float64', 
              'BLK' : 'float64',
              'TOV' : 'float64', 
              'PF' : 'float64', 
              'PTS' : 'float64', 
              'GmSc' : 'float64', 
              'BPM' : 'float64'
          })

    mat = tmp.groupby('Player Key').transform(lambda x: x.rolling(window).mean().shift().bfill())
    mat = mat.add_suffix('_' + str(window))

    train_data = pd.concat((train_data, mat), axis=1)

# for col_name in original_cols:# iterate through original column name
for window in [20, 82]:
  # set rolling avg column
  # new_column_name = col_name + '_' + str(window)
    tmp = train_data[['Player Key', 'dk_value']].astype({
              'Player Key' : 'object',
              'dk_value' : 'float64'
          })

    mat = tmp.groupby('Player Key').transform(lambda x: x.rolling(window).mean().shift().bfill())
    mat = mat.add_suffix('_' + str(window))

    train_data = pd.concat((train_data, mat), axis=1)

In [9]:
# for col_name in original_cols:# iterate through original column name
for window in [3, 5, 10]:
  # set rolling avg column
  # new_column_name = col_name + '_' + str(window)
  tmp = test_data[['Player Key'] + original_cols].astype({
              'Player Key' : 'object',
              'MP' : 'float64', 
              'FG' : 'float64', 
              'FGA' : 'float64', 
              'FG%' : 'float64', 
              '2P' : 'float64', 
              '2PA' : 'float64', 
              '2P%' : 'float64', 
              '3P' : 'float64', 
              '3PA' : 'float64',
              '3P%' : 'float64', 
              'FT' : 'float64', 
              'FTA' : 'float64', 
              'FT%' : 'float64', 
              'ORB' : 'float64', 
              'DRB' : 'float64', 
              'TRB' : 'float64', 
              'AST' : 'float64', 
              'STL' : 'float64', 
              'BLK' : 'float64',
              'TOV' : 'float64', 
              'PF' : 'float64', 
              'PTS' : 'float64', 
              'GmSc' : 'float64', 
              'BPM' : 'float64'
          })

  mat = tmp.groupby('Player Key').transform(lambda x: x.rolling(window).mean().shift().bfill())
  mat = mat.add_suffix('_' + str(window))

  test_data = pd.concat((test_data, mat), axis=1)

# for col_name in original_cols:# iterate through original column name
for window in [20, 82]:
  # set rolling avg column
  # new_column_name = col_name + '_' + str(window)
  tmp = test_data[['Player Key', 'dk_value']].astype({
              'Player Key' : 'object',
              'dk_value' : 'float64'
          })

  mat = tmp.groupby('Player Key').transform(lambda x: x.rolling(window).mean().shift().bfill())
  mat = mat.add_suffix('_' + str(window))

  test_data = pd.concat((test_data, mat), axis=1)

In [10]:
train_data['HomeAway'] = train_data['HomeAway'].fillna(' ')
test_data['HomeAway'] = test_data['HomeAway'].fillna(' ')

In [11]:
# Initialize one hot encoder for non-ordinal categorical data
onehot_columns = ['Tm', 'Opp', 'HomeAway', 'Pos'] # homeaway: change to label-encoding
onehot_enc = OneHotEncoder(handle_unknown='ignore')

# Fit transform to train
onehot_train_data = onehot_enc.fit_transform(train_data[onehot_columns]).toarray()
encoded_train_data = np.concatenate((train_data.values, onehot_train_data), axis=1)

# Transform to test
onehot_test_data = onehot_enc.transform(test_data[onehot_columns]).toarray()
encoded_test_data = np.concatenate((test_data.values, onehot_test_data), axis=1)

# Get column names
encoded_onehot_columns = []
for i, cat in enumerate(onehot_enc.categories_): 
  for val in cat:
    encoded_onehot_columns.append(f'{onehot_columns[i]}_' + val)

# Update data frames
train_data = pd.DataFrame(encoded_train_data, columns = train_data.columns.tolist() + encoded_onehot_columns)
test_data = pd.DataFrame(encoded_test_data, columns = test_data.columns.tolist() + encoded_onehot_columns)

In [12]:
primary_keys = ['Player Key', 'Date']
feature_list = sql_columns + encoded_onehot_columns
# labels = ['PTS', 'TRB', 'AST', 'STL', 'TOV', ]
# cols_to_drop = ['dk_value_82', 'dk_value_20', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA',
#                 '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
#                 'TOV', 'PF', 'PTS', 'GmSc', 'BPM', 'dk_value']

In [13]:
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [14]:
pts_y_train = train_data['PTS']
pts_X_train = train_data[feature_list]
pts_rf = RandomForestRegressor(random_state = 0)
pts_rf.fit(pts_X_train, pts_y_train)

RandomForestRegressor(random_state=0)

In [None]:
trb_y_train = train_data['TRB']
trb_X_train = train_data[feature_list]
trb_rf = RandomForestRegressor(random_state = 0)
trb_rf.fit(trb_X_train, trb_y_train)

In [None]:
ast_y_train = train_data['AST']
ast_X_train = train_data[feature_list]
ast_rf = RandomForestRegressor(random_state = 0)
ast_rf.fit(ast_X_train, ast_y_train)

In [None]:
stl_y_train = train_data['STL']
stl_X_train = train_data[feature_list]
stl_rf = RandomForestRegressor(random_state = 0)
stl_rf.fit(stl_X_train, stl_y_train)

In [None]:
tov_y_train = train_data['TOV']
tov_X_train = train_data[feature_list]
tov_rf = RandomForestRegressor(random_state = 0)
tov_rf.fit(tov_X_train, tov_y_train)

In [None]:
three_p_y_train = train_data['3P']
three_p_X_train = train_data[feature_list]
three_p_rf = RandomForestRegressor(random_state = 0)
three_p_rf.fit(three_p_X_train, three_p_y_train)

In [None]:
blk_y_train = train_data['BLK']
blk_X_train = train_data[feature_list]
blk_rf = RandomForestRegressor(random_state = 0)
blk_rf.fit(blk_X_train, blk_y_train)

Tests

In [None]:
pts_X_test = test_data[feature_list]
pts_y_test = test_data['PTS']
pts_rf.score(pts_X_test, pts_y_test)

In [None]:
ast_X_test = test_data[feature_list]
ast_y_test = test_data['AST']
ast_rf.score(ast_X_test, ast_y_test)

In [None]:
stl_X_test = test_data[feature_list]
stl_y_test = test_data['STL']
stl_rf.score(stl_X_test, stl_y_test)

In [None]:
trb_X_test = test_data[feature_list]
trb_y_test = test_data['TRB']
trb_rf.score(trb_X_test, trb_y_test)

In [None]:
tov_X_test = test_data[feature_list]
tov_y_test = test_data['TOV']
tov_rf.score(tov_X_test, tov_y_test)

In [None]:
three_p_X_test = test_data[feature_list]
three_p_y_test = test_data['3P']
three_p_rf.score(three_p_X_test, three_p_y_test)

In [None]:
blk_X_test = test_data[feature_list]
blk_y_test = test_data['BLK']
blk_rf.score(blk_X_test, blk_y_test)

Benchmark

In [None]:
all_test_data = test_data.copy()

In [None]:
test_data['PTS_pred'] = pts_rf.predict(pts_X_test)
test_data['TRB_pred'] = trb_rf.predict(trb_X_test)
test_data['STL_pred'] = stl_rf.predict(stl_X_test)
test_data['AST_pred'] = ast_rf.predict(ast_X_test)
test_data['TOV_pred'] = tov_rf.predict(tov_X_test)
test_data['BLK_pred'] = blk_rf.predict(blk_X_test)
test_data['3P_pred'] = three_p_rf.predict(three_p_X_test)

In [None]:
test_data['dk_value_pred'] = test_data.apply(draftkings_value_pred, axis=1)

In [None]:
# compare to FPPG
test_data['l2_error'] = (test_data['dk_value_pred'] - test_data['dk_value'])**2
avg_pred_error = test_data['l2_error'].mean()
sum_pred_error = test_data['l2_error'].sum()

test_data['l2_error'] = (test_data['dk_value_20'] - test_data['dk_value'])**2
avg_benchmark_error = test_data['l2_error'].mean()
sum_benchmark_error = test_data['l2_error'].sum()

print('Average RF Prediction L2 Error: ', avg_pred_error)
print('Average Benchmark Prediction L2 Error: ', avg_benchmark_error)
print('Average Improvement: ', avg_benchmark_error - avg_pred_error)
print('')
print('Total RF Prediction L2 Error: ', sum_pred_error)
print('Total Benchmark Prediction L2 Error: ', sum_benchmark_error)
print('Total Improvement: ', sum_benchmark_error - sum_pred_error)

In [None]:
# all_model ( features ) => (pts, rbds, asts, stls, tov) for a player fo ra game
all_X_train = train_data[feature_list]
all_y_train = train_data[['PTS','TRB','AST','STL','TOV','BLK','3P']]

all_rf = RandomForestRegressor(random_state = 0)
all_rf.fit(all_X_train, all_y_train)

# define X_test, y_test
all_X_test = test_data[feature_list]
all_y_test = test_data[['PTS','TRB','AST','STL','TOV','BLK','3P']]

all_rf.score(all_X_test, all_y_test)



0.5012911380806248

In [None]:
all_preds = all_rf.predict(all_X_test)
all_preds_df = pd.DataFrame(all_preds, columns=['PTS_pred','TRB_pred','AST_pred','STL_pred','TOV_pred','BLK_pred','3P_pred'])
all_test_data = pd.concat((all_test_data, all_preds_df), axis=1)

In [None]:
all_test_data.loc[:,'dk_value_pred'] = all_test_data.apply(draftkings_value_pred, axis=1)

In [None]:
# compare to FPPG
all_test_data['l2_error'] = (all_test_data['dk_value_pred'] - all_test_data['dk_value'])**2
avg_pred_error = all_test_data['l2_error'].mean()
sum_pred_error = all_test_data['l2_error'].sum()

all_test_data['l2_error'] = (all_test_data['dk_value_20'] - all_test_data['dk_value'])**2
avg_benchmark_error = all_test_data['l2_error'].mean()
sum_benchmark_error = all_test_data['l2_error'].sum()

print('Average RF Prediction L2 Error: ', avg_pred_error)
print('Average Benchmark Prediction L2 Error: ', avg_benchmark_error)
print('Average Improvement: ', avg_benchmark_error - avg_pred_error)
print('')
print('Total RF Prediction L2 Error: ', sum_pred_error)
print('Total Benchmark Prediction L2 Error: ', sum_benchmark_error)
print('Total Improvement: ', sum_benchmark_error - sum_pred_error)

Average RF Prediction L2 Error:  94.60487042183743
Average Benchmark Prediction L2 Error:  100.93127979726897
Average Improvement:  6.326409375431538

Total RF Prediction L2 Error:  2269097.8170677796
Total Benchmark Prediction L2 Error:  2420836.7459375
Total Improvement:  151738.92886972055
