In [1]:
import pandas as pd
import numpy as np
from pulp import *
dat = '11-05'

In [2]:
file1 = ('https://github.com/ZTFisme/Data-Sets/blob/main/NBA_Advanced_Player_Data.xlsx?raw=true')
file2 = ('https://github.com/ZTFisme/Data-Sets/blob/main/NBA_Traditional_Player_Data.xlsx?raw=true')
file3 = ('https://github.com/ZTFisme/Data-Sets/blob/main/Opponent_List.xlsx?raw=true')
file4 = ('https://github.com/ZTFisme/Data-Sets/blob/main/Yahoo_DFS_Cost.xlsx?raw=true')

In [3]:
adv = pd.read_excel(file1, sheet_name = 0, header=0)
trad = pd.read_excel(file2, sheet_name = 0, header = 0)
opponent = pd.read_excel(file3, sheet_name = 0, header=0)
opponent = opponent[['TEAM',dat]].rename(columns={dat:'OPP'})
cost = pd.read_excel(file4, sheet_name = dat, header=0)
cost = cost[['PLAYER', 'POSITION', 'COST']]
positions = pd.read_excel(file3, sheet_name = 'Positions', header = 0)

In [4]:
import nba_api
from nba_api.stats.endpoints import playergamelogs
from datetime import timedelta
from datetime import datetime
from dateutil import parser

max_date = parser.parse(max(trad['GAME_DATE']))
date_from = (max_date + timedelta(days=1)).date().strftime('%m-%d-%y')

trad_logs = pd.DataFrame() 
advanced_logs = pd.DataFrame()
    
tradlog = playergamelogs.PlayerGameLogs( season_nullable= '2021-22', measure_type_player_game_logs_nullable = nba_api.stats.library.parameters.MeasureTypePlayerGameLogs.base)
tlogs = tradlog.get_data_frames()[0]
trad_logs = trad_logs.append(tlogs)

advlogs = playergamelogs.PlayerGameLogs(season_nullable= '2021-22', measure_type_player_game_logs_nullable = nba_api.stats.library.parameters.MeasureTypePlayerGameLogs.advanced)
alogs = advlogs.get_data_frames()[0]
advanced_logs = advanced_logs.append(alogs)

In [5]:
advanced_logs = advanced_logs[list(adv.columns)]
trad_logs = trad_logs[list(trad.columns)]
trad = trad.append(trad_logs).drop_duplicates().reset_index(drop=True)
adv = adv.append(advanced_logs).drop_duplicates().reset_index(drop=True)

In [6]:
def clean_data(trad, adv, positions):
    HA_lst = []
    opp_lst = []
    for i in range(len(trad)):
        game = trad.iloc[i]['MATCHUP']
        if '@' in game:
            HA_lst.append("Away")
            team, opp = game.split(' @ ')
            opp_lst.append(opp)
        else:
            HA_lst.append("Home")
            team, opp = game.split(' vs. ')
            opp_lst.append(opp)
    trad['H_A'] = HA_lst
    trad['OPP'] = opp_lst
    trad = trad.merge(positions, left_on='PLAYER_NAME', right_on='PLAYER')
    trad['OPP_POS'] = trad['OPP'] + '-' + trad['POSITION']
    ha_lst= []
    for i in range(len(trad)):
        if trad.iloc[i]['H_A'] == 'Home':
            ha_lst.append(1)
        else:
            ha_lst.append(0)
    trad['HOME_AWAY'] = ha_lst
    data = trad.merge(adv, how='right', on=['GAME_ID','TEAM_ID','PLAYER_ID'], suffixes=('', '_drop'))

    data.drop([col for col in data.columns if 'drop' in col], axis=1, inplace=True)
    data = data.dropna().drop_duplicates().reset_index(drop=True).rename(columns = {'NBA_FANTASY_PTS':'FPTS'})
    data['NEW_STAT'] = data['MIN'] * data['USG_PCT'] * data['PACE'] / 100
    data = data[data['MIN']!=0].reset_index(drop=True).rename(columns = {'TEAM_ABBREVIATION':'TEAM'})
    data['SMA2'] = data.groupby('PLAYER')['FPTS'].transform(lambda x: x.rolling(2,1).mean())
    data['SMA2_STAT'] = data.groupby('PLAYER')['NEW_STAT'].transform(lambda x: x.rolling(2,1).mean())
    data['FPTS/MIN'] = data['FPTS'] / data ['MIN']

    player_stats = pd.DataFrame(data.groupby('PLAYER')['FPTS','FPTS/MIN'].mean())
    last = pd.DataFrame(data.set_index('PLAYER').groupby(level='PLAYER').agg(['last']).stack())
    last = last.reset_index()[['PLAYER','POSITION','TEAM','SMA2', 'SMA2_STAT']]
    l = last.merge(player_stats, how='inner', left_on = 'PLAYER', right_on = 'PLAYER').rename(columns={'FPTS':'AVG_FPTS'})
    l_avg = l[['PLAYER','AVG_FPTS']]
    data = data.merge(l_avg)
    data['PTS_DIFF'] = data['FPTS'] - data['AVG_FPTS']

    opp_pos = data.groupby(['OPP','POSITION','OPP_POS'])['PTS_DIFF','FPTS', 'FPTS/MIN'].mean().reset_index()
    opp_c = opp_pos[opp_pos['POSITION']=='C'].reset_index(drop=True)
    opp_pf = opp_pos[opp_pos['POSITION']=='PF'].reset_index(drop=True)
    opp_sf = opp_pos[opp_pos['POSITION']=='SF'].reset_index(drop=True)
    opp_sg = opp_pos[opp_pos['POSITION']=='SG'].reset_index(drop=True)
    opp_pg = opp_pos[opp_pos['POSITION']=='PG'].reset_index(drop=True)

    def opp_process(opp):
        fpts_mu = opp['FPTS'].mean()
        diff_mu = opp['PTS_DIFF'].mean()
        fpts_min_mu = opp['FPTS/MIN'].mean()

        opp['FPTS_AGST_AVG'] = ''
        opp['DIFF_AGST_AVG'] = ''
        opp['FPTS_MIN_AGST_AVG'] = ''

        for i in range(len(opp)):
            fpt = opp.iloc[i]['FPTS'] / fpts_mu
            diff = opp.iloc[i]['PTS_DIFF'] - diff_mu
            fpts_min = opp.iloc[i]['FPTS/MIN'] / fpts_min_mu
            opp.at[i, 'FPTS_AGST_AVG'] = fpt
            opp.at[i, 'DIFF_AGST_AVG'] = diff
            opp.at[i, 'FPTS_MIN_AGST_AVG'] = fpts_min
        return opp

    opp_pg = opp_process(opp_pg)
    opp_sg = opp_process(opp_sg)
    opp_sf = opp_process(opp_sf)
    opp_pf = opp_process(opp_pf)
    opp_c = opp_process(opp_c)

    opps = [opp_pg, opp_sg, opp_sf, opp_pf, opp_c]
    opponents = pd.concat(opps).reset_index(drop=True)
    opponents = opponents[['OPP_POS','DIFF_AGST_AVG']].rename(columns = {'DIFF_AGST_AVG':'OPP_DIFF_AGST_AVG'})
    data = data.merge(opponents, on = 'OPP_POS')
    data = data.sort_values(by=['PLAYER_NAME','GAME_DATE'])
    return data

In [7]:
def update_model(data):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X = np.array(data[['SMA2',  'NEW_STAT']])
    y = np.array(data['FPTS'])

    #scaler = StandardScaler().fit(X)
    #X = scaler.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression().fit(X_train, y_train)
    r_sq = model.score(X_test, y_test)
    print('r squared value:', r_sq)
    print('intercept:', model.intercept_)
    #print('coefficients:', model.coef_)

    cdf = pd.DataFrame(model.coef_, ['SMA2',  'SMA2_STAT'], columns=['Coefficients'])
    print(cdf)
    sma_coef = model.coef_[0]
    new_stat_coef = model.coef_[1]
    return sma_coef, new_stat_coef

In [8]:
def prepare_predictions(data, opponent, cost, sma_coef, new_stat_coef):
    player_stats = pd.DataFrame(data.groupby('PLAYER')['FPTS','FPTS/MIN'].mean())
    last = pd.DataFrame(data.set_index('PLAYER').groupby(level='PLAYER').agg(['last']).stack())
    last = last.reset_index()[['PLAYER','POSITION','TEAM','SMA2', 'SMA2_STAT']]
    l = last.merge(player_stats, how='inner', left_on = 'PLAYER', right_on = 'PLAYER').rename(columns={'FPTS':'AVG_FPTS'})
    
    opp_pos = data.groupby(['OPP','POSITION','OPP_POS'])['PTS_DIFF','FPTS', 'FPTS/MIN'].mean().reset_index()
    opp_c = opp_pos[opp_pos['POSITION']=='C'].reset_index(drop=True)
    opp_pf = opp_pos[opp_pos['POSITION']=='PF'].reset_index(drop=True)
    opp_sf = opp_pos[opp_pos['POSITION']=='SF'].reset_index(drop=True)
    opp_sg = opp_pos[opp_pos['POSITION']=='SG'].reset_index(drop=True)
    opp_pg = opp_pos[opp_pos['POSITION']=='PG'].reset_index(drop=True)

    def opp_process(opp):
        fpts_mu = opp['FPTS'].mean()
        diff_mu = opp['PTS_DIFF'].mean()
        fpts_min_mu = opp['FPTS/MIN'].mean()

        opp['FPTS_AGST_AVG'] = ''
        opp['DIFF_AGST_AVG'] = ''
        opp['FPTS_MIN_AGST_AVG'] = ''

        for i in range(len(opp)):
            fpt = opp.iloc[i]['FPTS'] / fpts_mu
            diff = opp.iloc[i]['PTS_DIFF'] - diff_mu
            fpts_min = opp.iloc[i]['FPTS/MIN'] / fpts_min_mu
            opp.at[i, 'FPTS_AGST_AVG'] = fpt
            opp.at[i, 'DIFF_AGST_AVG'] = diff
            opp.at[i, 'FPTS_MIN_AGST_AVG'] = fpts_min
        return opp

    opp_pg = opp_process(opp_pg)
    opp_sg = opp_process(opp_sg)
    opp_sf = opp_process(opp_sf)
    opp_pf = opp_process(opp_pf)
    opp_c = opp_process(opp_c)

    opps = [opp_pg, opp_sg, opp_sf, opp_pf, opp_c]
    opponents = pd.concat(opps).reset_index(drop=True)
    opponents = opponents[['OPP_POS','DIFF_AGST_AVG']].rename(columns = {'DIFF_AGST_AVG':'OPP_DIFF_AGST_AVG'})
    
    oppo = l.merge(opponent, how='inner', left_on = 'TEAM', right_on = 'TEAM')
    oppo['OPP_POS'] = oppo['OPP'] + '-' + oppo['POSITION']
    pred = oppo.merge(opponents, how = 'inner', left_on='OPP_POS', right_on = 'OPP_POS')
    pred = pred.rename(columns={'OPP_x':'OPP','POSITION_x':'POSITION'})
    pred['PRED'] = (  (pred['SMA2']*sma_coef) + (pred['SMA2_STAT'] * new_stat_coef) + (pred['OPP_DIFF_AGST_AVG']))
    pred = pred.merge(cost, how = 'inner', left_on = 'PLAYER', right_on = 'PLAYER')
    pred = pred.drop(columns = ["POSITION_x"]).rename(columns = {"POSITION_y":"POSITION"})
    import re
    player_lst = []
    for i in range(len(pred)):
        player_lst.append(re.sub('-',' ',pred['PLAYER'][i]))
    pred['PLAYER'] = player_lst
    return pred

In [9]:
def optimize_predictions(pred):
    df = pred
    def pg_id(x):
        if x=='PG':
            return 1
        else:
            return 0
    def sg_id(x):
        if x=='SG':
            return 1
        else:
            return 0
    def sf_id(x):
        if x=='SF':
            return 1
        else:
            return 0
    def pf_id(x):
        if x=='PF':
            return 1
        else:
            return 0
    def c_id(x):
        if x=='C':
            return 1
        else:
            return 0

    df['PG'] = df['POSITION'].apply(pg_id)
    df['SG'] = df['POSITION'].apply(sg_id)
    df['SF'] = df['POSITION'].apply(sf_id)
    df['PF'] = df['POSITION'].apply(pf_id)
    df['C'] = df['POSITION'].apply(c_id)
    df['COST'] = [float(i) for i in df['COST']]

    #Clean data and convert to list- only look at rows with diet data and not constraint information
    df = df.where(df['COST'] > 0)
    df = df.dropna()
    df = df.values.tolist()

    players = [x[0] for x in df]
    cost = dict([(x[0], float(x[11])) for x in df])
    position = dict([(x[0], x[10]) for x in df])
    proj_pts = dict([(x[0], float(x[9])) for x in df])

    pg = dict([(x[0], float(x[12])) for x in df])
    sg = dict([(x[0], float(x[13])) for x in df])
    sf = dict([(x[0], float(x[14])) for x in df])
    pf = dict([(x[0], float(x[15])) for x in df])
    c = dict([(x[0], float(x[16])) for x in df])

    player_vars = LpVariable.dicts("Player", players, cat = "Integer", lowBound= 0, upBound = 1)

    # create the optimization problem framework - maximize points while meeting requirements of maximum cost
    prob = LpProblem("NBA_Optimize", LpMaximize)

    #Define objective function for projected points
    obj_func = lpSum([proj_pts[i] * player_vars[i] for i in players])
    prob += obj_func

    #Number of player choices constraint
    constraint_2 = lpSum([player_vars[i] for i in player_vars]) == 8
    prob += constraint_2 

    #Cost Constraint
    constraint_3 = lpSum([cost[f] * player_vars[f] for f in player_vars]) <= 200.0
    prob += constraint_3

    #Position Constraints
    #PG
    prob += lpSum([pg[f] * player_vars[f] for f in player_vars]) >= 1
    prob += lpSum([pg[f] * player_vars[f] for f in player_vars]) <= 3

    #SG
    prob += lpSum([sg[f] * player_vars[f] for f in player_vars]) >= 1
    prob += lpSum([sg[f] * player_vars[f] for f in player_vars]) <= 3

    #G
    prob += lpSum([(pg[f] * player_vars[f]) + (sg[f] * player_vars[f]) for f in player_vars]) >= 3
    prob += lpSum([(pg[f] * player_vars[f]) + (sg[f] * player_vars[f]) for f in player_vars]) <= 4

    #SF
    prob += lpSum([sf[f] * player_vars[f] for f in player_vars]) >= 1
    prob += lpSum([sf[f] * player_vars[f] for f in player_vars]) <= 3

    #PF
    prob += lpSum([pf[f] * player_vars[f] for f in player_vars]) >= 1
    prob += lpSum([pf[f] * player_vars[f] for f in player_vars]) <= 3

    #F
    prob += lpSum([(sf[f] * player_vars[f]) + (pf[f] * player_vars[f]) for f in player_vars]) >= 3
    prob += lpSum([(sf[f] * player_vars[f]) + (pf[f] * player_vars[f]) for f in player_vars]) <= 4

    #C
    prob += lpSum([c[f] * player_vars[f] for f in player_vars]) >= 1
    prob += lpSum([c[f] * player_vars[f] for f in player_vars]) <= 2

    #Solve the Objective Function
    prob.solve()
    name_lst = []
    import re
    for x in prob.variables():
        if x.varValue>0:
            nm = x.name
            name_lst.append(re.sub(r'[_]',' ',nm[7:]))
    lineup = pd.DataFrame(name_lst).rename(columns={0:'PLAYER'})
    lineup = lineup.merge(pred, how = 'inner', left_on = 'PLAYER', right_on = 'PLAYER')
    lineup = lineup[['PLAYER', 'POSITION', 'SMA2', 'SMA2_STAT', 'AVG_FPTS', 'COST', 'PRED']]
    print('The Projected Total Points are: ', sum(lineup['PRED']))
    return lineup


In [10]:
data = clean_data(trad, adv, positions)
sma_coef, new_stat_coef = update_model(data)
pred = prepare_predictions(data, opponent, cost, sma_coef, new_stat_coef)
lineup = optimize_predictions(pred)
lineup

r squared value: 0.8376967785129512
intercept: -0.5227502515945588
           Coefficients
SMA2           0.724869
SMA2_STAT      1.418664
The Projected Total Points are:  350.0650378054799


Unnamed: 0,PLAYER,POSITION,SMA2,SMA2_STAT,AVG_FPTS,COST,PRED
0,CJ McCollum,SG,49.55,10.909737,35.44264,32.0,51.4925
1,Devin Vassell,SF,32.5,5.561833,14.405479,12.0,31.7692
2,Evan Fournier,SG,40.9,8.416057,27.39,22.0,42.1414
3,Gordon Hayward,SF,39.85,9.880837,29.793889,24.0,42.937
4,Jarrett Allen,C,51.35,7.961613,29.732889,26.0,46.2402
5,Jordan Poole,SG,37.5,9.562853,17.911864,21.0,41.139
6,Julius Randle,PF,49.7,12.703731,40.323288,42.0,54.1418
7,Reggie Jackson,PG,37.5,8.877895,22.865608,21.0,40.2039


In [11]:
tab = pd.DataFrame(columns = ['Type','K-Value', 'RSFE', 'ME', 'MAD', 'MSD', 'TS'])

for k in range(2,11):
    forecast = data.groupby('PLAYER')['FPTS'].transform(lambda x: x.rolling(k,k-1).mean())
    E = (data['FPTS'] - forecast).fillna(0)
    MAD = (forecast - data['FPTS']).abs()
    MSD = (forecast - data['FPTS']).abs().pow(2)
    tab.at[k-1, 'Type'] = 'SMA'
    tab.at[k-1, 'K-Value'] = k
    tab.at[k-1, 'RSFE'] = sum(E)
    tab.at[k-1, 'ME'] = E.mean()
    tab.at[k-1, 'MAD'] = MAD.mean()
    tab.at[k-1, 'MSD'] = MSD.mean()
    tab.at[k-1, 'TS'] = sum(E) / MAD.mean()
    
    ema = (data.groupby(['PLAYER'])['FPTS'].transform(lambda x: x.ewm(span = k, min_periods = k-1).mean()))
    ema_E = (data['FPTS'] - ema).fillna(0)
    ema_MAD = (ema - data['FPTS']).abs()
    ema_MSD = (ema - data['FPTS']).abs().pow(2)
    tab.at[k+9, 'Type'] = 'EMA'
    tab.at[k+9, 'K-Value'] = k
    tab.at[k+9, 'RSFE'] = sum(ema_E)
    tab.at[k+9, 'ME'] = ema_E.mean()
    tab.at[k+9, 'MAD'] = ema_MAD.mean()
    tab.at[k+9, 'MSD'] = ema_MSD.mean()
    tab.at[k+9, 'TS'] = sum(ema_E) / ema_MAD.mean()
print ('\033[1m' + 'Moving Average Forecasts:\n')
tab.sort_values(by='MAD',ascending=True).reset_index(drop=True)

[1mMoving Average Forecasts:



Unnamed: 0,Type,K-Value,RSFE,ME,MAD,MSD,TS
0,EMA,2,542.716,0.00802383,2.83768,13.6629,191.253
1,EMA,3,977.896,0.0144578,4.0896,27.9638,239.118
2,EMA,4,1261.27,0.0186473,4.80338,38.3762,262.579
3,SMA,2,619.45,0.00915831,4.83085,40.1318,128.228
4,EMA,5,1181.28,0.0174647,5.27092,46.0564,224.112
5,EMA,6,1479.76,0.0218777,5.60733,51.9883,263.898
6,SMA,3,1192.7,0.0176336,5.78869,56.0158,206.04
7,EMA,7,1838.08,0.0271753,5.85846,56.6345,313.748
8,EMA,8,2150.9,0.0318002,6.05929,60.4776,354.975
9,EMA,9,2253.71,0.0333202,6.22188,63.6616,362.224


In [12]:
tab = pd.DataFrame(columns = ['Type','K-Value', 'RSFE', 'ME', 'MAD', 'MSD', 'TS'])

for k in range(2,11):
    forecast = data.groupby('PLAYER')['NEW_STAT'].transform(lambda x: x.rolling(k,k-1).mean())
    E = (data['NEW_STAT'] - forecast).fillna(0)
    MAD = (forecast - data['NEW_STAT']).abs()
    MSD = (forecast - data['NEW_STAT']).abs().pow(2)
    tab.at[k-1, 'Type'] = 'SMA'
    tab.at[k-1, 'K-Value'] = k
    tab.at[k-1, 'RSFE'] = sum(E)
    tab.at[k-1, 'ME'] = E.mean()
    tab.at[k-1, 'MAD'] = MAD.mean()
    tab.at[k-1, 'MSD'] = MSD.mean()
    tab.at[k-1, 'TS'] = sum(E) / MAD.mean()
    
    ema = (data.groupby(['PLAYER'])['NEW_STAT'].transform(lambda x: x.ewm(span = k, min_periods = k-1).mean()))
    ema_E = (data['NEW_STAT'] - ema).fillna(0)
    ema_MAD = (ema - data['NEW_STAT']).abs()
    ema_MSD = (ema - data['NEW_STAT']).abs().pow(2)
    tab.at[k+9, 'Type'] = 'EMA'
    tab.at[k+9, 'K-Value'] = k
    tab.at[k+9, 'RSFE'] = sum(ema_E)
    tab.at[k+9, 'ME'] = ema_E.mean()
    tab.at[k+9, 'MAD'] = ema_MAD.mean()
    tab.at[k+9, 'MSD'] = ema_MSD.mean()
    tab.at[k+9, 'TS'] = sum(ema_E) / ema_MAD.mean()
print ('\033[1m' + 'Moving Average Forecasts:\n')
tab.sort_values(by='MAD',ascending=True).reset_index(drop=True)

[1mMoving Average Forecasts:



Unnamed: 0,Type,K-Value,RSFE,ME,MAD,MSD,TS
0,EMA,2,87.6991,0.00129659,0.494291,0.42877,177.424
1,EMA,3,158.441,0.00234249,0.714579,0.88293,221.726
2,SMA,2,98.3518,0.00145409,0.837802,1.24707,117.393
3,EMA,4,220.171,0.00325514,0.841339,1.21757,261.692
4,EMA,5,201.739,0.00298264,0.925263,1.46707,218.035
5,EMA,6,254.205,0.00375831,0.985741,1.66138,257.882
6,SMA,3,187.214,0.00276788,1.01219,1.77309,184.959
7,EMA,7,273.262,0.00404007,1.03148,1.81561,264.924
8,EMA,8,342.042,0.00505695,1.06826,1.94416,320.184
9,SMA,4,261.265,0.0038627,1.09323,2.05344,238.984
