In [2]:
import numpy as np
import pandas as pd
import nba_api
from sklearn import linear_model

In [3]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
from nba_api.stats.endpoints import teamdashboardbylastngames as teamdashbyn


In [43]:
gamefinder = leaguegamefinder.LeagueGameFinder(league_id_nullable='00', season_type_nullable='Regular Season')

In [5]:
def makeOneHot(id):
    onehot = np.zeros(30)
    onehot[id] = 1
    return onehot

In [61]:
dflg = gamefinder.get_data_frames()
df = dflg[0]
df = df[df['SEASON_ID'] == '22018']

base_team_id = df['TEAM_ID'].unique().min()
df.loc[:,'TEAM_ID'] = df.loc[:,'TEAM_ID'] - base_team_id
ab_to_id = df.loc[:,['TEAM_ID', 'TEAM_ABBREVIATION']].set_index('TEAM_ABBREVIATION').to_dict()



Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22018,9,LAC,LA Clippers,0021801229,2019-04-10,LAC vs. UTA,W,264,143,...,0.767,12,40,52,34,7,5,12,27,6.0
1,22018,20,POR,Portland Trail Blazers,0021801230,2019-04-10,POR vs. SAC,W,240,136,...,0.727,14,30,44,19,5,2,12,17,5.0
2,22018,21,SAC,Sacramento Kings,0021801230,2019-04-10,SAC @ POR,L,240,131,...,0.813,12,23,35,25,5,1,10,17,-5.0
3,22018,13,MIN,Minnesota Timberwolves,0021801228,2019-04-10,MIN @ DEN,L,240,95,...,0.667,7,34,41,24,6,0,10,22,-4.0
4,22018,25,UTA,Utah Jazz,0021801229,2019-04-10,UTA @ LAC,L,266,137,...,0.879,17,40,57,31,8,11,17,24,-6.0
5,22018,16,ORL,Orlando Magic,0021801222,2019-04-10,ORL @ CHA,W,240,122,...,0.750,9,30,39,24,5,2,6,19,8.0
6,22018,22,SAS,San Antonio Spurs,0021801227,2019-04-10,SAS vs. DAL,W,242,105,...,0.833,8,45,53,22,6,2,10,14,11.0
7,22018,23,OKC,Oklahoma City Thunder,0021801226,2019-04-10,OKC @ MIL,W,240,127,...,0.615,9,44,53,40,7,3,12,20,11.0
8,22018,14,BKN,Brooklyn Nets,0021801221,2019-04-10,BKN vs. MIA,W,241,113,...,0.857,20,48,68,29,7,3,12,13,19.0
9,22018,0,ATL,Atlanta Hawks,0021801220,2019-04-10,ATL vs. IND,L,240,134,...,0.816,22,39,61,29,5,7,17,25,-1.0


In [65]:
dflg[0]['SEASON_ID'].unique()

array(['22018', '22017', '22016', '22015', '22014', '22013', '22012',
       '22011', '22010', '22009', '22008', '22007', '22006'], dtype=object)

In [7]:
def matchupToRow(matchup):
    if 'vs.' in matchup:
        home = matchup[0:3]
        away = matchup[-3:]
        homeid = ab_to_id[home]
        awayid = ab_to_id[away]
        homeoh = makeOneHot(homeid)
        awayoh = makeOneHot(awayid)
        return np.hstack((homeoh, awayoh))

In [8]:
df = df.sort_values(by='GAME_ID')
matchupToRow(df.iloc[0]['MATCHUP'])

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
def firstDateBefore(date1,date2):
    return datetime.strptime(date1, '%Y-%m-%d') <= datetime.strptime(date2, '%Y-%m-%d')


In [10]:
def flattenAway(home, away):
    home.assign(AWAY_PTS=np.zeros(home.shape[0]))
    home.assign(AWAY_FG_PCT=np.zeros(home.shape[0]))
    home.assign(AWAY_FG3_PCT=np.zeros(home.shape[0]))
    home.assign(AWAY_TOV=np.zeros(home.shape[0]))
    home.assign(AWAY_TEAM_ABBREVIATION=np.empty(home.shape[0]))
    
    for i, row in home.iterrows():
        game_id = row['GAME_ID']
        otherRow = away[away['GAME_ID'] == game_id]
        home.at[i, "AWAY_PTS"] = otherRow['PTS'].iloc[0]
        home.at[i, "AWAY_TEAM_ABBREVIATION"] = otherRow["TEAM_ABBREVIATION"].iloc[0]
        home.at[i, "AWAY_FG_PCT"] = otherRow["FG_PCT"].iloc[0]
        home.at[i, "AWAY_FG3_PCT"] = otherRow["FG3_PCT"].iloc[0]
        home.at[i, "AWAY_TOV"] = otherRow["TOV"].iloc[0]
    return home

In [21]:
def getPastAvg(df, col, away = False):
    pastVals = {}
    abbrev = 'AWAY_TEAM_ABBREVIATION' if away else 'TEAM_ABBREVIATION'
    for team in home[abbrev].unique():
        pastVals[team] = [0.0, 0]
    df.loc[:,col + '_AVG'] = 0.0
    for i, row in df.iterrows():
        divisor = 1 if pastVals[df.at[i, abbrev]][1] == 0 else pastVals[df.at[i, abbrev]][1]
        df.at[i, col + '_AVG'] = pastVals[df.at[i, abbrev]][0] / divisor
        pastVals[df.at[i, abbrev]][0] = pastVals[df.at[i, abbrev]][0] + df.at[i, col]
        pastVals[df.at[i, abbrev]][1] = pastVals[df.at[i, abbrev]][1] + 1
    return df
        

In [17]:
home = df.loc[df.MATCHUP.str.contains('vs.')]
away = df.loc[df.MATCHUP.str.contains('@')]

home = flattenAway(home,away)
home

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,STL,BLK,TOV,PF,PLUS_MINUS,AWAY_PTS,AWAY_TEAM_ABBREVIATION,AWAY_FG_PCT,AWAY_FG3_PCT,AWAY_TOV
2456,22018,1,BOS,Boston Celtics,0021800001,2018-10-16,BOS vs. PHI,W,240,105,...,7,5,14,20,18.0,87.0,PHI,0.391,0.192,16.0
2459,22018,7,GSW,Golden State Warriors,0021800002,2018-10-16,GSW vs. OKC,W,241,108,...,7,7,21,29,8.0,100.0,OKC,0.363,0.270,14.0
2440,22018,29,CHA,Charlotte Hornets,0021800003,2018-10-17,CHA vs. MIL,L,241,112,...,8,9,11,19,-1.0,113.0,MIL,0.494,0.412,21.0
2446,22018,28,DET,Detroit Pistons,0021800004,2018-10-17,DET vs. BKN,W,240,103,...,5,5,14,20,3.0,100.0,BKN,0.488,0.185,17.0
2450,22018,17,IND,Indiana Pacers,0021800005,2018-10-17,IND vs. MEM,W,240,111,...,2,7,20,24,28.0,83.0,MEM,0.298,0.345,7.0
2453,22018,16,ORL,Orlando Magic,0021800006,2018-10-17,ORL vs. MIA,W,240,104,...,7,7,12,25,3.0,101.0,MIA,0.392,0.273,16.0
2435,22018,15,NYK,New York Knicks,0021800007,2018-10-17,NYK vs. ATL,W,240,126,...,12,6,15,23,19.0,107.0,ATL,0.456,0.278,24.0
2448,22018,24,TOR,Toronto Raptors,0021800008,2018-10-17,TOR vs. CLE,W,239,116,...,6,7,9,29,12.0,104.0,CLE,0.400,0.368,16.0
2444,22018,8,HOU,Houston Rockets,0021800009,2018-10-17,HOU vs. NOP,L,240,112,...,8,7,11,22,-19.0,131.0,NOP,0.531,0.400,12.0
2454,22018,22,SAS,San Antonio Spurs,0021800010,2018-10-17,SAS vs. MIN,W,241,112,...,3,4,12,22,4.0,108.0,MIN,0.429,0.316,11.0


In [25]:
home = getPastAvg(home, 'PTS')
home = getPastAvg(home, 'AWAY_PTS', True)
home = getPastAvg(home, 'FG_PCT')
home = getPastAvg(home, 'AWAY_FG_PCT', True)
home = getPastAvg(home, 'FG3_PCT')
home = getPastAvg(home, 'AWAY_FG3_PCT', True)
home = getPastAvg(home, 'TOV')
home = getPastAvg(home, 'AWAY_TOV', True)
home[home.MATCHUP.str.contains('BOS')]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,AWAY_FG3_PCT,AWAY_TOV,PTS_AVG,AWAY_PTS_AVG,FG_PCT_AVG,AWAY_FG_PCT_AVG,FG3_PCT_AVG,AWAY_FG3_PCT_AVG,TOV_AVG,AWAY_TOV_AVG
2456,22018,1,BOS,Boston Celtics,0021800001,2018-10-16,BOS vs. PHI,W,240,105,...,0.192,16.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2410,22018,24,TOR,Toronto Raptors,0021800019,2018-10-19,TOR vs. BOS,W,239,113,...,0.389,14.0,116.000000,0.000000,0.489000,0.000000,0.424000,0.000000,9.000000,0.000000
2403,22018,15,NYK,New York Knicks,0021800028,2018-10-20,NYK vs. BOS,L,238,101,...,0.360,15.0,126.000000,101.000000,0.455000,0.404000,0.364000,0.389000,15.000000,14.000000
2373,22018,1,BOS,Boston Celtics,0021800040,2018-10-22,BOS vs. ORL,L,238,90,...,0.286,9.0,105.000000,115.000000,0.433000,0.456000,0.297000,0.552000,14.000000,14.000000
2330,22018,23,OKC,Oklahoma City Thunder,0021800065,2018-10-25,OKC vs. BOS,L,241,95,...,0.344,11.0,120.000000,102.000000,0.442000,0.403000,0.231000,0.374500,18.000000,14.500000
2308,22018,28,DET,Detroit Pistons,0021800074,2018-10-27,DET vs. BOS,L,239,89,...,0.412,13.0,115.333333,101.666667,0.474000,0.396667,0.335667,0.364333,14.000000,13.333333
2263,22018,1,BOS,Boston Celtics,0021800099,2018-10-30,BOS vs. DET,W,239,108,...,0.267,17.0,97.500000,118.000000,0.420000,0.406000,0.261000,0.450000,13.000000,9.000000
2234,22018,1,BOS,Boston Celtics,0021800115,2018-11-01,BOS vs. MIL,W,240,117,...,0.310,14.0,101.000000,119.000000,0.426667,0.513500,0.270667,0.412500,13.333333,19.000000
2204,22018,17,IND,Indiana Pacers,0021800127,2018-11-03,IND vs. BOS,W,239,102,...,0.413,14.0,112.000000,103.500000,0.510333,0.410750,0.456000,0.376250,12.666667,13.250000
2181,22018,6,DEN,Denver Nuggets,0021800145,2018-11-05,DEN vs. BOS,W,240,115,...,0.290,12.0,112.800000,103.000000,0.474800,0.415000,0.302200,0.383600,13.600000,13.400000


NameError: name 'home' is not defined

In [26]:
def mapToNumpy(df):
    x = None
    for i, row in df.iterrows():
        currow = matchupToRow(row['MATCHUP'])
        currow = np.append(currow, row['FG_PCT_AVG'])
        currow = np.append(currow, row['AWAY_FG_PCT_AVG'])
        currow = np.append(currow, row['FG3_PCT_AVG'])
        currow = np.append(currow, row['AWAY_FG3_PCT_AVG'])
        currow = np.append(currow, row['PTS_AVG'])
        currow = np.append(currow, row['AWAY_PTS_AVG'])
        currow = np.append(currow, row['TOV_AVG'])
        currow = np.append(currow, row['AWAY_TOV_AVG'])
        currow = np.append(currow, row['PLUS_MINUS']);
        if currow is None:
            print('fukc')
        if x is None:
            x = currow
        else:
            x = np.vstack((x, currow))
    return x

In [27]:
xy = mapToNumpy(home)
xy

array([[ 0.   ,  1.   ,  0.   , ...,  0.   ,  0.   , 18.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  8.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   , -1.   ],
       ...,
       [ 0.   ,  0.   ,  0.   , ..., 12.55 , 12.175,  4.   ],
       [ 0.   ,  0.   ,  0.   , ..., 13.475, 14.925,  6.   ],
       [ 0.   ,  0.   ,  0.   , ..., 12.95 , 13.225,  5.   ]])

In [28]:
def getTrainAndVal(data):
    np.random.shuffle(data)
    x = data[:,0:-1]
    y = data[:,-1]
    trainEndIndex = int((x.shape[0] * 7) / 10)
    xtrain = x[0:trainEndIndex,:]
    ytrain = y[0:trainEndIndex]

    xval = x[trainEndIndex:,:]
    yval = y[trainEndIndex:]
    return (xtrain, ytrain, xval, yval)


# Ridge

In [32]:
minloss_l2 = 1000000000.00
minloss_l1 = 1000000000.00

best_l2_model = None
best_l1_model = None

for aVal in [0.01, 0.05, 0.1, 0.5]:
    for i in range(100):
        xtrain, ytrain, xval, yval = getTrainAndVal(xy)

        reg = linear_model.Ridge(alpha=aVal)
        reg.fit(xtrain,ytrain)

        w = np.dot(xval, reg.coef_)
        avg_loss_l2 = np.linalg.norm(w - yval) / w.shape[0]
        avg_loss_l1 = np.linalg.norm(w - yval, 1) / w.shape[0]
        if avg_loss_l2 < minloss_l2:
            minloss_l2 = avg_loss_l2
            best_l2_model = reg.coef_
        if avg_loss_l1 < minloss_l1:
            minloss_l1 = avg_loss_l1
            best_l1_model = reg.coef_
print('L2 Norm Loss:' + str(minloss_l2))
print('L1 Norm Loss:' + str(minloss_l1))
print('L2 Norm Loss Accuracy:' + str(accuracy(yval, np.dot(xval, best_l2_model))))
print('L1 Norm Loss Accuracy:' + str(accuracy(yval, np.dot(xval, best_l1_model))))



L2 Norm Loss:0.6365473123427583
L1 Norm Loss:9.685227723610984
L2 Norm Loss Accuracy:0.6693766937669376
L1 Norm Loss Accuracy:0.6558265582655827


In [30]:
def accuracy(y, preds):
    conf_mat = [[0,0],[0,0]]
    preds = np.sign(preds)
    actuals = np.sign(y)
    for i, j in zip(preds, actuals):
        if i > j:
            conf_mat[0][1] += 1
        elif j > i:
            conf_mat[1][0] += 1
        elif j == 1:
            conf_mat[1][1] += 1
        elif j == -1:
            conf_mat[0][0] += 1
    accuracy = (conf_mat[0][0]+conf_mat[1][1])/(sum(conf_mat[0])+sum(conf_mat[1]))
    return accuracy

# Neural Net

In [481]:
minloss = 1000000000.00
best_model = None

for net in [5, 10, 25, 50, 100, 1000]:
    nn = neural_network.MLPRegressor(hidden_layer_sizes=(net,),learning_rate='adaptive', solver='sgd', max_iter=100)
    xtrain, ytrain, xval, yval = getTrainAndVal(xy)
    n = nn.fit(xtrain, ytrain)

    w = nn.predict(xval)
    avg_loss = np.linalg.norm(w - yval) / w.shape[0]
    if avg_loss < minloss:
        minloss = avg_loss
        best_model = nn
        best_acc = accuracy(yval, nn.predict(xval))
best_acc



0.5853658536585366

# SVM