In [203]:
import numpy as np
import pandas as pd
import nba_api
from sklearn import linear_model

In [205]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams


from nba_api.stats.endpoints import teamdashboardbylastngames as teamdashbyn


In [207]:
nba_teams = teams.get_teams()
knicks = [team for team in nba_teams if team['abbreviation'] == 'NYK'][0]
knicks_id = knicks['id']

In [209]:
gamefinder = leaguegamefinder.LeagueGameFinder(league_id_nullable='00', season_nullable='2018-19', season_type_nullable='Regular Season')

In [211]:
def makeOneHot(id):
    onehot = np.zeros(30)
    onehot[id] = 1
    return onehot

In [213]:
df = gamefinder.get_data_frames()[0]
base_team_id = df['TEAM_ID'].unique().min()
df['TEAM_ID'] = df['TEAM_ID'] - base_team_id
ab_to_id = df.loc[:,['TEAM_ID', 'TEAM_ABBREVIATION']].set_index('TEAM_ABBREVIATION').to_dict()
ab_to_id = ab_to_id['TEAM_ID']
ab_to_id

{'MEM': 26,
 'DET': 28,
 'NYK': 15,
 'PHI': 18,
 'GSW': 7,
 'CHI': 4,
 'IND': 17,
 'LAC': 9,
 'CHA': 29,
 'UTA': 25,
 'DAL': 5,
 'POR': 20,
 'DEN': 6,
 'ORL': 16,
 'ATL': 0,
 'BKN': 14,
 'OKC': 23,
 'SAC': 21,
 'MIL': 12,
 'SAS': 22,
 'MIN': 13,
 'MIA': 11,
 'HOU': 8,
 'LAL': 10,
 'BOS': 1,
 'PHX': 19,
 'WAS': 27,
 'CLE': 2,
 'TOR': 24,
 'NOP': 3}

In [215]:
def matchupToRow(matchup):
    if 'vs.' in matchup:
        home = matchup[0:3]
        away = matchup[-3:]
        homeid = ab_to_id[home]
        awayid = ab_to_id[away]
        homeoh = makeOneHot(homeid)
        awayoh = makeOneHot(awayid)
        return np.hstack((homeoh, awayoh))

In [217]:
df = df.sort_values(by='GAME_ID')
matchupToRow(df.iloc[0]['MATCHUP'])

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [266]:
def firstDateBefore(date1,date2):
    return datetime.strptime(date1, '%Y-%m-%d') <= datetime.strptime(date2, '%Y-%m-%d')


In [359]:
def flattenAway(home, away):
    home.assign(AWAY_PTS=np.zeros(home.shape[0]))
    home.assign(AWAY_FG_PCT=np.zeros(home.shape[0]))
    home.assign(AWAY_FG3_PCT=np.zeros(home.shape[0]))
    home.assign(AWAY_TOV=np.zeros(home.shape[0]))
    home.assign(AWAY_TEAM_ABBREVIATION=np.empty(home.shape[0]))
    
    for i, row in home.iterrows():
        game_id = row['GAME_ID']
        otherRow = away[away['GAME_ID'] == game_id]
        home.at[i, "AWAY_PTS"] = otherRow['PTS'].iloc[0]
        home.at[i, "AWAY_TEAM_ABBREVIATION"] = otherRow["TEAM_ABBREVIATION"].iloc[0]
        home.at[i, "AWAY_FG_PCT"] = otherRow["FG_PCT"].iloc[0]
        home.at[i, "AWAY_FG3_PCT"] = otherRow["FG3_PCT"].iloc[0]
        home.at[i, "AWAY_TOV"] = otherRow["TOV"].iloc[0]
    return home

array(['BOS', 'GSW', 'CHA', 'DET', 'IND', 'ORL', 'NYK', 'TOR', 'HOU',
       'SAS', 'SAC', 'LAC', 'PHX', 'PHI', 'WAS', 'POR', 'BKN', 'MEM',
       'MIN', 'NOP', 'MIL', 'UTA', 'MIA', 'CHI', 'DAL', 'DEN', 'LAL',
       'CLE', 'OKC', 'ATL'], dtype=object)

In [352]:
def getPastAvg(df, col, away = False):
    pastVals = {}
    abbrev = 'AWAY_TEAM_ABBREVIATION' if away else 'TEAM_ABBREVIATION'
    for team in home[abbrev].unique():
        pastVals[team] = [0.0, 0]
    df[col + '_AVG'] = 0.0
    for i, row in df.iterrows():
        divisor = 1 if pastVals[df.at[i, abbrev]][1] == 0 else pastVals[df.at[i, abbrev]][1]
        df.at[i, col + '_AVG'] = pastVals[df.at[i, abbrev]][0] / divisor
        pastVals[df.at[i, abbrev]][0] = pastVals[df.at[i, abbrev]][0] + df.at[i, col]
        pastVals[df.at[i, abbrev]][1] = pastVals[df.at[i, abbrev]][1] + 1
    return df
        

In [360]:
home = df[df.MATCHUP.str.contains('vs.')]
away = df[df.MATCHUP.str.contains('@')]

home = flattenAway(home,away)


In [366]:
home = getPastAvg(home, 'PTS')
home = getPastAvg(home, 'AWAY_PTS', True)
home = getPastAvg(home, 'FG_PCT')
home = getPastAvg(home, 'AWAY_FG_PCT', True)
home = getPastAvg(home, 'FG3_PCT')
home = getPastAvg(home, 'AWAY_FG3_PCT', True)
home = getPastAvg(home, 'TOV')
home = getPastAvg(home, 'AWAY_TOV', True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [358]:
home.head(50)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,PLUS_MINUS,PTS_AVG,AWAY_PTS,AWAY_TEAM_ABBREVIATION,AWAY_FG_PCT,AWAY_FG3_PCT,AWAY_PTS_AVG,FG_PCT_AVG,FG3_PCT_AVG,AWAY_FG_PCT_AVG
2456,22018,1,BOS,Boston Celtics,21800001,2018-10-16,BOS vs. PHI,W,240,105,...,18.0,0.0,87.0,PHI,0.391,0.192,0.0,0.0,0.0,0.0
2459,22018,7,GSW,Golden State Warriors,21800002,2018-10-16,GSW vs. OKC,W,241,108,...,8.0,0.0,100.0,OKC,0.363,0.27,0.0,0.0,0.0,0.0
2440,22018,29,CHA,Charlotte Hornets,21800003,2018-10-17,CHA vs. MIL,L,241,112,...,-1.0,0.0,113.0,MIL,0.494,0.412,0.0,0.0,0.0,0.0
2446,22018,28,DET,Detroit Pistons,21800004,2018-10-17,DET vs. BKN,W,240,103,...,3.0,0.0,100.0,BKN,0.488,0.185,0.0,0.0,0.0,0.0
2450,22018,17,IND,Indiana Pacers,21800005,2018-10-17,IND vs. MEM,W,240,111,...,28.0,0.0,83.0,MEM,0.298,0.345,0.0,0.0,0.0,0.0
2453,22018,16,ORL,Orlando Magic,21800006,2018-10-17,ORL vs. MIA,W,240,104,...,3.0,0.0,101.0,MIA,0.392,0.273,0.0,0.0,0.0,0.0
2435,22018,15,NYK,New York Knicks,21800007,2018-10-17,NYK vs. ATL,W,240,126,...,19.0,0.0,107.0,ATL,0.456,0.278,0.0,0.0,0.0,0.0
2448,22018,24,TOR,Toronto Raptors,21800008,2018-10-17,TOR vs. CLE,W,239,116,...,12.0,0.0,104.0,CLE,0.4,0.368,0.0,0.0,0.0,0.0
2444,22018,8,HOU,Houston Rockets,21800009,2018-10-17,HOU vs. NOP,L,240,112,...,-19.0,0.0,131.0,NOP,0.531,0.4,0.0,0.0,0.0,0.0
2454,22018,22,SAS,San Antonio Spurs,21800010,2018-10-17,SAS vs. MIN,W,241,112,...,4.0,0.0,108.0,MIN,0.429,0.316,0.0,0.0,0.0,0.0


In [367]:
def mapToNumpy(df):
    x = None
    for i, row in df.iterrows():
        currow = matchupToRow(row['MATCHUP'])
        currow = np.append(currow, row['FG_PCT_AVG'])
        currow = np.append(currow, row['AWAY_FG_PCT_AVG'])
        currow = np.append(currow, row['FG3_PCT_AVG'])
        currow = np.append(currow, row['AWAY_FG3_PCT_AVG'])
        currow = np.append(currow, row['PTS_AVG'])
        currow = np.append(currow, row['AWAY_PTS_AVG'])
        currow = np.append(currow, row['TOV_AVG'])
        currow = np.append(currow, row['AWAY_TOV_AVG'])
        currow = np.append(currow, row['PLUS_MINUS']);
        if currow is None:
            print('fukc')
        if x is None:
            x = currow
        else:
            x = np.vstack((x, currow))
    return x

In [369]:
xy = mapToNumpy(home)
xy

array([[ 0.   ,  1.   ,  0.   , ...,  0.   ,  0.   , 18.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  8.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   , -1.   ],
       ...,
       [ 0.   ,  0.   ,  0.   , ..., 12.55 , 12.175,  4.   ],
       [ 0.   ,  0.   ,  0.   , ..., 13.475, 14.925,  6.   ],
       [ 0.   ,  0.   ,  0.   , ..., 12.95 , 13.225,  5.   ]])

In [381]:
minloss_l2 = 1000000000.00
minloss_l1 = 1000000000.00

best_l2_model = None
best_l1_model = None

for aVal in [0.01, 0.05, 0.1, 0.5]:
    for i in range(100):
        np.random.shuffle(xy)
        x = xy[:,0:-1]
        y = xy[:,-1]
        trainEndIndex = int((x.shape[0] * 7) / 10)
        xtrain = x[0:trainEndIndex,:]
        ytrain = y[0:trainEndIndex]

        xval = x[trainEndIndex:,:]
        yval = y[trainEndIndex:]

        reg = linear_model.Ridge(alpha=aVal)
        reg.fit(xtrain,ytrain)

        w = np.dot(xval, reg.coef_)
        avg_loss_l2 = np.linalg.norm(w - yval) / w.shape[0]
        avg_loss_l1 = np.linalg.norm(w - yval, 1) / w.shape[0]
        if avg_loss_l2 < minloss_l2:
            minloss_l2 = avg_loss_l2
            best_l2_model = reg.coef_
        if avg_loss_l1 < minloss_l1:
            minloss_l1 = avg_loss_l1
            best_l1_model = reg.coef_
print('L2 Norm Loss:' + str(minloss_l2))
print('L1 Norm Loss:' + str(minloss_l1))
print('L2 Norm Loss Accuracy:' + str(accuracy(xval,yval,best_l2_model)))
print('L1 Norm Loss Accuracy:' + str(accuracy(xval,yval,best_l1_model)))



L2 Norm Loss:0.6370683026518109
L1 Norm Loss:9.53856089579556


TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [380]:
def accuracy(X, y, model):
    conf_mat = [[0,0],[0,0]]
    preds = np.sign(np.dot(X, model))
    actuals = np.sign(y)
    for i, j in zip(preds, actuals):
        if i > j:
            conf_mat[0][1] += 1
        elif j > i:
            conf_mat[1][0] += 1
        elif j == 1:
            conf_mat[1][1] += 1
        elif j == -1:
            conf_mat[0][0] += 1
    accuracy = (conf_mat[0][0]+conf_mat[1][1])/sum(conf_mat[0])+sum(conf_mat[1])
    return accuracy