In [2]:
import os

import numpy as np
import pandas as pd
pd.options.display.max_columns = 999

In [3]:
# Load the 2016 player table from the working directory.
players = pd.read_csv(filepath_or_buffer='NBA_player_data_2016.csv')

In [4]:
# Load the 2016 team box scores from the working directory.
teams = pd.read_csv(filepath_or_buffer='team_boxscores_2016.csv')

In [5]:
# Drop the stray 'Unnamed: 0' column from both frames (presumably the index that
# was written out when the CSVs were saved -- confirm).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
teams = teams.drop('Unnamed: 0', axis=1, errors='ignore')
players = players.drop('Unnamed: 0', axis=1, errors='ignore')

In [6]:
# The game date is the text in front of the first underscore of Game_Id.
teams['Date'] = teams['Game_Id'].map(lambda game_id: game_id.partition('_')[0])

In [7]:
# Parse the date strings into timestamps so they can be compared and subtracted.
teams['Date'] = teams['Date'].pipe(pd.to_datetime)

In [8]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same 2-D array of row values on every pandas version.
matrix = teams.values

In [9]:
##get a list of all the ball clubs
# Element 16 of each row is read as the club name; the set removes duplicates.
clubs = list({game_row[16] for game_row in matrix})

In [10]:
##calculate how many days since each team's previous game
# A grouped diff on the named 'Team' and 'Date' columns gives, for every row, the
# gap to the previous row of the same team. This relies on the rows already
# being in chronological order -- TODO confirm, or sort by 'Date' first.
# A team's first game has no earlier game, so its Days_Since is NaN.
df = teams.assign(Days_Since=teams.groupby('Team')['Date'].diff().dt.days)

In [11]:
##dictionary of time zones
# Every club starts at offset 0; the clubs named below are moved to 1, 2 or 3.
# NOTE(review): the offsets look like hours ahead of the unlisted (0) clubs -- confirm.
time_zone_dict = dict.fromkeys(clubs, 0)
offset_groups = (
    (1, ['Phoenix', 'Denver', 'Utah']),
    (2, ['Chicago', 'Oklahoma City', 'Milwaukee', 'Houston', 'Dallas',
         'San Antonio', 'Memphis', 'Minnesota', 'New Orleans']),
    (3, ['Atlanta', 'Boston', 'Charlotte', 'Brooklyn', 'Cleveland', 'Detroit',
         'Indiana', 'Miami', 'New York', 'Orlando', 'Philadelphia', 'Washington', 'Toronto']),
)
for offset, members in offset_groups:
    for team in members:
        # Only clubs that actually occur in the data receive an entry.
        if team in time_zone_dict:
            time_zone_dict[team] = offset

In [12]:
##correct for wrong is_home function
# Swaps the 0/1 flag. Not idempotent: running this cell twice flips it back.
# NOTE(review): the season averages printed further down for 'Golden State' show
# PTS below opp_PTS, which may mean Team/opponent (rather than is_Home) are the
# swapped fields in the source CSV -- confirm against a known box score.
flipped_home_flag = {0: 1, 1: 0}
df['is_Home'] = df['is_Home'].map(flipped_home_flag)

In [13]:
##grab the location from the game id
# The location is the second underscore-separated field of Game_Id.
df['game_Location'] = df['Game_Id'].map(lambda game_id: game_id.split("_")[1])

In [14]:
##make time zone function
def time_zone(row):
    """Return the time_zone_dict entry for the row's 'game_Location'.

    Raises KeyError when the location has no entry in time_zone_dict.
    """
    venue = row['game_Location']
    return time_zone_dict[venue]

##add time zone to data frame
df['Time_Zone'] = df.apply(time_zone, axis='columns')

In [15]:
import requests
import json

In [16]:
##change location column to accurate google locations
def change_Locations(row):
    """Return the city name to use for the row's 'game_Location'.

    Club-style labels are translated to a city name; any other value is
    returned unchanged.
    """
    city_for_label = {
        'LA Clippers': 'Los Angeles',
        'LA Lakers': 'Los Angeles',
        'Golden State': 'Oakland',
        'Utah': "Salt Lake City",
        'Indiana': 'Indianapolis',
        'Minnesota': 'Minneapolis',
        'Washington': 'Washington DC',
    }
    label = row['game_Location']
    return city_for_label.get(label, label)
# Rewrite every row's location label as a city name.
df['game_Location'] = df.apply(change_Locations, axis='columns')

In [17]:
##call google map API to get distances between stadiums and make them a dictionary
# The API key is read from the environment so it is never stored in the notebook.
# NOTE(review): a key used to be hard-coded in this cell; treat it as leaked and rotate it.
GOOGLE_MAPS_API_KEY = os.environ['GOOGLE_MAPS_API_KEY']
DISTANCE_MATRIX_URL = 'https://maps.googleapis.com/maps/api/distancematrix/json'

# Computed once; the original recomputed unique() on every inner iteration.
locations = df['game_Location'].unique()

dict_distances = {}
for i, loc1 in enumerate(locations):
    print(loc1)
    # Each unordered pair is requested once and stored under the key 'loc1-loc2'.
    for loc2 in locations[i + 1:]:
        response = requests.get(
            DISTANCE_MATRIX_URL,
            params={'origins': loc1, 'destinations': loc2, 'key': GOOGLE_MAPS_API_KEY},
        )
        response.raise_for_status()
        element = response.json()['rows'][0]['elements'][0]
        # Keep the human-readable distance text (number followed by a unit).
        dict_distances[loc1 + '-' + loc2] = element['distance']['text']

Cleveland
Oakland
Portland
Boston
Indianapolis
Los Angeles
Memphis
Milwaukee
New Orleans
Orlando
Philadelphia
Phoenix
Toronto
Atlanta
Chicago
Sacramento
Brooklyn
Dallas
Detroit
Miami
Oklahoma City
Salt Lake City
Charlotte
Denver
New York
San Antonio
Houston
Minneapolis
Washington DC


In [18]:
## do the same thing as game location
def team_location(row):
    """Return the city name to use for the row's 'Team'.

    Club-style labels are translated to a city name; any other value is
    returned unchanged.
    """
    club = row['Team']
    city_for_club = {
        'LA Clippers': 'Los Angeles',
        'LA Lakers': 'Los Angeles',
        'Golden State': 'Oakland',
        'Utah': "Salt Lake City",
        'Indiana': 'Indianapolis',
        'Minnesota': 'Minneapolis',
        'Washington': 'Washington DC',
    }
    if club in city_for_club:
        return city_for_club[club]
    return club

# Store each row's home city next to the game city.
df['team_location'] = df.apply(team_location, axis='columns')

##use the dictionary created to calculated distances between home and game
def distance_from_home(row):
    """Look up the stored distance between the row's game city and home city.

    Returns NaN when both cities are the same. Otherwise returns the
    dict_distances entry for the pair, trying 'game-home' first and then
    'home-game'; raises KeyError when neither key exists.
    """
    venue = row['game_Location']
    home = row['team_location']
    if venue == home:
        return np.nan
    forward_key = venue + '-' + home
    if forward_key in dict_distances:
        return dict_distances[forward_key]
    return dict_distances[home + '-' + venue]
# Distance text between the game city and the team's home city (NaN when equal).
df['distance_From_Home'] = df.apply(distance_from_home, axis='columns')

In [19]:
##turn distances into necessary form 
def fix_Distance(row):
    """Strip the unit and thousands separators from 'distance_From_Home'.

    Rows flagged is_Home are returned untouched. For other rows the text
    before the first space is returned with commas removed (still a string).
    Values that are not text, such as NaN, are returned unchanged.
    """
    distance = row['distance_From_Home']
    if row['is_Home']:
        return distance
    try:
        return distance.split(' ')[0].replace(',', '')
    except (AttributeError, TypeError):
        # Non-text values (NaN, None, numbers) have nothing to strip.
        return distance
        
# Strip the unit from the distance text, then store the column as floats.
stripped_home_distances = df.apply(fix_Distance, axis='columns')
df['distance_From_Home'] = stripped_home_distances.apply(float)

In [20]:
#since we will deal with team time zones we need to change the name
# The column is removed and re-added under its new name, so it ends up as the
# last column of the frame.
df['game_time_zone'] = df.pop('Time_Zone')

In [21]:
def team_time_zone(row):
    """Return the time_zone_dict entry for the row's 'Team'.

    Raises KeyError when the team has no entry in time_zone_dict.
    """
    club = row['Team']
    return time_zone_dict[club]

df['team_time_zone'] = df.apply(team_time_zone, axis='columns')

In [22]:
# Game time zone minus the team's home time zone.
# Stored as 'time_zone_diff_From_home' because that is the name later cells read.
df['time_zone_diff_From_home'] = df['game_time_zone'] - df['team_time_zone']

In [23]:
#The next process will be repeated. I am keeping track of the last game each team played
#and gathering data on those games.
#This cell is for the last game location.

# Later cells read the home time-zone difference as 'time_zone_diff_From_home';
# give it that name if it still carries the shorter one (rename skips absent labels).
df = df.rename(columns={'time_zone_diff': 'time_zone_diff_From_home'})

##location of each team's previous game (NaN for a team's first game)
# shift() inside each team's rows returns the previous row's value, so this relies
# on the rows being in chronological order -- TODO confirm.
df['last_loc'] = df.groupby('Team')['game_Location'].shift()



In [24]:
##calculate the distance from the last location
def distance_from_last_game(row):
    """Look up the stored distance between this game's city and the previous one.

    Returns NaN when the team stayed in the same city, when either city is
    missing (e.g. a team's first game has no 'last_loc'), or when
    dict_distances has no entry for the pair in either key order. Otherwise
    returns the dict_distances value for the pair.
    """
    venue = row['game_Location']
    previous_venue = row['last_loc']
    if previous_venue == venue:
        return np.nan
    if pd.isnull(previous_venue) or pd.isnull(venue):
        # No previous game (or no location) to measure from.
        return np.nan
    forward_key = venue + '-' + previous_venue
    if forward_key in dict_distances:
        return dict_distances[forward_key]
    # Each pair is stored under one key order only; an unknown pair stays NaN.
    return dict_distances.get(previous_venue + '-' + venue, np.nan)
# Distance text between this game's city and the team's previous game city.
df['distance_From_Last_Game'] = df.apply(distance_from_last_game, axis='columns')


In [25]:
##strip the unit from this distance (the float conversion happens after the apply)
# NOTE: this reuses the name fix_Distance and replaces the earlier definition.
def fix_Distance(row):
    """Strip the unit and thousands separators from 'distance_From_Last_Game'.

    Returns the text before the first space with commas removed (still a
    string). Values that are not text, such as NaN, are returned unchanged.
    """
    distance = row['distance_From_Last_Game']
    try:
        return distance.split(' ')[0].replace(',', '')
    except (AttributeError, TypeError):
        # Non-text values (NaN, None, numbers) have nothing to strip.
        return distance
        
# Strip the unit from the distance text, then store the column as floats.
stripped_last_game_distances = df.apply(fix_Distance, axis='columns')
df['distance_From_Last_Game'] = stripped_last_game_distances.apply(float)

In [26]:
##time zone of each team's previous game (NaN for a team's first game)
# shift() inside each team's rows returns the previous row's 'game_time_zone', so
# this relies on the rows being in chronological order -- TODO confirm.
df['last_time_zone'] = df.groupby('Team')['game_time_zone'].shift()




In [27]:
##calculate the time zone difference from the last game
# NaN wherever there is no previous game to compare against.
df['time_zone_diff_From_last'] = df['game_time_zone'].sub(df['last_time_zone'])

In [74]:

##running count of games per team (1 for the first row of each team)
# cumcount() numbers each team's rows from 0 in row order, hence the +1.
df['game_number'] = df.groupby('Team').cumcount() + 1


In [73]:
# 'is_Opener' is only created by a later cell, so on a fresh top-to-bottom run it
# does not exist yet; errors='ignore' makes this a no-op instead of a KeyError.
df.drop('is_Opener', axis=1, inplace=True, errors='ignore')

In [79]:
def is_opener(row):
    """Return 1 when the row's 'game_number' equals 1, otherwise 0."""
    if row['game_number'] == 1:
        return 1
    return 0
# Flag each team's first game.
df['is_Opener'] = df.apply(is_opener, axis='columns')

In [81]:
# Point margin of the row's team: its points minus the opponent's points.
df['line'] = df['PTS'].sub(df['opp_PTS'])

In [83]:
# Preview the first eight rows of the engineered frame.
df.head(n=8)

Unnamed: 0,3P,3PA,AST,BLK,FG,FGA,FT,FTA,ORB,PF,PTS,STL,TOV,TRB,is_Home,Team_Win,Team,opp_3P,opp_3PA,opp_AST,opp_BLK,opp_FG,opp_FGA,opp_FT,opp_FTA,opp_PTS,opp_ORB,opp_STL,opp_PF,opp_TOV,opponent,Game_Id,Date,Days_Since,game_Location,team_location,distance_From_Home,game_time_zone,team_time_zone,time_zone_diff_From_home,last_loc,distance_From_Last_Game,last_time_zone,time_zone_diff_From_last,game_number,is_Opener,line
0,9.0,27.0,17.0,6.0,32.0,87.0,15.0,20.0,13.0,22.0,88.0,6.0,18.0,42.0,1,0,Cleveland,13.0,35.0,31.0,5.0,45.0,94.0,14.0,19.0,117.0,11.0,12.0,22.0,14.0,New York,10/25/2016_Cleveland_New York,2016-10-25,,Cleveland,Cleveland,,3,3,0,,,,,1,1,-29.0
1,13.0,35.0,31.0,5.0,45.0,94.0,14.0,19.0,11.0,22.0,117.0,12.0,14.0,51.0,0,1,New York,9.0,27.0,17.0,6.0,32.0,87.0,15.0,20.0,88.0,13.0,6.0,22.0,18.0,Cleveland,10/25/2016_Cleveland_New York,2016-10-25,,Cleveland,New York,744.0,3,3,0,,,,,1,1,29.0
2,12.0,24.0,25.0,3.0,47.0,98.0,23.0,26.0,21.0,19.0,129.0,13.0,13.0,55.0,1,1,Golden State,7.0,33.0,24.0,6.0,40.0,85.0,13.0,18.0,100.0,8.0,11.0,19.0,16.0,San Antonio,10/25/2016_Golden State_San Antonio,2016-10-25,,Oakland,Oakland,,0,0,0,,,,,1,1,29.0
3,7.0,33.0,24.0,6.0,40.0,85.0,13.0,18.0,8.0,19.0,100.0,11.0,16.0,35.0,0,0,San Antonio,12.0,24.0,25.0,3.0,47.0,98.0,23.0,26.0,129.0,21.0,13.0,19.0,13.0,Golden State,10/25/2016_Golden State_San Antonio,2016-10-25,,Oakland,San Antonio,2771.0,0,2,-2,,,,,1,1,-29.0
4,8.0,24.0,19.0,5.0,40.0,82.0,16.0,16.0,6.0,19.0,104.0,9.0,11.0,31.0,1,0,Portland,13.0,19.0,22.0,3.0,39.0,75.0,22.0,22.0,113.0,5.0,5.0,18.0,12.0,Utah,10/25/2016_Portland_Utah,2016-10-25,,Portland,Portland,,0,0,0,,,,,1,1,-9.0
5,13.0,19.0,22.0,3.0,39.0,75.0,22.0,22.0,5.0,18.0,113.0,5.0,12.0,34.0,0,1,Utah,8.0,24.0,19.0,5.0,40.0,82.0,16.0,16.0,104.0,6.0,9.0,19.0,11.0,Portland,10/25/2016_Portland_Utah,2016-10-25,,Portland,Salt Lake City,1232.0,0,1,-1,,,,,1,1,9.0
6,15.0,44.0,22.0,3.0,43.0,97.0,16.0,20.0,15.0,19.0,117.0,8.0,16.0,44.0,1,0,Boston,11.0,32.0,36.0,9.0,48.0,89.0,15.0,18.0,122.0,12.0,13.0,20.0,19.0,Brooklyn,10/26/2016_Boston_Brooklyn,2016-10-26,,Boston,Boston,,3,3,0,,,,,1,1,-5.0
7,11.0,32.0,36.0,9.0,48.0,89.0,15.0,18.0,12.0,20.0,122.0,13.0,19.0,47.0,0,1,Brooklyn,15.0,44.0,22.0,3.0,43.0,97.0,16.0,20.0,117.0,15.0,8.0,19.0,16.0,Boston,10/26/2016_Boston_Brooklyn,2016-10-26,,Boston,Brooklyn,347.0,3,3,0,,,,,1,1,5.0


In [106]:
# Box-score columns that are averaged, in the order the averages are returned.
BOXSCORE_FEATURES = [
    '3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
    'PTS', 'STL', 'TOV', 'TRB', 'opp_3P',
    'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
    'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV',
]


def boxscore_aves(team, date, data=None):
    """Average a team's box-score columns over all of its games before `date`.

    Parameters
    ----------
    team : value compared for equality with the 'Team' column.
    date : value compared with the 'Date' column; only rows strictly earlier
        than it are used.
    data : optional frame to read from; defaults to the notebook-level `df`.

    Returns
    -------
    numpy array with one mean per entry of BOXSCORE_FEATURES, in that order.
    When no earlier game exists every entry is NaN.
    """
    if data is None:
        data = df
    prior_games = data[(data['Team'] == team) & (data['Date'] < date)]
    # The last row of an expanding mean is the plain column mean, so compute
    # that directly instead of building every intermediate row.
    return prior_games[BOXSCORE_FEATURES].mean().values

In [109]:
# Spot check: pair each box-score column name with one team's average before a date.
# The cell as stored was missing its leading dict(zip( and did not parse.
dict(zip(['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
       'PTS', 'STL', 'TOV', 'TRB', 'opp_3P',
       'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
       'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV'],boxscore_aves('Golden State','3/4/17')))

{'3P': 9.2622950819672134,
 '3PA': 28.409836065573771,
 'AST': 22.704918032786885,
 'BLK': 3.819672131147541,
 'FG': 38.983606557377051,
 'FGA': 89.47540983606558,
 'FT': 18.098360655737704,
 'FTA': 23.590163934426229,
 'ORB': 11.475409836065573,
 'PF': 20.0,
 'PTS': 105.32786885245902,
 'STL': 8.6885245901639347,
 'TOV': 14.934426229508198,
 'TRB': 43.639344262295083,
 'opp_3P': 11.983606557377049,
 'opp_3PA': 31.409836065573771,
 'opp_AST': 30.819672131147541,
 'opp_BLK': 6.6721311475409832,
 'opp_FG': 43.409836065573771,
 'opp_FGA': 87.459016393442624,
 'opp_FT': 18.737704918032787,
 'opp_FTA': 23.590163934426229,
 'opp_ORB': 9.0327868852459012,
 'opp_PF': 19.344262295081968,
 'opp_PTS': 117.54098360655738,
 'opp_STL': 9.6557377049180335,
 'opp_TOV': 14.508196721311476}

In [128]:
##this creates a new dataframe with up to the date averages for each team going into each game.
##these averages will be used as features for the models
averaged_columns = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
                    'PTS', 'STL', 'TOV', 'TRB', 'opp_3P',
                    'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
                    'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV']
# Columns copied straight from the game row under the same name.
copied_columns = ['Days_Since', 'is_Home', 'Team_Win', 'distance_From_Home',
                  'time_zone_diff_From_home', 'distance_From_Last_Game',
                  'game_number', 'time_zone_diff_From_last', 'line']

game_features = []
for _, row in df.iterrows():
    # Openers are skipped: they are flagged as the team's first game.
    if row['is_Opener']:
        continue
    team = dict(zip(averaged_columns, boxscore_aves(row['Team'], row['Date'])))
    team['team'] = row['Team']
    team['Date'] = row['Date']
    team['Opponent'] = row['opponent']
    for column in copied_columns:
        team[column] = row[column]
    game_features.append(team)
new_df = pd.DataFrame(game_features)

Unnamed: 0,3P,3PA,AST,BLK,Date,Days_Since,FG,FGA,FT,FTA,ORB,Opponent,PF,PTS,STL,TOV,TRB,Team_Win,distance_From_Home,distance_From_Last_Game,game_number,is_Home,line,opp_3P,opp_3PA,opp_AST,opp_BLK,opp_FG,opp_FGA,opp_FT,opp_FTA,opp_ORB,opp_PF,opp_PTS,opp_STL,opp_TOV,team,time_zone_diff_From_home,time_zone_diff_From_last
100,9.750000,27.000000,24.000000,4.500000,2016-11-03,2.0,42.000000,93.500000,18.000000,23.750000,12.500000,Oklahoma City,21.500000,111.750000,10.750000,13.750000,46.500000,0,,1013.0,5,1,-26.0,8.500000,29.000000,29.750000,4.250000,42.250000,87.000000,20.750000,25.500000,7.750000,20.750000,113.750000,9.750000,15.250000,Golden State,0,0.0
101,6.500000,26.500000,19.500000,5.000000,2016-11-03,1.0,36.750000,88.000000,16.500000,24.750000,7.750000,Golden State,22.000000,96.500000,11.000000,16.500000,42.250000,1,2600.0,598.0,5,0,26.0,6.250000,22.250000,18.750000,4.000000,39.500000,92.250000,18.250000,25.250000,11.750000,23.250000,103.500000,10.250000,18.000000,Oklahoma City,-2,0.0
102,8.000000,25.750000,24.000000,6.250000,2016-11-03,2.0,38.000000,85.750000,22.500000,28.250000,11.750000,Indiana,18.750000,106.500000,9.500000,14.000000,49.000000,0,,1634.0,5,1,-18.0,6.250000,21.750000,25.000000,4.750000,39.500000,89.500000,16.250000,19.750000,9.750000,21.500000,101.500000,9.250000,12.250000,Milwaukee,0,0.0
103,10.750000,32.500000,23.500000,5.000000,2016-11-03,2.0,42.000000,91.500000,17.750000,21.000000,11.500000,Milwaukee,22.250000,112.500000,7.500000,15.500000,48.000000,1,449.0,449.0,5,0,18.0,8.500000,22.250000,26.250000,6.750000,42.000000,91.500000,17.500000,23.000000,10.250000,20.250000,110.000000,9.000000,14.250000,Indiana,-1,-1.0
104,8.000000,24.666667,17.666667,6.666667,2016-11-03,2.0,34.000000,80.000000,20.000000,27.000000,10.000000,Denver,23.666667,96.000000,9.666667,15.333333,36.000000,1,,,4,1,3.0,8.333333,20.000000,22.666667,5.000000,38.666667,78.000000,20.000000,27.666667,10.666667,23.333333,105.666667,7.333333,14.000000,Minnesota,0,0.0
105,5.666667,22.333333,23.333333,7.333333,2016-11-03,3.0,41.333333,92.666667,19.000000,24.000000,7.000000,Minnesota,29.000000,107.333333,11.000000,11.000000,40.333333,0,1472.0,1505.0,4,0,-3.0,7.666667,23.000000,17.000000,6.000000,36.000000,88.000000,27.666667,35.666667,12.666667,20.333333,107.333333,6.333333,16.333333,Denver,1,-1.0
106,7.750000,23.000000,24.750000,6.250000,2016-11-03,2.0,42.750000,90.250000,12.250000,17.000000,11.250000,Sacramento,20.000000,105.500000,6.500000,11.500000,46.750000,0,,1579.0,5,1,-8.0,7.000000,24.250000,22.500000,4.500000,36.750000,90.250000,14.500000,20.750000,13.500000,16.500000,95.000000,6.500000,12.500000,Orlando,0,0.0
107,8.200000,23.000000,22.000000,6.200000,2016-11-03,2.0,35.800000,78.200000,22.800000,32.200000,9.800000,Orlando,24.200000,102.600000,8.600000,13.200000,41.000000,1,4651.0,379.0,6,0,8.0,7.200000,22.600000,21.600000,4.000000,36.200000,81.000000,21.200000,27.000000,8.400000,24.400000,100.800000,6.400000,15.000000,Sacramento,3,0.0
108,9.200000,26.000000,26.000000,7.000000,2016-11-04,2.0,41.600000,90.400000,16.600000,20.200000,11.400000,Charlotte,21.000000,109.000000,10.800000,15.600000,47.200000,1,,,6,1,4.0,10.800000,33.800000,21.200000,3.800000,38.400000,88.000000,17.400000,21.200000,11.800000,20.400000,105.000000,9.600000,16.800000,Brooklyn,0,0.0
109,11.000000,29.750000,22.500000,4.250000,2016-11-04,2.0,36.000000,86.250000,13.000000,18.000000,8.250000,Brooklyn,23.750000,96.000000,4.500000,14.250000,44.250000,0,1016.0,1016.0,5,0,-4.0,9.000000,27.750000,23.500000,7.000000,36.500000,84.750000,20.750000,26.750000,9.750000,19.000000,102.750000,7.750000,10.250000,Charlotte,0,0.0
