In [4]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 999

In [6]:
##load the 2016 team box scores from disk
boxscores_path = 'team_boxscores_2016.csv'
teams = pd.read_csv(boxscores_path)

In [7]:
##drop added index column
##errors='ignore' keeps this cell re-runnable once the column is already gone
teams = teams.drop('Unnamed: 0', axis=1, errors='ignore')

In [8]:
##add date feature: the date is the piece of Game_Id in front of the first underscore
teams['Date'] = [game_id.split('_')[0] for game_id in teams['Game_Id']]

In [9]:
##convert the date strings to pandas datetimes
parsed_dates = pd.to_datetime(teams['Date'])
teams['Date'] = parsed_dates

In [10]:
##DataFrame.as_matrix() no longer exists in current pandas; .values returns the same array
matrix = teams.values

In [11]:
##get a list of all the teams (the team name is read from position 16 of every row)
clubs = list({record[16] for record in matrix})

In [12]:
##This cell calculates how many days since the last game.

##positions of the team name and the game date inside every row of the matrix
team_pos = 16
date_pos = 33

##last game date seen for every team; 0 marks a team that has not played yet
last_date = dict.fromkeys(clubs, 0)

##walk the games in file order and append the day gap to each row
new_matrix = []
for record in matrix:
    record = list(record)
    club = record[team_pos]
    previous_date = last_date[club]
    if previous_date == 0:
        days_since = np.nan
    else:
        days_since = (record[date_pos]-previous_date).days
    record.append(days_since)
    new_matrix.append(record)
    last_date[club] = record[date_pos]

##column names for the rebuilt frame, with Days_Since added at the end
##NOTE(review): names are assigned purely by position - confirm they match the order of teams.columns
columns = [
    '3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
    'PTS', 'STL', 'TOV', 'TRB', 'is_Home', 'Team_Win', 'Team', 'opp_TRB', 'opp_3P',
    'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
    'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV',
    'opponent', 'Game_Id', 'Date', 'Days_Since',
]
##add the delta column back to new data frame
df = pd.DataFrame(new_matrix, columns=columns)

In [13]:
##dictionary of time zones
##every club starts at 0; the clubs listed below get zone code 1, 2 or 3
zone_members = [
    (1, ['Phoenix','Denver', 'Utah']),
    (2, ['Chicago','Oklahoma City','Milwaukee','Houston','Dallas',
         'San Antonio','Memphis','Minnesota','New Orleans']),
    (3, ['Atlanta','Boston','Charlotte','Brooklyn', 'Cleveland','Detroit',
         'Indiana', 'Miami','New York', 'Orlando','Philadelphia','Washington','Toronto']),
]
zone_by_name = {name: code for code, names in zone_members for name in names}
time_zone_dict = {team: zone_by_name.get(team, 0) for team in clubs}

##grab the location from the game id (the piece after the first underscore)
df['game_Location'] = [game_id.split("_")[1] for game_id in df['Game_Id']]

##make time zone function
def time_zone(row):
    """Return the time zone code stored for the row's game location."""
    location = row['game_Location']
    return time_zone_dict[location]

##add time zone to data frame
game_zones = df.apply(time_zone, axis=1)
df['Time_Zone'] = game_zones

In [15]:
import requests
import json

In [16]:
##change location column to accurate google locations. This is for the next cell to call google API
def change_Locations(row):
    """Return the city name to use for the row's game location.

    Locations without an entry in the table below are returned unchanged.
    """
    google_names = {
        'LA Clippers': 'Los Angeles',
        'LA Lakers': 'Los Angeles',
        'Golden State': 'Oakland',
        'Utah': "Salt Lake City",
        'Indiana': 'Indianapolis',
        'Minnesota': 'Minneapolis',
        'Washington': 'Washington DC',
    }
    location = row['game_Location']
    return google_names.get(location, location)
##overwrite every game location with the name returned by change_Locations
google_locations = df.apply(change_Locations, axis=1)
df['game_Location'] = google_locations

In [17]:
##call google map API to get distances between stadiums and make them a dictionary
import itertools
import os

##the API key is read from the environment so it is never saved inside the notebook
##NOTE(review): the key that used to be hard-coded in this cell should be treated as leaked and revoked
DISTANCE_MATRIX_URL = 'https://maps.googleapis.com/maps/api/distancematrix/json'
GOOGLE_MAPS_API_KEY = os.environ['GOOGLE_MAPS_API_KEY']

##compute the unique locations once instead of on every loop iteration
locations = list(df['game_Location'].unique())

##one request per unordered pair of locations; the key is 'first-second' in order of appearance
dict_distances = {}
for loc1, loc2 in itertools.combinations(locations, 2):
    ##params= lets requests URL-encode names that contain spaces
    response = requests.get(
        DISTANCE_MATRIX_URL,
        params={'origins': loc1, 'destinations': loc2, 'key': GOOGLE_MAPS_API_KEY},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    if payload.get('status') != 'OK':
        raise ValueError('Distance request failed for {} -> {}: {}'.format(
            loc1, loc2, payload.get('status')))
    element = payload['rows'][0]['elements'][0]
    if element.get('status') != 'OK':
        raise ValueError('No distance returned for {} -> {}: {}'.format(
            loc1, loc2, element.get('status')))
    ##keep the distance text as returned; the unit and commas are stripped when it is converted to a float
    dict_distances[loc1+'-'+loc2] = element['distance']['text']

In [18]:
## do the same thing as game location
def team_location(row):
    """Return the city name to use for the row's team.

    Teams without an entry in the table below are returned unchanged.
    """
    google_names = {
        'LA Clippers': 'Los Angeles',
        'LA Lakers': 'Los Angeles',
        'Golden State': 'Oakland',
        'Utah': "Salt Lake City",
        'Indiana': 'Indianapolis',
        'Minnesota': 'Minneapolis',
        'Washington': 'Washington DC',
    }
    club = row['Team']
    return google_names.get(club, club)

##store the city name of every row's team
home_cities = df.apply(team_location, axis=1)
df['team_location'] = home_cities

In [19]:
##use the dictionary created to calculated distances between home and game
def distance_from_home(row):
    """Return the stored distance text between the game location and the team's location.

    Returns NaN when both locations are equal. The pair is looked up as
    'game-team' first and as 'team-game' otherwise; a pair stored in neither
    direction raises KeyError.
    """
    venue = row['game_Location']
    home = row['team_location']
    if venue == home:
        return np.nan
    forward_key = venue+'-'+home
    if forward_key in dict_distances:
        return dict_distances[forward_key]
    return dict_distances[home+'-'+venue]
##look up the distance text for every row
home_distances = df.apply(distance_from_home, axis=1)
df['distance_From_Home'] = home_distances

In [20]:
##turn distances into necessary form 
def fix_Distance(row):
    """Return the numeric part of the row's distance_From_Home text.

    Home rows are returned untouched. For other rows the text before the
    first space is kept and its commas are removed ('1,234 km' -> '1234').
    Values without a split method (such as NaN) are returned unchanged.
    """
    distance = row['distance_From_Home']
    if row['is_Home']:
        return distance
    try:
        return distance.split(' ')[0].replace(',','')
    except AttributeError:
        ##not a string (e.g. NaN): nothing to strip
        return distance
        
##strip the distance text down to its number, then store the column as floats
cleaned_home_distance = df.apply(fix_Distance, axis=1)
df['distance_From_Home'] = cleaned_home_distance.apply(float)

In [21]:
#since we will deal with team time zones we need to change the name
#pop + assign moves the values to a new last column, which keeps the column order
#that the later positional row indexing relies on
df['game_time_zone'] = df.pop('Time_Zone')

In [22]:
##add the time zone of the team's home stadium
def team_time_zone(row):
    """Return the time zone code stored for the row's team."""
    club = row['Team']
    return time_zone_dict[club]
home_zones = df.apply(team_time_zone, axis=1)
df['team_time_zone'] = home_zones

In [23]:
##time zone differential between where the game is and the team's home location
df['time_zone_diff'] = df['game_time_zone'].sub(df['team_time_zone'])

In [24]:
##This cell adds the last location to the dataframe. So, if GS played Dallas in the previous game,
##That will be added as a feature.
##DataFrame.as_matrix() no longer exists in current pandas; .values returns the same array
matrix = df.values

##positions of the team name and the game location inside every row
team_pos = 16
location_pos = 35

##initialize last_opp to check to see if a team has played a game (0 = not yet).
last_opp = {}
for team in clubs:
    last_opp[team]=0


##append the location of each team's previous game (NaN for its first game)
new_matrix = []
for row in matrix:
    row = list(row)
    previous_location = last_opp[row[team_pos]]
    if previous_location == 0:
        row.append(np.nan)
    else:
        row.append(previous_location)
    new_matrix.append(row)
    last_opp[row[team_pos]] = row[location_pos]

##names are assigned by position; the time_zone_diff column is relabelled
##time_zone_diff_From_home here
columns = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
       'PTS', 'STL', 'TOV', 'TRB', 'is_Home', 'Team_Win', 'Team', 'opp_TRB','opp_3P',
       'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
       'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV',
       'opponent', 'Game_Id', 'Date', 'Days_Since', 'game_Location',
       'team_location', 'distance_From_Home', 'game_time_zone',
       'team_time_zone', 'time_zone_diff_From_home','last_loc']

##add the last loc column to new data frame
df= pd.DataFrame(new_matrix,columns = columns)



In [25]:
##calculate the distance from the last location
def distance_from_last_game(row):
    """Return the stored distance text between this game and the team's previous game.

    Returns NaN when there is no previous location, when the team did not
    move between games, or when the pair of locations is not stored in
    dict_distances in either direction.
    """
    last_loc = row['last_loc']
    game_loc = row['game_Location']
    ##a missing location means there is nothing to look up
    if pd.isnull(last_loc) or pd.isnull(game_loc):
        return np.nan
    if last_loc == game_loc:
        return np.nan
    forward_key = game_loc+'-'+last_loc
    if forward_key in dict_distances:
        return dict_distances[forward_key]
    ##the pair may be stored in the other direction; unknown pairs stay NaN
    return dict_distances.get(last_loc+'-'+game_loc, np.nan)
##look up the travel distance text for every row
travel_distances = df.apply(distance_from_last_game, axis=1)
df['distance_From_Last_Game'] = travel_distances


In [26]:
##convert this distance to a float
##NOTE(review): this re-uses the name fix_Distance, replacing the earlier definition from here on
def fix_Distance(row):
    """Return the numeric part of the row's distance_From_Last_Game text.

    The text before the first space is kept and its commas are removed
    ('1,234 km' -> '1234'). Values without a split method (such as NaN) are
    returned unchanged.
    """
    distance = row['distance_From_Last_Game']
    try:
        return distance.split(' ')[0].replace(',','')
    except AttributeError:
        ##not a string (e.g. NaN): nothing to strip
        return distance
        
##strip the distance text down to its number, then store the column as floats
cleaned_travel_distance = df.apply(fix_Distance, axis=1)
df['distance_From_Last_Game'] = cleaned_travel_distance.apply(float)

In [27]:
##Calculate the last time zone that the team played in.

##DataFrame.as_matrix() no longer exists in current pandas; .values returns the same array
matrix = df.values

##positions of the team name and the game time zone inside every row
team_pos = 16
zone_pos = 38

##initialize last_time_zone at -1 (-1 = the team has not played yet)
last_time = {}
for team in clubs:
    last_time[team]=-1


##append the time zone of each team's previous game (NaN for its first game)
new_matrix = []
for row in matrix:
    row = list(row)
    previous_zone = last_time[row[team_pos]]
    if previous_zone == -1:
        row.append(np.nan)
    else:
        row.append(previous_zone)
    new_matrix.append(row)
    last_time[row[team_pos]] = row[zone_pos]

columns = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
       'PTS', 'STL', 'TOV', 'TRB', 'is_Home', 'Team_Win', 'Team','opp_TRB', 'opp_3P',
       'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
       'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV',
       'opponent', 'Game_Id', 'Date', 'Days_Since', 'game_Location',
       'team_location', 'distance_From_Home', 'game_time_zone',
       'team_time_zone', 'time_zone_diff_From_home', 'last_loc',
       'distance_From_Last_Game','last_time_zone']

##add the last time zone column to new data frame
df= pd.DataFrame(new_matrix,columns = columns)




In [28]:
##calculate the time zone difference from the last game
df['time_zone_diff_From_last'] = df['game_time_zone'].sub(df['last_time_zone'])

In [29]:
##this adds an index for which game the team is playing
##it can be used to filter out the first game to be used as data.
##DataFrame.as_matrix() no longer exists in current pandas; .values returns the same array
matrix = df.values

##position of the team name inside every row
team_pos = 16

##number of the next game for every team, starting at 1
game = {}
for team in clubs:
    game[team]=1


##append the running game number, then advance the team's counter
new_matrix = []
for row in matrix:
    row = list(row)
    row.append(game[row[team_pos]])
    new_matrix.append(row)
    game[row[team_pos]] +=1

columns = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
       'PTS', 'STL', 'TOV', 'TRB', 'is_Home', 'Team_Win', 'Team', 'opp_TRB','opp_3P',
       'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
       'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV',
       'opponent', 'Game_Id', 'Date', 'Days_Since', 'game_Location',
       'team_location', 'distance_From_Home', 'game_time_zone',
       'team_time_zone', 'time_zone_diff_From_home','last_loc','distance_From_Last_Game',
        'last_time_zone','time_zone_diff_From_last','game_number']

##add the game number column to new data frame
df= pd.DataFrame(new_matrix,columns = columns)


In [30]:
##using the last cell to create a boolean if it is the opening game
def is_opener(row):
    """Return 1 when the row's game_number equals 1, otherwise 0."""
    if row['game_number'] == 1:
        return 1
    return 0
##flag every row that is a team's first game
opener_flags = df.apply(is_opener, axis=1)
df['is_Opener'] = opener_flags

In [32]:
##this is the value I will be trying to predict: points scored minus points allowed
df['line'] = df['PTS'].sub(df['opp_PTS'])

In [36]:
##somehow team and opponent got mixed up. DOUBLE CHECK THIS CELL
##NOTE(review): Days_Since, last_loc, last_time_zone and game_number were all computed above
##from column 16 (labelled 'Team') before this swap - confirm which column really holds the team.
##NOTE(review): running this cell a second time swaps the two columns back.
##copy both columns first so the swap cannot depend on a column sharing memory with df
series = df['Team'].copy()
opponents = df['opponent'].copy()
df['Team'] = opponents
df['opponent'] = series


In [46]:
##these are the important features according to http://www.basketball-reference.com/about/factors.html

##the rebounding factors are defined on defensive rebounds, which are not stored directly
##assumes TRB is total rebounds, so defensive rebounds = TRB - ORB - TODO confirm against the csv
team_DRB = df['TRB'] - df['ORB']
opp_DRB = df['opp_TRB'] - df['opp_ORB']

df['EFG%'] = (df['FG']+.5*df['3P'])/df['FGA']
##FTA/2.25 weights free throw attempts by about 0.444 (the cited page uses 0.44)
df['TOV%'] = df['TOV']/(df['FGA']+df['FTA']/2.25+df['TOV'])
##ORB% = ORB / (ORB + opponent DRB)
df['ORB%'] = (df['ORB']/(df['ORB']+opp_DRB))
##DRB% = DRB / (opponent ORB + DRB)
df['DRB%'] = (team_DRB/(team_DRB+df['opp_ORB']))
df['FT_factor'] = df['FT']/df['FGA']
df['opp_EFG%'] = (df['opp_FG']+.5*df['opp_3P'])/df['opp_FGA']
df['opp_TOV%'] = df['opp_TOV']/(df['opp_FGA']+df['opp_FTA']/2.25+df['opp_TOV'])
df['opp_ORB%'] = (df['opp_ORB']/(df['opp_ORB']+team_DRB))
df['opp_DRB%'] = (opp_DRB/(opp_DRB+df['ORB']))
df['opp_FT_factor'] = df['opp_FT']/df['opp_FGA']

In [None]:
##insert new features here.

Now we have a dataframe with sufficient data to start doing some feature engineering. Any other features that I want will be added in the cell above.

In [73]:
##return the current season expanded means, up to the date of a game for the major stats for a team and their opponents
##It will be used as a features matrix to predict the line for the current game.
##Consider giving these means less weight earlier in the season.
def current_season_expanded_means(team,date):
    """Return a one-row frame with a team's expanding means over its games before `date`.

    Rows of the global `df` are selected where Team equals `team` and Date is
    earlier than `date`. The last row of the expanding mean of the stats below
    is returned, with every column name prefixed by 'tot_'. Indexing the last
    row raises IndexError when no earlier game exists.
    """
    stat_names = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
                  'PTS', 'STL', 'TOV', 'TRB', 'opp_TRB', 'opp_3P',
                  'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
                  'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV']
    played_before = (df['Team']==team) & (df['Date']<date)
    earlier_games = df[played_before]
    running_means = earlier_games[stat_names].expanding().mean()
    latest = running_means.values[-1]
    prefixed = ['tot_'+name for name in stat_names]
    return pd.DataFrame(latest, prefixed).T

##return the means of the last five games for the major statistical categories for a team
##and the opponents they played.
def last_five_rolling_means(team,date):
    """Return a one-row frame with a team's five-game rolling means before `date`.

    Rows of the global `df` are selected where Team equals `team` and Date is
    earlier than `date`. The last row of the window-5 rolling mean of the stats
    below is returned, with every column name prefixed by 'fiv_'. Indexing the
    last row raises IndexError when no earlier game exists.
    """
    stat_names = ['3P', '3PA', 'AST', 'BLK', 'FG', 'FGA', 'FT', 'FTA', 'ORB', 'PF',
                  'PTS', 'STL', 'TOV', 'TRB', 'opp_TRB', 'opp_3P',
                  'opp_3PA', 'opp_AST', 'opp_BLK', 'opp_FG', 'opp_FGA', 'opp_FT',
                  'opp_FTA', 'opp_PTS', 'opp_ORB', 'opp_STL', 'opp_PF', 'opp_TOV']
    played_before = (df['Team']==team) & (df['Date']<date)
    earlier_games = df[played_before]
    window_means = earlier_games[stat_names].rolling(window=5).mean()
    latest = window_means.values[-1]
    prefixed = ['fiv_'+name for name in stat_names]
    return pd.DataFrame(latest, prefixed).T
    



In [74]:
##take all these features and concatenate them into one frame
season_means = current_season_expanded_means('Golden State','3/4/17')
recent_means = last_five_rolling_means('Golden State','3/4/17')
pd.concat([season_means, recent_means],axis=1)

Unnamed: 0,tot_3P,tot_3PA,tot_AST,tot_BLK,tot_FG,tot_FGA,tot_FT,tot_FTA,tot_ORB,tot_PF,tot_PTS,tot_STL,tot_TOV,tot_TRB,tot_opp_TRB,tot_opp_3P,tot_opp_3PA,tot_opp_AST,tot_opp_BLK,tot_opp_FG,tot_opp_FGA,tot_opp_FT,tot_opp_FTA,tot_opp_PTS,tot_opp_ORB,tot_opp_STL,tot_opp_PF,tot_opp_TOV,fiv_3P,fiv_3PA,fiv_AST,fiv_BLK,fiv_FG,fiv_FGA,fiv_FT,fiv_FTA,fiv_ORB,fiv_PF,fiv_PTS,fiv_STL,fiv_TOV,fiv_TRB,fiv_opp_TRB,fiv_opp_3P,fiv_opp_3PA,fiv_opp_AST,fiv_opp_BLK,fiv_opp_FG,fiv_opp_FGA,fiv_opp_FT,fiv_opp_FTA,fiv_opp_PTS,fiv_opp_ORB,fiv_opp_STL,fiv_opp_PF,fiv_opp_TOV
0,11.983607,31.409836,30.819672,6.672131,43.409836,87.459016,18.737705,23.590164,9.032787,19.344262,117.540984,9.655738,14.508197,44.688525,43.639344,9.262295,28.409836,22.704918,3.819672,38.983607,89.47541,18.098361,23.590164,105.327869,11.47541,8.688525,20.0,14.934426,8.8,29.0,28.6,7.6,39.6,88.4,21.8,25.6,11.6,20.6,109.8,11.0,15.8,46.4,45.4,9.2,27.4,24.0,3.4,38.0,88.4,19.2,24.4,104.4,11.4,10.0,19.6,17.2


In [None]:
##this creates a new dataframe with up to the date averages for each team going into each game.
##these averages will be used as features for the models
##NOTE(review): the statements that should open this cell (creating game_features, looping over
##the rows and building `team`) are not present here, so the cell cannot run as written - restore them.
##the lines below copy per-game fields from `row` onto `team` and collect it in game_features
        team['team'] = row['Team']
        team['Date'] = row['Date']
        team['Opponent'] = row['opponent']
        team['Days_Since'] = row['Days_Since']
        team['is_Home'] = row['is_Home']
        team['Team_Win'] = row['Team_Win']
        team['distance_From_Home'] = row['distance_From_Home']
        team['time_zone_diff_From_home'] = row['time_zone_diff_From_home']
        team['distance_From_Last_Game'] = row['distance_From_Last_Game']
        team['game_number'] = row['game_number']
        team['line'] =row['line']
        team['game_Id'] = row['Game_Id']
        game_features.append(team)
##build the final frame from everything collected in game_features
new_df = pd.DataFrame(game_features)

Other features to consider:

current win streak 
current lose streak
record in the last 10 games
did they play last night
did they play a road game last night
how long is current road trip
how active has a team been in the last 10 days
LATER:
how do we factor players into the model? Players injured/sitting out? Player mismatches?
Can I categorize games into different types? We could use some unsupervised model to classify games into different types and then compare historical lines within that game type
Features of a game type: 
major injuries, back to backs, both teams records, both teams streaks/record in last ten, distance/time zone between stadiums, time of the game, 

team styles? Can I categorize/quantify a style for all 30 teams? Such as pace of play. Big/small? 



'tring'