In [35]:
import pandas as pd

%matplotlib inline

In [36]:
nba2006 = pd.read_csv('./data/2006-2007_NBA_Box_Score_Team_Stats.csv')
nba2007 = pd.read_csv('./data/2007-2008_NBA_Box_Score_Team_Stats.csv')
nba2008 = pd.read_csv('./data/2008-2009_NBA_Box_Score_Team_Stats.csv')

In [37]:
nba2008.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2631 entries, 0 to 2630
Data columns (total 50 columns):
DATASET             2630 non-null object
DATE                2630 non-null object
TEAMS               2630 non-null object
VENUE               2630 non-null object
1Q                  2630 non-null float64
2Q                  2630 non-null float64
3Q                  2630 non-null float64
4Q                  2630 non-null float64
OT1                 154 non-null float64
OT2                 30 non-null float64
OT3                 6 non-null float64
OT4                 0 non-null float64
F                   2630 non-null float64
MIN                 2630 non-null float64
FG                  2630 non-null float64
FGA                 2630 non-null float64
3P                  2630 non-null float64
3PA                 2630 non-null float64
FT                  2630 non-null float64
FTA                 2630 non-null float64
OR                  2630 non-null float64
DR                  2630

In [38]:
nba2006['DATE'].dropna(inplace = True)


In [39]:
nba2006.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
DATASET,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season
DATE,10/31/06,10/31/06,10/31/06,10/31/06,11/01/06,11/01/06,11/01/06,11/01/06,11/01/06,11/01/06
TEAMS,Chicago,Miami,Phoenix,LA Lakers,Indiana,Charlotte,Chicago,Orlando,Atlanta,Philadelphia
VENUE,Road,Home,Road,Home,Road,Home,Road,Home,Road,Home
1Q,22,16,41,26,23,27,20,32,14,29
2Q,37,14,17,27,26,23,25,31,21,20
3Q,21,21,21,34,28,23,25,23,23,24
4Q,28,15,27,27,29,26,24,23,17,15
OT1,,,,,,,,,,
OT2,,,,,,,,,,


### NBA csv File Structure

Currently, the NBA data has two records for each game one with the home team data and one with the away team data.  The unit of analysis for this study is the game so the data needs to be rearranged to use the game as the unit of analysis.  Therefore, each even row must be appended to the odd row above it to make the game the unit of analysis

In [40]:


def merge_home_away(df):
    
    #make all column names lower snake case
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    
    #replace missing data with zeros
    missing_list = ['ot1', 'ot2', 'ot3', 'ot4']
    
    
    #some missing moneylines going to set them to zero for now look explore later
    df['moneyline'].fillna(0, inplace = True)
    df['movements'].fillna('none', inplace = True)
    
    
    for ot in missing_list:
        df[ot].fillna(0, inplace = True)
    
    
    #split up the rows
    df1 =  df[df.index %2 == 0]
    df2 = df[df.index %2 == 1] 
    
    #drop unnessecary columns in df1
    
    df1.drop(columns = ['box_score', 
                        'odds',
                        'halftime'], inplace = True)
    
    #drop redundant columns in df2
    df2.drop(columns = ['dataset',
                        'date',
                        'venue', 
                        'pts', 
                        'poss', 
                        'pace', 
                        'box_score', 
                        'odds',
                        'opening_odds', 
                        'main_referee',
                        'halftime'], inplace = True)
    
    #rename columns df1 and df2 
    
    home =  {'starting_lineup' : 'home_starter1',
             'unnamed:_35'     : 'home_starter2',
             'unnamed:_36'     : 'home_starter3',
             'unnamed:_37'     : 'home_starter4',
             'unnamed:_38'     : 'home_starter5',}
   
    df2.rename(columns = home, inplace = True) 
    
    away =  {'teams'     : 'away_team',
             '1q'        : 'away_1q',
             '2q'        : 'away_2q',
             '3q'        : 'away_3q',
             '4q'        : 'away_4q',
             'ot1'       : 'away_ot1',
             'ot2'       : 'away_ot2',
             'ot3'       : 'away_ot3',
             'ot4'       : 'away_ot4',
             'f'         : 'away_score',
             'min'       : 'away_min',
             'fg'        : 'away_fg',
             'fga'       : 'away_fga',
             '3p'        : 'away_3p',
             '3pa'       : 'away_3pa',
             'ft'        : 'away_ft',
             'fta'       : 'away_fta',
             'or'        : 'away_or',
             'dr'        : 'away_dr',
             'tot'       : 'away_total_reb',
             'a'         : 'away_assists',
             'pf'        : 'away_fouls',
             'st'        : 'away_steals',
             'or'        : 'away_or',
             'to'        : 'away_turnovers',
             'bl'        : 'away_blocks',
             'poss'      : 'away_poss',
             'pace'      : 'away_pace',
             'oeff'      : 'away_off_eff',
             'deff'      : 'away_def_eff',
             'rest_days' : 'away_rest',
             'starting_lineup' : 'away_starter1',
             'unnamed:_35'     : 'away_starter2',
             'unnamed:_36'     : 'away_starter3',
             'unnamed:_37'     : 'away_starter4',
             'unnamed:_38'     : 'away_starter5',
             'crew_referees'   : 'ref_3',
             'spread'          : 'away_line',
             'total'           : 'away_total',
             'movements'       : 'total_moves',
             'closing'         : 'closing total',
             'moneyline'       : 'away_moneyline'
             }
    df2.rename(columns = away, inplace = True)
    
    #reset the indexs to merge the files
    df1.reset_index(inplace = True)
    df2.reset_index(inplace = True)
                         
    #merge data so game becomes rather than team unit of analysis                      
    new =  pd.concat([df1,df2], axis = 1)
    
    #call second function
    new2 = create_home_mov_ave(new)
    new3 = create_away_mov_ave(new2)
    return new3

In [41]:
def create_home_mov_ave(df):
    

    
    #add underscores to teams
    df['teams'] = [str(team).replace(' ', '_') for team in df['teams']]
    
    #loop through teams
    teams = ['Atlanta','Boston','Charlotte','Chicago',
             'Cleveland','Dallas','Denver','Detroit',
             'Golden_State','Houston','Indiana','LA_Clippers',
             'LA_Lakers','Memphis','Miami','Milwaukee',
             'Minnesota','New_Jersey','New_Orleans','New_York',
             'Orlando','Philadelphia','Phoenix','Portland',
             'Sacramento','San_Antonio','Seattle','Toronto',
             'Utah','Washington']
    

    columns = ['1q', '2q', '3q', '4q',
               'ot1', 'ot2', 'ot3', 'ot4', 'f', 'min', 'fg', 
               'fga', '3p', '3pa', 'ft','fta', 'or', 'dr', 'tot', 
               'a', 'pf', 'st', 'to', 'pts', 'poss',
               'pace', 'oeff', 'deff', 'bl']
    #create empty data frame to put in results
    home_vars = pd.DataFrame()
    
    for team in teams:
        print(team)
        df_team = df[df['teams'] == team]
        
        for column in columns:
            column_new = 'mov_5_' + column
            df_team[column_new] = df_team[column].rolling(5).mean().shift(1)
        
        home_vars = home_vars.append(df_team)
    
        
    return home_vars 



In [42]:
def create_away_mov_ave(df):
    

    
    #add underscores to teams
    df['away_team'] = [str(team).replace(' ', '_') for team in df['away_team']]
    
    #loop through teams
    teams = ['Atlanta','Boston','Charlotte','Chicago',
             'Cleveland','Dallas','Denver','Detroit',
             'Golden_State','Houston','Indiana','LA_Clippers',
             'LA_Lakers','Memphis','Miami','Milwaukee',
             'Minnesota','New_Jersey','New_Orleans','New_York',
             'Orlando','Philadelphia','Phoenix','Portland',
             'Sacramento','San_Antonio','Seattle','Toronto',
             'Utah','Washington']
    

    columns = ['away_1q', 'away_2q', 'away_3q', 'away_4q',
               'away_ot1', 'away_ot2', 'away_ot3', 'away_ot4', 
               'away_score', 'away_min', 'away_fg', 'away_fga', 
               'away_3p', 'away_3pa', 'away_ft','away_fta',
               'away_or', 'away_dr', 'away_total_reb', 
               'away_assists', 'away_fouls', 'away_steals', 
               'away_turnovers','away_blocks', 
               'away_off_eff',
               'away_def_eff']
    #create empty data frame to put in results
    away_vars = pd.DataFrame()
    
    for team in teams:
        print(team)
        df_team = df[df['away_team'] == team]
        
        for column in columns:
            column_new = 'mov_5_' + column
            df_team[column_new] = df_team[column].rolling(5).mean().shift(1)
        
        away_vars = away_vars.append(df_team)
    
        
    return away_vars



In [43]:
nba2006_trans = merge_home_away(nba2006)
nba2007_trans = merge_home_away(nba2007)
nba2008_trans = merge_home_away(nba2008)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Atlanta


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Boston
Charlotte
Chicago
Cleveland
Dallas
Denver
Detroit
Golden_State
Houston
Indiana
LA_Clippers
LA_Lakers
Memphis
Miami
Milwaukee
Minnesota
New_Jersey
New_Orleans
New_York
Orlando
Philadelphia
Phoenix
Portland
Sacramento
San_Antonio
Seattle
Toronto
Utah
Washington
Atlanta


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Boston
Charlotte
Chicago
Cleveland
Dallas
Denver
Detroit
Golden_State
Houston
Indiana
LA_Clippers
LA_Lakers
Memphis
Miami
Milwaukee
Minnesota
New_Jersey
New_Orleans
New_York
Orlando
Philadelphia
Phoenix
Portland
Sacramento
San_Antonio
Seattle
Toronto
Utah
Washington
Atlanta
Boston
Charlotte
Chicago
Cleveland
Dallas
Denver
Detroit
Golden_State
Houston
Indiana
LA_Clippers
LA_Lakers
Memphis
Miami
Milwaukee
Minnesota
New_Jersey
New_Orleans
New_York
Orlando
Philadelphia
Phoenix
Portland
Sacramento
San_Antonio
Seattle
Toronto
Utah
Washington
Atlanta
Boston
Charlotte
Chicago
Cleveland
Dallas
Denver
Detroit
Golden_State
Houston
Indiana
LA_Clippers
LA_Lakers
Memphis
Miami
Milwaukee
Minnesota
New_Jersey
New_Orleans
New_York
Orlando
Philadelphia
Phoenix
Portland
Sacramento
San_Antonio
Seattle
Toronto
Utah
Washington
Atlanta
Boston
Charlotte
Chicago
Cleveland
Dallas
Denver
Detroit
Golden_State
Houston
Indiana
LA_Clippers
LA_Lakers
Memphis
Miami
Milwaukee
Minnesota
New_Jersey
New_Orleans
New_York
O

### Calulating my dependent variable.  

Line_cv is the amount the game went over or under the spread, and then cover is 1 for a home team spread victory and 0 for an away team spread victory.  This leave us with to possible methods of prediction.  We can use regression to predict the cover amount or we can use categorization to predict which category it fall into.  We also could try and predict individual team scores using regression and use those as a prediction

In [44]:
nba2006_trans['line_cv'] = (nba2006_trans.away_score - 
                            nba2006_trans.f - nba2006_trans.spread)

nba2007_trans['line_cv'] = (nba2007_trans.away_score - 
                            nba2007_trans.f - nba2007_trans.spread)

nba2008_trans['line_cv'] = (nba2008_trans.away_score - 
                            nba2008_trans.f - nba2008_trans.spread)


In [45]:
nba2006_trans['cover'] = nba2006_trans['line_cv'].map(lambda x: 1 if x >0 else 0)
nba2007_trans['cover'] = nba2007_trans['line_cv'].map(lambda x: 1 if x >0 else 0)
nba2008_trans['cover'] = nba2008_trans['line_cv'].map(lambda x: 1 if x >0 else 0)

In [46]:
def merge_years(df1,df2):
    return pd.concat([df1,df2], axis = 0)
    

In [47]:
nba_combined = merge_years(nba2006_trans,nba2007_trans)


In [48]:
nba_combined = merge_years(nba_combined,nba2008_trans)

In [49]:
nba_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3858 entries, 556 to 113
Columns: 145 entries, index to cover
dtypes: float64(118), int64(2), object(25)
memory usage: 4.3+ MB


### Dropping columns that are no longer necessary.  

The NBA data that relates to the particular game that has just been played is not longer necessary so we can drop those columns to make sure that they are not used in the analysis as we do not have access to that data.   

In [50]:
nba_combined.drop(columns = ['away_1q', 'away_2q', 'away_3q', 'away_4q',
               'away_ot1', 'away_ot2', 'away_ot3', 'away_ot4', 
               'away_score', 'away_min', 'away_fg', 'away_fga', 
               'away_3p', 'away_3pa', 'away_ft','away_fta',
               'away_or', 'away_dr', 'away_total_reb', 
               'away_assists', 'away_fouls', 'away_steals', 
               'away_turnovers','away_blocks', 
               'away_off_eff','away_def_eff', '1q', '2q', '3q', '4q',
               'ot1', 'ot2', 'ot3', 'ot4', 'f', 'min', 'fg', 
               'fga', '3p', '3pa', 'ft','fta', 'or', 'dr', 'tot', 
               'a', 'pf', 'st', 'to', 'pts', 'poss',
               'pace', 'oeff', 'deff', 'bl'], inplace = True)

In [51]:
nba_combined.to_csv('./data/nba_combined.csv', index = False)