In [2]:
import pandas as pd

%matplotlib inline

In [3]:
nba2006 = pd.read_csv('./data/2006-2007_NBA_Box_Score_Team_Stats.csv')
nba2007 = pd.read_csv('./data/2007-2008_NBA_Box_Score_Team_Stats.csv')
nba2008 = pd.read_csv('./data/2008-2009_NBA_Box_Score_Team_Stats.csv')

In [16]:
nba2008.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2631 entries, 0 to 2630
Data columns (total 50 columns):
DATASET             2630 non-null object
DATE                2630 non-null object
TEAMS               2630 non-null object
VENUE               2630 non-null object
1Q                  2630 non-null float64
2Q                  2630 non-null float64
3Q                  2630 non-null float64
4Q                  2630 non-null float64
OT1                 154 non-null float64
OT2                 30 non-null float64
OT3                 6 non-null float64
OT4                 0 non-null float64
F                   2630 non-null float64
MIN                 2630 non-null float64
FG                  2630 non-null float64
FGA                 2630 non-null float64
3P                  2630 non-null float64
3PA                 2630 non-null float64
FT                  2630 non-null float64
FTA                 2630 non-null float64
OR                  2630 non-null float64
DR                  2630

In [4]:
nba2006['DATE'].dropna(inplace = True)


In [5]:
nba2006.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
DATASET,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season
DATE,10/31/06,10/31/06,10/31/06,10/31/06,11/01/06,11/01/06,11/01/06,11/01/06,11/01/06,11/01/06
TEAMS,Chicago,Miami,Phoenix,LA Lakers,Indiana,Charlotte,Chicago,Orlando,Atlanta,Philadelphia
VENUE,Road,Home,Road,Home,Road,Home,Road,Home,Road,Home
1Q,22,16,41,26,23,27,20,32,14,29
2Q,37,14,17,27,26,23,25,31,21,20
3Q,21,21,21,34,28,23,25,23,23,24
4Q,28,15,27,27,29,26,24,23,17,15
OT1,,,,,,,,,,
OT2,,,,,,,,,,


### NBA csv File Structure

Currently, the NBA data has two records for each game one with the home team data and one with the away team data.  The unit of analysis for this study is the game so the data needs to be rearranged to use the game as the unit of analysis.  Therefore, each even row must be appended to the odd row above it to make the game the unit of analysis

In [21]:


def merge_home_away(df):
    
    #make all column names lower snake case
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    
    #replace missing data with zeros
    missing_list = ['ot1', 'ot2', 'ot3', 'ot4']
    
    
    #some missing moneylines going to set them to zero for now look explore later
    df['moneyline'].fillna(0, inplace = True)
    df['movements'].fillna('none', inplace = True)
    
    
    for ot in missing_list:
        df[ot].fillna(0, inplace = True)
    
    
    #split up the rows
    df1 =  df[df.index %2 == 0]
    df2 = df[df.index %2 == 1] 
    
    #drop unnessecary columns in df1
    
    df1.drop(columns = ['box_score', 
                        'odds',
                        'halftime'])
    
    #drop redunant columns in df2
    df2.drop(columns = ['dataset',
                        'date',
                        'venue', 
                        'pts', 
                        'poss', 
                        'pace', 
                        'box_score', 
                        'odds',
                        'opening_odds', 
                        'main_referee',
                        'halftime'], inplace = True)
    
    
    #rename columns df2 
    away =  {'teams'     : 'away_team',
             '1q'        : 'away_1q',
             '2q'        : 'away_2q',
             '3q'        : 'away_3q',
             '4q'        : 'away_4q',
             'ot1'       : 'away_ot1',
             'ot2'       : 'away_ot2',
             'ot3'       : 'away_ot3',
             'ot4'       : 'away_ot4',
             'f'         : 'away_score',
             'min'       : 'away_min',
             'fg'        : 'away_fg',
             'fga'       : 'away_fga',
             '3p'        : 'away_3p',
             '3pa'       : 'away_3pa',
             'ft'        : 'away_ft',
             'fta'       : 'away_fta',
             'or'        : 'away_or',
             'dr'        : 'away_dr',
             'tot'       : 'away_total_reb',
             'a'         : 'away_assists',
             'pf'        : 'away_fouls',
             'st'        : 'away_steals',
             'or'        : 'away_or',
             'to'        : 'away_turnovers',
             'bl'        : 'away_blocks',
             'poss'      : 'away_poss',
             'pace'      : 'away_pace',
             'oeff'      : 'away_off_eff',
             'deff'      : 'away_def_eff',
             'rest_days' : 'away_rest',
             'starting_lineup' : 'away_starter1',
             'unnamed:_35'     : 'away_starter2',
             'unnamed:_36'     : 'away_starter3',
             'unnamed:_37'     : 'away_starter4',
             'unnamed:_38'     : 'away_starter5',
             'crew_referees'   : 'ref_3',
             'spread'          : 'away_line',
             'total'           : 'away_total',
             'movements'       : 'total_moves',
             'closing'         : 'closing total',
             'moneyline'       : 'away_moneyline'
             }
    df2.rename(columns = away, inplace = True)
    
    #reset the indexs to merge the files
    df1.reset_index(inplace = True)
    df2.reset_index(inplace = True)
                         
    #merge data so game becomes rather than team unit of analysis                      
    new =  pd.concat([df1,df2], axis = 1)
    
    #call second function
    new2 = create_home_mov_ave(new)
    new3 = create_away_mov_ave(new2)

In [22]:
def create_home_mov_ave(df):
    

    
    #add underscores to teams
    df['teams'] = [str(team).replace(' ', '_') for team in df['teams']]
    
    #loop through teams
    teams = ['Atlanta','Boston','Charlotte','Chicago',
             'Cleveland','Dallas','Denver','Detroit',
             'Golden_State','Houston','Indiana','LA_Clippers',
             'LA_Lakers','Memphis','Miami','Milwaukee',
             'Minnesota','New_Jersey','New_Orleans','New_York',
             'Orlando','Philadelphia','Phoenix','Portland',
             'Sacramento','San_Antonio','Seattle','Toronto',
             'Utah','Washington']
    

    columns = ['1q', '2q', '3q', '4q',
               'ot1', 'ot2', 'ot3', 'ot4', 'f', 'min', 'fg', 
               'fga', '3p', '3pa', 'ft','fta', 'or', 'dr', 'tot', 
               'a', 'pf', 'st', 'to', 'pts', 'poss',
               'pace', 'oeff', 'deff', 'bl']
    #create empty data frame to put in results
    home_vars = pd.DataFrame()
    
    for team in teams:
        print(team)
        df_team = df[df['teams'] == team]
        
        for column in columns:
            column_new = 'mov_5_' + column
            df_team[column_new] = df_team[column].rolling(5).mean().shift(1)
        
        home_vars = home_vars.append(df_team)
    
        
    return home_vars 



In [23]:
def create_away_mov_ave(df):
    

    
    #add underscores to teams
    df['away_team'] = [str(team).replace(' ', '_') for team in df['away_team']]
    
    #loop through teams
    teams = ['Atlanta','Boston','Charlotte','Chicago',
             'Cleveland','Dallas','Denver','Detroit',
             'Golden_State','Houston','Indiana','LA_Clippers',
             'LA_Lakers','Memphis','Miami','Milwaukee',
             'Minnesota','New_Jersey','New_Orleans','New_York',
             'Orlando','Philadelphia','Phoenix','Portland',
             'Sacramento','San_Antonio','Seattle','Toronto',
             'Utah','Washington']
    

    columns = ['away_1q', 'away_2q', 'away_3q', 'away_4q',
               'away_ot1', 'away_ot2', 'away_ot3', 'away_ot4', 
               'away_score', 'away_min', 'away_fg', 'away_fga', 
               'away_3p', 'away_3pa', 'away_ft','away_fta',
               'away_or', 'away_dr', 'away_total_reb', 
               'away_assists', 'away_fouls', 'away_steals', 
               'away_turnovers','away_blocks', 
               'away_off_eff',
               'away_def_eff']
    #create empty data frame to put in results
    away_vars = pd.DataFrame()
    
    for team in teams:
        print(team)
        df_team = df[df['away_team'] == team]
        
        for column in columns:
            column_new = 'mov_5_' + column
            df_team[column_new] = df_team[column].rolling(5).mean().shift(1)
        
        away_vars = away_vars.append(df_team)
    
        
    return away_vars



In [24]:
nba2006_trans = merge_home_away(nba2006)

NameError: name 'true' is not defined

In [16]:
nba2006_trans['line_cv'] = nba2006_trans.away_score - nba2006_trans.f - nba2006_trans.spread

In [17]:
nba2006_trans.head(10).T

Unnamed: 0,556,1159,212,583,340,1087,226,413,1034,308
index,1112,2318,424,1166,680,2174,452,826,2068,616
dataset,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season,2006-2007 Regular Season
date,01/15/07,04/10/07,11/29/06,01/19/07,12/16/06,04/01/07,12/01/06,12/27/06,03/25/07,12/12/06
teams,Boston,Boston,Charlotte,Charlotte,Chicago,Chicago,Cleveland,Cleveland,Dallas,Denver
venue,Road,Road,Road,Road,Road,Road,Road,Road,Road,Road
1q,15,28,16,21,24,32,32,28,39,22
2q,25,31,25,32,27,20,30,22,19,21
3q,26,19,31,22,24,23,17,16,17,28
4q,30,18,18,21,18,30,27,23,29,29
ot1,,,,,13,,,,,


In [18]:
nba2006_trans['cover'] = nba2006_trans['line_cv'].map(lambda x: 1 if x >0 else 0)

In [19]:
nba2006_trans.cover.value_counts(normalize = True)

0    0.537051
1    0.462949
Name: cover, dtype: float64

In [20]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.grid_search import GridSearchCV
ad = AdaBoostClassifier()
ad_params = {'n_estimators' : [40, 50, 30],
             'learning_rate' : [.6,.7,.8]
             }

features = ['spread','mov_3_1q', 'mov_3_2q', 'mov_3_3q', 'mov_3_4q', 
         
            'mov_3_f', 'mov_3_min', 'mov_3_fg', 'mov_3_fga',
            'mov_3_3p', 'mov_3_3pa', 'mov_3_ft', 'mov_3_fta', 'mov_3_or',
            'mov_3_dr', 'mov_3_tot', 'mov_3_a', 'mov_3_pf', 'mov_3_st', 
            'mov_3_to',
       'mov_3_pts', 'mov_3_poss', 'mov_3_pace', 'mov_3_oeff','mov_3_deff', 
            'mov_3_bl',
       'mov_3_away_1q', 'mov_3_away_2q',
       'mov_3_away_3q', 'mov_3_away_4q' , 'mov_3_away_score',
       'mov_3_away_min', 'mov_3_away_fg', 'mov_3_away_fga', 'mov_3_away_3p',
       'mov_3_away_3pa', 'mov_3_away_ft', 'mov_3_away_fta', 'mov_3_away_or',
       'mov_3_away_dr', 'mov_3_away_total_reb', 'mov_3_away_assists', 'mov_3_away_fouls',
       'mov_3_away_steals', 'mov_3_away_turnovers', 'mov_3_away_blocks',
       'mov_3_away_off_eff', 'mov_3_away_def_eff', 'cover']

rid = nba2006_trans[features].dropna()

features2 = ['spread','mov_3_1q', 'mov_3_2q', 'mov_3_3q', 'mov_3_4q', 
             
            'mov_3_f', 'mov_3_min', 'mov_3_fg', 'mov_3_fga',
            'mov_3_3p', 'mov_3_3pa', 'mov_3_ft', 'mov_3_fta', 'mov_3_or',
            'mov_3_dr', 'mov_3_tot', 'mov_3_a', 'mov_3_pf', 'mov_3_st', 
            'mov_3_to',
       'mov_3_pts', 'mov_3_poss', 'mov_3_pace', 'mov_3_oeff','mov_3_deff', 
            'mov_3_bl',
       'mov_3_away_1q', 'mov_3_away_2q',
       'mov_3_away_3q', 'mov_3_away_4q', 
       'mov_3_away_min', 'mov_3_away_fg', 'mov_3_away_fga', 'mov_3_away_3p',
       'mov_3_away_3pa', 'mov_3_away_ft', 'mov_3_away_fta', 'mov_3_away_or',
       'mov_3_away_dr', 'mov_3_away_total_reb', 'mov_3_away_assists', 'mov_3_away_fouls',
       'mov_3_away_steals', 'mov_3_away_turnovers', 'mov_3_away_blocks',
       'mov_3_away_off_eff', 'mov_3_away_def_eff']


X = rid[features2]
y = rid['cover']






In [21]:
from sklearn.cross_validation import train_test_split, cross_val_score

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [34]:
gs = GridSearchCV(ad, 
                  param_grid=ad_params,
                  scoring= "accuracy")

gs.fit(X_train,y_train)
gs.best_score_


0.5134818288393904

In [35]:
gs.best_params_

{'learning_rate': 0.6, 'n_estimators': 30}

In [36]:
y_hat_train = gs.predict(X_train)
y_hat_test = gs.predict(X_test)
y_hat_train_proba = gs.predict_proba(X_train)
y_hat_test_proba  = gs.predict_proba(X_test)

In [39]:
cross_val_score(ad,X_train, y_train, cv = 4)

array([0.43457944, 0.42723005, 0.4600939 , 0.50704225])

In [40]:
gs.score(X_test, y_test)

0.5052631578947369

In [41]:
rid.cover.value_counts(normalize = True)

0    0.540422
1    0.459578
Name: cover, dtype: float64

In [42]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report( y_train, y_hat_train))

print(classification_report( y_test, y_hat_test))

             precision    recall  f1-score   support

          0       0.63      0.86      0.73       460
          1       0.71      0.42      0.53       393

avg / total       0.67      0.66      0.64       853

             precision    recall  f1-score   support

          0       0.53      0.72      0.61       155
          1       0.43      0.25      0.31       130

avg / total       0.48      0.51      0.48       285



In [29]:
#modifying train and testing data to allow for changing prob thresholds and printing the 
#roc curves
X_train.loc[:, 'actual_y'] = y_train
X_train.loc[:, 'predicted_label'] = y_hat_train
X_train.loc[:, 'predicted_proba'] = y_hat_train_proba[:, 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [30]:

X_test.loc[:, 'actual_y'] = y_test
X_test.loc[:, 'predicted_label'] = y_hat_test
X_test.loc[:, 'predicted_proba'] = y_hat_test_proba[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
#print out confusion matrices from .1 .5 by .02 for Xtest and Xtrain
for prob in range(490, 500, 1):
    proba = prob/1000
    
    X_test.loc[:, 'predicted_label']= X_test['predicted_proba'].map(
    lambda p: 1 if p > proba else 0)
    
    X_train.loc[:, 'predicted_label']= X_train['predicted_proba'].map(
    lambda p: 1 if p > proba else 0)
    
    X_test['predicted_label']
    print('Test  ' + str(proba))
    print(confusion_matrix(y_test, X_test['predicted_label']))
    print(' Train  ' + str(proba))
    print(confusion_matrix(y_train, X_train['predicted_label']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Test  0.49
[[ 10 136]
 [ 17 122]]
 Train  0.49
[[ 80 389]
 [ 15 369]]
Test  0.491
[[ 13 133]
 [ 23 116]]
 Train  0.491
[[109 360]
 [ 25 359]]
Test  0.492
[[ 16 130]
 [ 25 114]]
 Train  0.492
[[120 349]
 [ 27 357]]
Test  0.493
[[ 24 122]
 [ 33 106]]
 Train  0.493
[[147 322]
 [ 35 349]]
Test  0.494
[[ 42 104]
 [ 42  97]]
 Train  0.494
[[200 269]
 [ 53 331]]
Test  0.495
[[ 44 102]
 [ 44  95]]
 Train  0.495
[[213 256]
 [ 62 322]]
Test  0.496
[[52 94]
 [54 85]]
 Train  0.496
[[250 219]
 [ 84 300]]
Test  0.497
[[73 73]
 [77 62]]
 Train  0.497
[[316 153]
 [123 261]]
Test  0.498
[[75 71]
 [81 58]]
 Train  0.498
[[328 141]
 [129 255]]
Test  0.499
[[89 57]
 [95 44]]
 Train  0.499
[[371  98]
 [175 209]]
