In [59]:
import pandas as pd
from collections import defaultdict

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
input_file = 'leagues_nba_2014_games.txt' # Source: https://www.basketball-reference.com/leagues/NBA_2014_games.html

In [3]:
# Load data
df = pd.read_csv(input_file, parse_dates = ['Date'])

# Rename existing columns
df.rename(columns = {'Visitor/Neutral': 'Visitor Team',
                     'Home/Neutral': 'Home Team',
                     'Unnamed: 6': 'Score Type',
                     'Unnamed: 7': 'OT?',
                     'PTS': 'VisitorPts',
                     'PTS.1': 'HomePts'}, inplace = True)
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,


In [4]:
df.describe()

Unnamed: 0,VisitorPts,HomePts
count,1319.0,1319.0
mean,99.603487,102.216831
std,11.684056,11.888199
min,66.0,63.0
25%,91.0,94.0
50%,99.0,102.0
75%,107.0,110.0
max,145.0,143.0


In [5]:
# Create a new column `HomeWin` which is boolean that indicates if the Home is the winning
# team or Visitor
df['HomeWin'] = df['HomePts'] > df['VisitorPts']
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes,HomeWin
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,,True
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,,True
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,,True
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,,True
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,,True


In [14]:
# Store the team's last result
won_last = defaultdict(int)

df['HomeLastWin'] = False
df['VisitorLastWin'] = False
for i, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']

    row['HomeLastWin'] = won_last[home_team]
    row['VisitorLastWin'] = won_last[visitor_team]
    
    df.loc[index, 'HomeLastWin'] = won_last[home_team]
    df.loc[index, 'VisitorLastWin'] = won_last[visitor_team]
    

    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']

# Print the data for the row 20 to 25
print(df.loc[20:25,])

         Date Start (ET)            Visitor Team  VisitorPts  \
20 2013-11-01    7:30 pm         Milwaukee Bucks         105   
21 2013-11-01    8:00 pm              Miami Heat         100   
22 2013-11-01    7:00 pm     Cleveland Cavaliers          84   
23 2013-11-01    9:00 pm  Portland Trail Blazers         113   
24 2013-11-01    8:00 pm        Dallas Mavericks         105   
25 2013-11-01   10:30 pm       San Antonio Spurs          91   

             Home Team  HomePts Score Type  OT? Notes  HomeWin  HomeLastWin  \
20      Boston Celtics       98  Box Score  NaN   NaN    False        False   
21       Brooklyn Nets      101  Box Score  NaN   NaN     True        False   
22   Charlotte Bobcats       90  Box Score  NaN   NaN     True        False   
23      Denver Nuggets       98  Box Score  NaN   NaN    False        False   
24     Houston Rockets      113  Box Score  NaN   NaN     True        False   
25  Los Angeles Lakers       85  Box Score  NaN   NaN    False        False  

In [15]:
# Create a new classifier
clf = DecisionTreeClassifier(random_state=4)

In [16]:
X_previous_wins = df[['HomeLastWin', 'VisitorLastWin']].values
y_true = df['HomeWin'].values

In [17]:
scores = cross_val_score(clf, X_previous_wins, y_true, scoring='accuracy')
print('Accuracy: {:.1f}%'.format(np.mean(scores) * 100))

Accuracy: 57.9%


In [18]:
# Load the standings data
standings_filename = 'leagues_nba_2013_standings.txt'

In [19]:
standings = pd.read_csv(standings_filename, skiprows=[0])
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [20]:
df['HomeTeamRanksHigher'] = 0

# Note that iterrows create a copy, so you can update the column with it
for index, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    
    # Correct the 2013 team name that has been renamed in 2014
    if home_team == 'New Orleans Pelicans':
        home_team = 'New Orleans Hornets'
    elif visitor_team == 'New Orleans Pelicans':
        visitor_team = 'New Orleans Hornets'
    
    home_rank = standings[standings['Team'] == home_team]['Rk'].values[0]
    visitor_rank = standings[standings['Team'] == visitor_team]['Rk'].values[0]
    
    # Overwrite the values for the column
    df.loc[index, 'HomeTeamRanksHigher'] = int(home_rank > visitor_rank)
print('done')

done


In [23]:
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,,True,False,False,0
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,,True,False,False,1
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,,True,False,False,0
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,,True,False,False,1
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,,True,False,False,1


In [33]:
X_home_higher = df[['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']].values
print(X_home_higher)

[[False False 0]
 [False False 1]
 [False False 0]
 ..., 
 [False False 0]
 [False False 0]
 [True False 1]]


In [36]:
# Create classifier
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print('Accuracy is: {:.1f}%'.format(np.mean(scores) * 100))

Accuracy is: 57.4%


In [39]:
# Create a dictionary that store the winner of the past game and
# create a new feature in our data frame
last_match_winner = defaultdict(int)
df['HomeTeamLastWon'] = 0

for index, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    
    teams = tuple(sorted([home_team, visitor_team]))
    row['HomeTeamWonLast'] = 1 if last_match_winner[teams] == row['Home Team'] else 0 
    
    # Update the column
    df.loc[index, 'HomeTeamWonLast'] = row['HomeTeamWonLast']
    
    winner = row['Home Team'] if row['HomeWin'] else row['Visitor Team']
    last_match_winner[teams] = winner
print('done')

done


In [40]:
X_last_winner = df[['HomeTeamRanksHigher', 'HomeTeamWonLast']].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_last_winner, y_true, scoring='accuracy')
print('Accuracy: {:.1f}%'.format(np.mean(scores) * 100))

Accuracy: 57.8%


In [43]:
# Create a new label encoder to convert between string-based team names into integers
encoding = LabelEncoder()

# Fit the transformer to the home teams
encoding.fit(df['Home Team'].values)

LabelEncoder()

In [45]:
home_teams = encoding.transform(df['Home Team'].values)
visitor_teams = encoding.transform(df['Visitor Team'].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

In [47]:
# Create one hot encoder to encode integers into numbers of integer values
onehot = OneHotEncoder()

In [48]:
X_teams_expanded = onehot.fit_transform(X_teams).todense()

In [50]:
clf = DecisionTreeClassifier(random_state=14)

scores = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')
print('Accuracy is {:.1f}%'.format(np.mean(scores) * 100))

Accuracy is 59.5%


In [56]:
clf = RandomForestClassifier(random_state=14)

scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print('Accuracy is {:.1f}%'.format(np.mean(scores) * 100))

Accuracy is 58.4%


In [58]:
X_all = np.hstack([X_home_higher, X_teams])

clf = RandomForestClassifier(random_state=14)

scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print('Accuracy is {:.1f}%'.format(np.mean(scores) * 100))

Accuracy is 58.5%


In [65]:
# Perform grid search
parameter_space = {
    'max_features': [2, 10, 'auto'],
    'n_estimators': [100,],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6]
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_teams_expanded, y_true)
print('Accuracy of grid search: {0:.1f}%'.format(grid.best_score_ * 100))

Accuracy of grid search: 65.0%


In [66]:
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=14, verbose=0, warm_start=False)
