In [152]:
%matplotlib inline

import numpy as np
import pandas as pd

from collections import defaultdict

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


from scipy.sparse import hstack # np.vstack/hstack does not work.

In [37]:
# Load the game log; the "Date" column is converted to datetimes while reading.
df = pd.read_csv(
    'leagues_nba_2014_games.txt',
    parse_dates=['Date'],
)
df.head(3)  # preview the first three rows

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Notes
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,


In [40]:
# Snapshot the raw column labels so they can be paired with friendlier names.
original_columns = list(df.columns)
original_columns

['Date',
 'Start (ET)',
 'Visitor/Neutral',
 'PTS',
 'Home/Neutral',
 'PTS.1',
 'Unnamed: 6',
 'Unnamed: 7',
 'Notes']

In [46]:
# Replacement labels, listed in the same order as `original_columns`.
renamed_columns = [
    'Date', 'Start (ET)',
    'Visitor Team', 'VisitorPts',
    'Home Team', 'HomePts',
    'Score Type', 'OT?', 'Notes',
]
# Pair every raw label with its replacement and rename in a single pass.
df = df.rename(
    columns={old: new for old, new in zip(original_columns, renamed_columns)}
)
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,


In [47]:
# Derived target column: True when the home side outscored the visitors.
df['HomeWin'] = df['HomePts'].gt(df['VisitorPts'])

In [50]:
# Class labels passed to the estimators: one home-win flag per game.
y_true = df['HomeWin'].to_numpy()
y_true

array([ True,  True,  True, ..., False, False,  True])

In [66]:
# For every game, record whether each side won its own previous game.
# A team with no earlier game in the data counts as "did not win" (False),
# so the default is a real boolean rather than the integer 0.
won_last = defaultdict(bool)
home_last_win = []
visitor_last_win = []

# Walk the games in file order: read each team's last result *before*
# overwriting it with the outcome of the current game.
for home, visitor, home_win in zip(df['Home Team'], df['Visitor Team'], df['HomeWin']):
    home_last_win.append(won_last[home])
    visitor_last_win.append(won_last[visitor])

    won_last[home] = bool(home_win)
    won_last[visitor] = not home_win

# Create both columns explicitly. Writing whole rows back with
# `df.loc[index] = row` cannot add columns that do not exist yet, so the
# original cell only worked when the columns had been created by earlier,
# no-longer-visible kernel state.
df['HomeLastWin'] = home_last_win
df['VisitorLastWin'] = visitor_last_win

True

True

In [67]:
# Spot-check a few rows (positions 20-24) to see the new last-result columns.
df.iloc[20:25, :]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,2013-11-01,7:30 pm,Milwaukee Bucks,105,Boston Celtics,98,Box Score,,,False,False,False
21,2013-11-01,8:00 pm,Miami Heat,100,Brooklyn Nets,101,Box Score,,,True,False,False
22,2013-11-01,7:00 pm,Cleveland Cavaliers,84,Charlotte Bobcats,90,Box Score,,,True,False,True
23,2013-11-01,9:00 pm,Portland Trail Blazers,113,Denver Nuggets,98,Box Score,,,False,False,False
24,2013-11-01,8:00 pm,Dallas Mavericks,105,Houston Rockets,113,Box Score,,,True,True,True


In [69]:
# Baseline model: predict the winner from each side's previous result only.
X_previous_wins = df[['HomeLastWin', 'VisitorLastWin']].to_numpy()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(estimator=clf, X=X_previous_wins, y=y_true, cv=5)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 56.18%


In [79]:
# Load the standings table, skipping the file's first line before the header is read.
df_standings = pd.read_csv(
    'leagues_nba_2013_standings.txt',
    skiprows=1,
)
df_standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [80]:
# Flag the games in which the home team holds the better position in the
# standings table. In that table `Rk` == 1 is the top team, so "ranks higher"
# means a numerically *smaller* `Rk`; the earlier `home_rank > visitor_rank`
# comparison produced the opposite of what the column name says.

# The team was renamed between the 2013 and 2014 seasons!
# But it was still the same team.
team_renames = {'New Orleans Pelicans': 'New Orleans Hornets'}

# Build one Team -> Rk lookup instead of filtering the standings per game.
# Keeping the first occurrence of a team matches the old `.values[0]` pick.
rank_by_team = df_standings.drop_duplicates('Team').set_index('Team')['Rk']

# Apply the rename to both sides independently, then look up each rank.
home_rank = df['Home Team'].replace(team_renames).map(rank_by_team)
visitor_rank = df['Visitor Team'].replace(team_renames).map(rank_by_team)

# Fail loudly (as the per-row lookup used to) when a team has no standing,
# rather than letting NaN ranks silently compare as False.
unranked = sorted(
    set(df.loc[home_rank.isna(), 'Home Team'])
    | set(df.loc[visitor_rank.isna(), 'Visitor Team']),
    key=str,
)
if unranked:
    raise KeyError(f'Teams missing from the standings table: {unranked}')

# 1 when the home team is ranked above the visitor, otherwise 0.
df['HomeTeamRanksHigher'] = (home_rank < visitor_rank).astype(int)
True

True

In [81]:
# Feature matrix: both last-result flags plus the standings comparison.
X_home_higher = df[
    ['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']
].to_numpy()

In [82]:
# Score a fresh decision tree on the last-result + standings features.
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(estimator=clf, X=X_home_higher, y=y_true, cv=5)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 60.04%


In [83]:
# Placeholder column; the head-to-head loop fills it in game by game.
df['HomeTeamLastWon'] = 0
# Most recent winner for each pairing of teams; unseen pairings default to 0.
last_match_winner = defaultdict(int)

In [87]:
# Head-to-head feature: did the home team win the last time these two sides met?
#
# The lookup is rebuilt inside this cell so that re-running the cell always
# starts from an empty history. Reusing a dictionary populated by an earlier
# run would leak end-of-season winners into the first meetings of the season.
last_match_winner = {}
home_team_last_won = []

for home, visitor, home_win in zip(df['Home Team'], df['Visitor Team'], df['HomeWin']):
    # Sort the pair so that A-vs-B and B-vs-A share a single key.
    teams = tuple(sorted([home, visitor]))

    # `.get` yields None for a first meeting, which never equals a team name,
    # so first meetings are recorded as 0.
    home_team_last_won.append(int(last_match_winner.get(teams) == home))

    # Remember who won this game for the next time the pair meets.
    last_match_winner[teams] = home if home_win else visitor

# Assign the whole column once instead of writing every row back into the frame.
df['HomeTeamLastWon'] = home_team_last_won

In [89]:
# Feature matrix: standings comparison plus the head-to-head flag.
X_last_winner = df[
    ['HomeTeamRanksHigher', 'HomeTeamLastWon']
].to_numpy()

In [90]:
# Score a fresh decision tree on the standings + head-to-head features.
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(estimator=clf, X=X_last_winner, y=y_true, cv=5)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 61.33%


In [119]:
# Integer-encode the team names, one column for the home side and one for
# the visitors. `X_teams` is read by the random-forest cells further down,
# so this cell must actually run: left commented out, a fresh
# "Restart & Run All" stops with a NameError on `X_teams`.
encoder = LabelEncoder()
encoder.fit(df['Home Team'].values)
home_teams = encoder.transform(df['Home Team'].values)
visitor_teams = encoder.transform(df['Visitor Team'].values)
# Stack the two label vectors as rows, then transpose to (n_games, 2).
X_teams = np.vstack([home_teams, visitor_teams]).T
home_teams, X_teams

(array([11, 13, 15, ..., 15, 15, 26]), array([[11, 21],
        [13, 12],
        [15,  4],
        ...,
        [15, 26],
        [15, 26],
        [26, 15]]))

In [147]:
# One-hot encode both team columns with a single encoder fitted on the home teams.
encoder = OneHotEncoder()
home_teams = encoder.fit_transform(df[['Home Team']].values)
visitor_teams = encoder.transform(df[['Visitor Team']].values)
# Place the home and visitor indicator blocks side by side (scipy.sparse hstack).
X_teams_expanded = hstack([home_teams, visitor_teams])

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(estimator=clf, X=X_teams_expanded, y=y_true, cv=5)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 60.57%


In [150]:
# Random forest scored on the team-identity matrix `X_teams`.
clf = RandomForestClassifier(n_estimators=100, random_state=14)
scores = cross_val_score(
    estimator=clf, X=X_teams, y=y_true, cv=5, scoring='accuracy'
)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 58.60%


In [157]:
# Give the forest more to work with: the engineered features and the team
# identities joined column-wise into one matrix.
X_all = np.concatenate([X_home_higher, X_teams], axis=1)
clf = RandomForestClassifier(n_estimators=100, random_state=14)
scores = cross_val_score(
    estimator=clf, X=X_all, y=y_true, cv=5, scoring='accuracy'
)
print(f'Accuracy: {np.mean(scores) * 100:.2f}%')

Accuracy: 60.35%


In [158]:
X_all.shape # (n_samples, n_features); the second entry bounds `max_features`

(1319, 5)

In [159]:
# Candidate hyper-parameters; GridSearchCV evaluates every combination.
parameter_space = dict(
    max_features=[2, 3, 5],
    n_estimators=[100, 1000],
    criterion=['gini', 'entropy'],
    min_samples_leaf=[2, 4, 6],
)
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(estimator=clf, param_grid=parameter_space)
grid.fit(X_all, y_true)



AttributeError: 'GridSearchCV' object has no attribute 'best_score'

In [160]:
# Report the fitted grid search's best score as a percentage
# (the attribute is `best_score_`, with a trailing underscore).
print(f'Accuracy: {grid.best_score_ * 100:.2f}%')

Accuracy: 62.93%
