In [68]:
import pandas as pd
from collections import defaultdict

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split

In [8]:
input_file = 'leagues_nba_2014_games.txt' # Source: https://www.basketball-reference.com/leagues/NBA_2014_games.html

In [26]:
# Load data
df = pd.read_csv(input_file, parse_dates = ['Date'])

# Rename existing columns
df.rename(columns = {'Visitor/Neutral': 'Visitor Team',
                     'Home/Neutral': 'Home Team',
                     'Unnamed: 6': 'Score Type',
                     'Unnamed: 7': 'OT?',
                     'PTS': 'VisitorPts',
                     'PTS.1': 'HomePts'}, inplace = True)
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,


In [28]:
df.describe()

Unnamed: 0,VisitorPts,HomePts
count,1319.0,1319.0
mean,99.603487,102.216831
std,11.684056,11.888199
min,66.0,63.0
25%,91.0,94.0
50%,99.0,102.0
75%,107.0,110.0
max,145.0,143.0


In [33]:
# Create a new column `HomeWin` which is boolean that indicates if the Home is the winning
# team or Visitor
df['HomeWin'] = df['HomePts'] > df['VisitorPts']
df.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Notes,HomeWin
0,2013-10-29,7:00 pm,Orlando Magic,87,Indiana Pacers,97,Box Score,,,True
1,2013-10-29,10:30 pm,Los Angeles Clippers,103,Los Angeles Lakers,116,Box Score,,,True
2,2013-10-29,8:00 pm,Chicago Bulls,95,Miami Heat,107,Box Score,,,True
3,2013-10-30,7:00 pm,Brooklyn Nets,94,Cleveland Cavaliers,98,Box Score,,,True
4,2013-10-30,8:30 pm,Atlanta Hawks,109,Dallas Mavericks,118,Box Score,,,True


In [54]:
# Store the team's last result
won_last = defaultdict(int)

df['HomeLastWin'] = False
df['VisitorLastWin'] = False
for i, row in df.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']

    row['HomeLastWin'] = won_last[home_team]
    row['VisitorLastWin'] = won_last[visitor_team]    

    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']

# Print the data for the row 20 to 25
print(df.loc[20:25,])

         Date Start (ET)            Visitor Team  VisitorPts  \
20 2013-11-01    7:30 pm         Milwaukee Bucks         105   
21 2013-11-01    8:00 pm              Miami Heat         100   
22 2013-11-01    7:00 pm     Cleveland Cavaliers          84   
23 2013-11-01    9:00 pm  Portland Trail Blazers         113   
24 2013-11-01    8:00 pm        Dallas Mavericks         105   
25 2013-11-01   10:30 pm       San Antonio Spurs          91   

             Home Team  HomePts Score Type  OT? Notes  HomeWin  HomeLastWin  \
20      Boston Celtics       98  Box Score  NaN   NaN    False        False   
21       Brooklyn Nets      101  Box Score  NaN   NaN     True        False   
22   Charlotte Bobcats       90  Box Score  NaN   NaN     True        False   
23      Denver Nuggets       98  Box Score  NaN   NaN    False        False   
24     Houston Rockets      113  Box Score  NaN   NaN     True        False   
25  Los Angeles Lakers       85  Box Score  NaN   NaN    False        False  

In [57]:
# Create a new classifier
clf = DecisionTreeClassifier(random_state=4)

In [65]:
X_previous_wins = df[['HomeLastWin', 'VisitorLastWin']].values
y_true = df['HomeWin'].values

In [70]:
scores = cross_val_score(clf, X_previous_wins, y_true, scoring='accuracy')
print('Accuracy: {:.1f}%'.format(np.mean(scores) * 100))

Accuracy: 57.9%
