In [109]:
import pandas as pd

path = 'basketball.csv'
dataset = pd.read_csv(path, parse_dates=['Date'])
dataset.columns = [
    'Date', 'Start(ET)', 'Visitor Team', 'VisitorPts', 'Home Team', 'HomePts',
    'OT?', 'Score Type', 'Notes'
]
dataset.head()

Unnamed: 0,Date,Start(ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes
0,2018-10-16,8:00p,Philadelphia 76ers,87,Boston Celtics,105,Box Score,,
1,2018-10-16,10:30p,Oklahoma City Thunder,100,Golden State Warriors,108,Box Score,,
2,2018-10-17,7:00p,Milwaukee Bucks,113,Charlotte Hornets,112,Box Score,,
3,2018-10-17,7:00p,Brooklyn Nets,100,Detroit Pistons,103,Box Score,,
4,2018-10-17,7:00p,Memphis Grizzlies,83,Indiana Pacers,111,Box Score,,


In [110]:
# 提取新特征

dataset['HomeWin'] = dataset['VisitorPts'] < dataset['HomePts']

# 保留成label
y_true = dataset['HomeWin'].values
y_true

array([ True,  True, False, ..., False, False, False])

In [111]:
dataset['HomeWin'].mean()

0.5907012195121951

In [131]:
from collections import defaultdict

won_last = defaultdict(int)
dataset['HomeLastWin'] = 0
dataset['VisitorLastWin'] = 0

# 时间是无序的时候dataset.sort('Date').iterrows()
for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    dataset.at[index, 'HomeLastWin'] = won_last[home_team]
    dataset.at[index, 'VisitorLastWin'] = won_last[visitor_team]
    won_last[home_team] = int(row['HomeWin'])
    won_last[visitor_team] = 1 - int(row['HomeWin'])

X_previouswins = dataset[['HomeLastWin', 'VisitorLastWin']].values
X_previouswins

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [1, 0],
       [1, 0]])

In [113]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

clf = DecisionTreeClassifier(random_state=42)

score = cross_val_score(clf, X_previouswins, y_true, scoring = 'accuracy')
print('score is:{0:.1f}%'.format(np.mean(score)*100))

score is:59.1%


In [126]:
X_previouswins

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [1, 0],
       [1, 0]])

In [132]:
standings = pd.read_excel('standings.xls')

standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Houston Rockets,65-17,34-7,31-10,24-6,41-11,6-4,9-1,9-1,...,21-4,5-3,38-8,5-3,12-1,9-5,10-4,12-0,14-1,3-3
1,2,Toronto Raptors,59-23,34-7,25-16,40-12,19-11,12-4,14-4,14-4,...,18-7,5-7,33-5,4-2,9-5,11-3,10-5,9-2,12-4,4-2
2,3,Golden State Warriors,58-24,29-12,29-12,24-6,34-18,9-1,6-4,9-1,...,14-10,5-1,38-13,5-3,11-3,13-2,11-3,8-3,7-7,3-3
3,4,Boston Celtics,55-27,27-14,28-13,33-19,22-8,12-4,10-8,11-7,...,15-8,11-8,25-9,5-2,14-2,11-6,7-5,7-4,9-4,2-4
4,5,Philadelphia 76ers,52-30,30-11,22-19,34-18,18-12,9-7,11-7,14-4,...,22-5,4-7,31-11,3-4,9-5,5-10,7-5,8-3,13-3,7-0


In [133]:
# 加入上赛季数据
standings = pd.read_excel('standings.xls')
# 建立新的特征（排名情况）
dataset['HomeTeamRanksHigher'] = 0

for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    home_rank = standings[standings['Team'] == home_team]['Rk'].values[0]
    visitor_rank = standings[standings['Team'] == visitor_team]['Rk'].values[0]
    row['HomeTeamRanksHigher'] = int(home_rank > visitor_rank)
    dataset.at[index, 'HomeTeamRanksHigher'] = int(home_rank < visitor_rank)

In [134]:
clf = DecisionTreeClassifier(random_state=42)
X_homehigher = dataset[[
    'HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher'
]].values

score = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print('score is:{0:.1f}%'.format(np.mean(score) * 100))

score is:62.2%


In [143]:
last_match_winner = defaultdict(int)
dataset['HomeTeamWonLast'] = 0
for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    teams = tuple(sorted([home_team, visitor_team]))
    home_team_won_last = 1 if last_match_winner[teams] == row[
        'Home Team'] else 0
    dataset.at[index, 'HomeTeamWonLast'] = home_team_won_last
    winner = row['Home Team'] if row['HomeWin'] else row['Visitor Team']
    last_match_winner[teams] = winner

In [144]:
clf = DecisionTreeClassifier(random_state=42)
X_lastwinner = dataset[[
    'HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher', 'HomeTeamWonLast'
]].values

score = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')
print('score is:{0:.1f}%'.format(np.mean(score) * 100))

score is:61.7%


In [153]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

encoding = LabelEncoder()
onehot = OneHotEncoder()

encoding.fit(dataset['Home Team'].values)
home_teams = encoding.transform(dataset['Home Team'].values)
visitor_teams = encoding.transform(dataset['Visitor Team'].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
X_teams = onehot.fit_transform(X_teams).todense()
X_all = np.hstack([X_lastwinner, X_teams])

In [154]:
clf = DecisionTreeClassifier(random_state=42)
score = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print('score is:{0:.1f}%'.format(np.mean(score) * 100))

score is:60.7%


In [139]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
score = cross_val_score(clf, X_teams, y_true, scoring = 'accuracy')
print('score is:{0:.1f}%'.format(np.mean(score)*100))


score is:61.6%


In [149]:
clf = RandomForestClassifier(random_state=42)
score = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print('score is:{0:.1f}%'.format(np.mean(score) * 100))

score is:63.6%


In [141]:
from sklearn.model_selection import GridSearchCV

parameter_space = {
    'max_features': [2, 10, 'auto'],
    'n_estimators': [100, 200],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6]
}
clf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print('score is:{0:.1f}%'.format(np.mean(score) * 100))

score is:63.6%
