# Predicting the Winner of NBA Games Using Team Statistics

In [2]:
#Imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
np.random.seed(42)

In [5]:
# Load the per-game records from the CSV file into a DataFrame.
GAMES_CSV = 'games.csv'
games = pd.read_csv(GAMES_CSV)
# A bare last expression lets the notebook render the frame as a table.
games

Unnamed: 0,GAME_DATE,HOME_TEAM_NAME,VISITOR_TEAM_NAME,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,3/12/2022,Miami Heat,Minnesota Timberwolves,2021,104,0.398,0.760,0.333,23,53,113,0.422,0.875,0.357,21,46,0
1,3/12/2022,Chicago Bulls,Cleveland Cavaliers,2021,101,0.443,0.933,0.429,20,46,91,0.419,0.824,0.208,19,40,1
2,3/12/2022,San Antonio Spurs,Indiana Pacers,2021,108,0.412,0.813,0.324,28,52,119,0.489,1.000,0.389,23,47,0
3,3/12/2022,Golden State Warriors,Milwaukee Bucks,2021,122,0.484,0.933,0.400,33,55,109,0.413,0.696,0.386,27,39,1
4,3/12/2022,Denver Nuggets,Toronto Raptors,2021,115,0.551,0.750,0.407,32,39,127,0.471,0.760,0.387,28,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24406,10/22/2004,Detroit Pistons,Washington Wizards,2004,100,0.549,0.810,0.500,24,34,82,0.411,0.724,0.071,15,33,1
24407,10/22/2004,Boston Celtics,Brooklyn Nets,2004,83,0.431,0.706,0.273,14,34,96,0.449,0.885,0.231,21,35,0
24408,10/22/2004,Charlotte Hornets,Portland Trailblazers,2004,69,0.377,0.571,0.300,24,36,63,0.311,0.741,0.217,9,44,1
24409,10/22/2004,Utah Jazz,Sacramento Kings,2004,103,0.507,0.641,0.667,25,43,88,0.362,0.814,0.250,16,33,1


X contains only the numerical team statistics. Points scored are excluded, because points always determine the winner.

y is 1 if the home team wins, 0 if the away team wins.

In [3]:
# Feature columns are selected by name rather than by position, so the
# selection does not silently shift if the column order of the CSV changes.
# PTS_home / PTS_away are deliberately left out: points decide the winner.
FEATURE_COLUMNS = [
    'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
    'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
]
X = games.loc[:, FEATURE_COLUMNS]

# Target: 1 if the home team won the game, 0 if the away team won.
y = games.loc[:, 'HOME_TEAM_WINS']
X.head()

Unnamed: 0,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,0.398,0.76,0.333,23,53,0.422,0.875,0.357,21,46
1,0.443,0.933,0.429,20,46,0.419,0.824,0.208,19,40
2,0.412,0.813,0.324,28,52,0.489,1.0,0.389,23,47
3,0.484,0.933,0.4,33,55,0.413,0.696,0.386,27,39
4,0.551,0.75,0.407,32,39,0.471,0.76,0.387,28,50


# Logistic Regression

In [4]:
# Hold out 20% of the games for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train on the underlying NumPy arrays (.values); the later cells also
# query the model with plain nested lists rather than DataFrames.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train.values, y_train)

# Score the held-out games and summarise the hits/misses per class.
y_pred = model.predict(X_test.values)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[1539  458]
 [ 315 2571]]


In [5]:
# Unpack the 2x2 confusion matrix row by row:
# row 0 = actual away wins (0), row 1 = actual home wins (1).
# TN is unpacked for completeness; none of the metrics below use it.
(TN, FP), (FN, TP) = cm

# Precision: share of predicted home wins that were real home wins.
Precision = TP / (TP + FP)
# Recall: share of real home wins that the model found.
Recall = TP / (TP + FN)
# F1: harmonic mean of precision and recall.
F1Score = 2 * ((Precision * Recall) / (Precision + Recall))

print('Precision: ', Precision, 'Recall: ', Recall, 'F1Score: ', F1Score)

Precision:  0.8487949818421922 Recall:  0.8908523908523909 F1Score:  0.869315300084531


In [6]:
# y holds 1 for a home win and 0 for an away win, so its mean is the
# fraction of games won by the home team; scale it to a percentage.
home_win_percentage = 100 * y.mean()
print(home_win_percentage)

58.805456556470446


Since the home team wins over 58% of the time, the model should favor the home team when the home and away statistics in X are identical, even though home/away win percentage isn't included as a feature. This is tested below by predicting the winner when the away and home stats are all equal. We then raise the away team's FG% by one percentage point (.45 to .46), and the home team is still picked to win. However, if we raise the away team's FG% to .47, the away team is picked to win. So, we can see that the home team is favored by the model, but only slightly.

In [7]:
# Hypothetical stat lines in the same column order as X: five home stats
# (FG%, FT%, 3P%, AST, REB) followed by the same five for the away team.
# Only the away FG% varies between the three probes.
home_line = [.45, .78, .27, 30, 40]
away_rest = [.78, .27, 30, 40]  # away FT%, 3P%, AST, REB

# Identical stat lines for both teams.
same_stats = model.predict([home_line + [.45] + away_rest])
print(same_stats)

# Away FG% raised from .45 to .46.
away_one_higher_fg = model.predict([home_line + [.46] + away_rest])
print(away_one_higher_fg)

# Away FG% raised from .45 to .47.
away_two_higher_fg = model.predict([home_line + [.47] + away_rest])
print(away_two_higher_fg)

[1]
[1]
[0]


Below is sample data from four NBA preseason games that took place at the beginning of October 2022. These games are not included in the dataset, which only contains games through March 12, 2022. We can use the recorded team stats from these 4 games to see if the model correctly predicts the winner. In all four cases below, the model picked the correct winner.

In [8]:
# Stat lines for the four preseason games, in the same column order as X
# (five home stats, then five away stats). Each is a single-row batch.
kings_lakers_stats = [[.312, .625, .233, 17, 49, .408, .632, .231, 25, 54]]
sixers_nets_stats = [[.462, .735, .333, 25, 43, .407, .960, .341, 26, 56]]
blazers_clippers_stats = [[.383, .758, .395, 21, 44, .397, .771, .286, 18, 46]]
thunder_nuggets_stats = [[.411, .739, .303, 20, 43, .488, .680, .371, 25, 52]]

# For every game, print the known result first and the model's pick second
# (1 = home team wins, 0 = away team wins).
kings_at_lakers = model.predict(kings_lakers_stats)
print("Expected: 0")
print(kings_at_lakers)

sixers_at_nets = model.predict(sixers_nets_stats)
print("Expected: 0")
print(sixers_at_nets)

trailblazers_at_clippers = model.predict(blazers_clippers_stats)
print("Expected: 1")
print(trailblazers_at_clippers)

thunder_at_nuggets = model.predict(thunder_nuggets_stats)
print("Expected: 0")
print(thunder_at_nuggets)

Expected: 0
[0]
Expected: 0
[0]
Expected: 1
[1]
Expected: 0
[0]


Below are the corresponding probabilities for the predictions above. In each row, the first value is the probability of class 0 (away team winning) and the second value is the probability of class 1 (home team winning).

In [9]:
# Class probabilities for the same four preseason stat lines, printed in
# the same order as the predictions: Kings at Lakers, Sixers at Nets,
# Trailblazers at Clippers, Thunder at Nuggets.
preseason_rows = (
    [.312, .625, .233, 17, 49, .408, .632, .231, 25, 54],
    [.462, .735, .333, 25, 43, .407, .960, .341, 26, 56],
    [.383, .758, .395, 21, 44, .397, .771, .286, 18, 46],
    [.411, .739, .303, 20, 43, .488, .680, .371, 25, 52],
)
for stat_line in preseason_rows:
    # predict_proba expects a 2-D batch, so wrap the single row in a list.
    print(model.predict_proba([stat_line]))

[[0.93546874 0.06453126]]
[[0.70355317 0.29644683]]
[[0.371295 0.628705]]
[[0.92316702 0.07683298]]


# Decision Tree Classifier

In [16]:
# Depth-limited decision tree using entropy as the split criterion,
# trained on the same split as the logistic regression model.
tree_model = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=7,
    random_state=42,
).fit(X_train.values, y_train)

# Evaluate on the held-out games; note that this overwrites `cm`.
tree_y_pred = tree_model.predict(X_test.values)
cm = confusion_matrix(y_true=y_test, y_pred=tree_y_pred)
print(cm)

[[1420  577]
 [ 404 2482]]


Here we run the same four preseason predictions with the Decision Tree model, along with the probabilities. The second game (Sixers at Nets) was incorrectly predicted as a win for the home team, when in reality the away team won.

In [11]:
# Same four preseason stat lines as before, now scored by the decision tree.
# Each print emits the known result on one line and the tree's pick on the
# next (1 = home team wins, 0 = away team wins).
kings_at_lakers = tree_model.predict(
    [[.312, .625, .233, 17, 49, .408, .632, .231, 25, 54]]
)
print("Expected: 0", kings_at_lakers, sep="\n")

sixers_at_nets = tree_model.predict(
    [[.462, .735, .333, 25, 43, .407, .960, .341, 26, 56]]
)
print("Expected: 0", sixers_at_nets, sep="\n")

trailblazers_at_clippers = tree_model.predict(
    [[.383, .758, .395, 21, 44, .397, .771, .286, 18, 46]]
)
print("Expected: 1", trailblazers_at_clippers, sep="\n")

thunder_at_nuggets = tree_model.predict(
    [[.411, .739, .303, 20, 43, .488, .680, .371, 25, 52]]
)
print("Expected: 0", thunder_at_nuggets, sep="\n")

Expected: 0
[0]
Expected: 0
[1]
Expected: 1
[1]
Expected: 0
[0]


In [12]:
# Decision-tree class probabilities for the four preseason stat lines,
# in the same game order as the predictions.
tree_probe_rows = [
    [.312, .625, .233, 17, 49, .408, .632, .231, 25, 54],
    [.462, .735, .333, 25, 43, .407, .960, .341, 26, 56],
    [.383, .758, .395, 21, 44, .397, .771, .286, 18, 46],
    [.411, .739, .303, 20, 43, .488, .680, .371, 25, 52],
]
for stat_line in tree_probe_rows:
    # Wrap the row in a list to form the 2-D batch predict_proba expects.
    print(tree_model.predict_proba([stat_line]))

[[1. 0.]]
[[0.24074074 0.75925926]]
[[0.34285714 0.65714286]]
[[0.9112628 0.0887372]]


In [13]:
# Re-score the held-out games with the decision tree so that `cm` holds
# the tree's confusion matrix for the metric calculation that follows.
tree_y_pred = tree_model.predict(X_test.values)
cm = confusion_matrix(y_true=y_test, y_pred=tree_y_pred)
print(cm)

[[1420  577]
 [ 404 2482]]


In [14]:
# Flatten the 2x2 confusion matrix in row-major order:
# [actual 0 / predicted 0, actual 0 / predicted 1,
#  actual 1 / predicted 0, actual 1 / predicted 1].
# TN is kept for completeness but is not used by the metrics below.
TN, FP, FN, TP = cm.ravel()

# Precision = TP / predicted positives; Recall = TP / actual positives.
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
# F1 is the harmonic mean of the two.
F1Score = 2 * ((Precision * Recall) / (Precision + Recall))

print('Precision: ', Precision, 'Recall: ', Recall, 'F1Score: ', F1Score)

Precision:  0.8113762667538411 Recall:  0.86001386001386 F1Score:  0.8349873843566021


## Analysis

Logistic Regression Model - Precision: 0.849 Recall: 0.891 F1Score: 0.869

Decision Tree Classifier  - Precision: 0.811 Recall: 0.860 F1Score: 0.835

Comparing the Logistic Regression and Decision Tree Classifier models, the Logistic Regression model performs better on the test set: its Precision, Recall, and F1Score were all higher than those of the Decision Tree Classifier. The Logistic Regression model also correctly classified all four preseason games, while the Decision Tree got one incorrect.