In [127]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import sys
sys.path.insert(0, "./../../")
from src.data_loader import add_recent_stats, load_game_data_zan
pd.set_option('display.max_columns', None)

%reload_ext autoreload
%autoreload 2

In [129]:
# game_data = load_game_data_zan()
# game_data_recent = add_recent_stats(game_data, recent_range=15)

In [186]:
path_game_data_recent = "/home/matej/Documents/Projects/nba-data-mining/data/processed/game-data-recent-15.csv"
game_data_recent = pd.read_csv(path_game_data_recent)
game_data_recent["season_name"] = game_data_recent["season_name"].str.split("-").str[0].astype(int)

* * *
### Analysis of features (AFTER GAME)

Following pairs of features are highly correlated:
- home_final_score & home_fg_made
- visitor_final_score & visitor_fg_made
- home_final_score & home_TSP
- visitor_final_score & visitor_TSP
- home_rebound & visitor_fg_missed
- visitor_rebound & home_fg_missed

Suggested features to be included in the model:
- home_TSP
- home_fg_made (????)
- home_timeout
- visitor TSP
- visitor_fg_made (????)
- visitor_timeout
- home_final_score_diff


In [35]:
correlations = game_data_recent.corr()
correlations_home_win = correlations["home_win"].sort_values()
correlations_home_win[correlations_home_win > 0.2]

home_rebound             0.238655
visitor_timeout          0.285202
visitor_fg_missed        0.328124
home_fg_made             0.347027
home_final_score         0.416214
home_TSP                 0.452732
home_final_score_diff    0.802811
home_win                 1.000000
Name: home_win, dtype: float64

In [36]:
correlations_positive = game_data_recent[["home_rebound", "visitor_timeout", "visitor_fg_missed", "home_fg_made", "home_final_score", "home_TSP", "home_final_score_diff"]].corr()

fig = px.imshow(correlations_positive)
fig.show()

In [34]:
correlations_home_win[correlations_home_win < -0.2]

visitor_final_score_diff        -0.802811
visitor_TSP                     -0.454685
home_timeout                    -0.422791
visitor_final_score             -0.413958
visitor_fg_made                 -0.338052
home_fg_missed                  -0.326205
visitor_scoring_leader_points   -0.236186
visitor_rebound                 -0.233071
Name: home_win, dtype: float64

In [37]:
correlations_negative = game_data_recent[["visitor_rebound", "home_timeout", "home_fg_missed", "visitor_fg_made", "visitor_final_score", "visitor_TSP", "visitor_final_score_diff", "visitor_scoring_leader_points"]].corr()

fig = px.imshow(correlations_negative)
fig.show()

* * *
### Analysis of recent features (BEFORE GAME)

Following pairs of recent features are highly correlated:
- home_recent_fg_made & home_recent_points
- home_recent_fg_made & home_recent_3PT_made
- home_recent_3PT_made & home_recent_points
- home_recent_TSP & home_recent_ft_made

Suggested recent features to be included in the model:
- home_recent_TSP
- home_recent_timeout
- visitor_recent_TSP
- visitor_recent_timeout

In [138]:
data_recent = game_data_recent.loc[:, "home_recent_home_game_ratio" : "visitor_recent_foul"].copy()
data_recent["home_win"] = game_data_recent["home_win"]

In [152]:
correlations_recent = data_recent.corr()
correlations_recent_home_win = correlations_recent["home_win"].sort_values()
correlations_recent_home_win[correlations_recent_home_win > 0.04]

visitor_recent_foul         0.047181
home_recent_ft_made         0.049522
home_recent_3PT_made        0.051208
home_recent_rebound         0.052945
visitor_recent_turnover     0.065748
home_recent_fg_made         0.074790
visitor_recent_fg_missed    0.084823
home_recent_points          0.090952
visitor_recent_timeout      0.103375
home_recent_TSP             0.115796
home_win                    1.000000
Name: home_win, dtype: float64

In [155]:
correlations_recent_subset = data_recent[["visitor_recent_foul", "home_recent_ft_made", "home_recent_3PT_made", "home_recent_rebound",
                                            "visitor_recent_turnover", "home_recent_fg_made", "visitor_recent_fg_missed", "home_recent_points",
                                            "visitor_recent_timeout", "home_recent_TSP"]].corr()

fig = px.imshow(correlations_recent_subset, width=600, height=600)
fig.show()                                            

## Predictions

In [156]:
scaler = MinMaxScaler()
label_encoder = LabelEncoder()
model = LogisticRegression()

### Prediction before the game

In [199]:
# last three seasons
data = game_data_recent[game_data_recent["season_name"] >= 2014][["home_team_id", "visitor_team_id", "home_recent_TSP", "visitor_recent_TSP", "home_recent_timeout", "visitor_recent_timeout",
                                "home_recent_points", "visitor_recent_points", "home_win"]].copy()
data["home_team_id"] = label_encoder.fit_transform(game_data_recent[game_data_recent["season_name"] >= 2014]["home_team_id"])
data["visitor_team_id"] = label_encoder.fit_transform(game_data_recent[game_data_recent["season_name"] >= 2014]["visitor_team_id"])
data[["home_recent_TSP", "home_recent_timeout", "visitor_recent_TSP", "visitor_recent_timeout", "home_recent_points", "visitor_recent_points"]] = scaler.fit_transform(data[["home_recent_TSP", "visitor_recent_TSP", "home_recent_timeout",
                                                                                                                                                                            "visitor_recent_timeout", "home_recent_points", "visitor_recent_points"]])
data

Unnamed: 0,home_team_id,visitor_team_id,home_recent_TSP,visitor_recent_TSP,home_recent_timeout,visitor_recent_timeout,home_recent_points,visitor_recent_points,home_win
16815,3,16,0.696852,0.50,0.506770,0.50,0.317073,0.195122,1
16816,22,5,0.532789,0.25,0.597526,1.00,0.512195,0.512195,1
16817,10,8,0.530567,0.50,0.719699,0.50,0.512195,0.658537,0
16818,29,12,0.464036,0.50,0.767927,0.75,0.365854,0.317073,1
16819,17,18,0.376304,0.75,0.442706,0.75,0.024390,0.341463,1
...,...,...,...,...,...,...,...,...,...
22960,12,23,0.593213,0.50,0.253085,0.50,0.853659,0.560976,0
22961,22,5,0.581376,0.50,0.420659,0.75,0.609756,0.682927,1
22962,6,13,0.294328,0.50,0.467019,0.50,0.439024,0.585366,1
22963,9,25,0.824276,0.50,0.774478,0.25,0.756098,0.780488,1


In [208]:
# all seasons
data = game_data_recent.loc[14:,["home_team_id", "visitor_team_id", "home_recent_TSP", "visitor_recent_TSP", "home_recent_timeout", "visitor_recent_timeout",
                                "home_recent_points", "visitor_recent_points", "home_win"]].copy()
data["home_team_id"] = label_encoder.fit_transform(game_data_recent.loc[14:,"home_team_id"])
data["visitor_team_id"] = label_encoder.fit_transform(game_data_recent.loc[14:,"visitor_team_id"])
data[["home_recent_TSP", "home_recent_timeout", "visitor_recent_TSP", "visitor_recent_timeout", "home_recent_points", "visitor_recent_points"]] = scaler.fit_transform(data[["home_recent_TSP", "visitor_recent_TSP", "home_recent_timeout",
                                                                                                                                                                            "visitor_recent_timeout", "home_recent_points", "visitor_recent_points"]])
data

Unnamed: 0,home_team_id,visitor_team_id,home_recent_TSP,visitor_recent_TSP,home_recent_timeout,visitor_recent_timeout,home_recent_points,visitor_recent_points,home_win
14,18,24,0.904037,0.333333,0.687601,0.875,0.795276,0.748031,1
15,11,16,0.000000,0.000000,0.819906,0.625,0.000000,0.763780,1
16,29,27,1.000000,0.555556,0.833042,0.875,0.834646,0.677165,0
17,2,21,0.660024,0.666667,0.923863,0.625,0.677165,0.787402,1
18,10,25,0.880562,0.444444,0.870440,0.625,0.755906,0.842520,0
...,...,...,...,...,...,...,...,...,...
22960,12,23,0.693254,0.555556,0.730773,0.625,0.952756,0.858268,0
22961,22,5,0.690898,0.555556,0.770237,0.750,0.874016,0.897638,1
22962,6,13,0.633768,0.555556,0.781155,0.625,0.818898,0.866142,1
22963,9,25,0.739241,0.555556,0.853562,0.500,0.921260,0.929134,1


In [209]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["home_win"]), data["home_win"], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

print("Train dataset: {}%".format((X_train.shape[0] / data.shape[0]) * 100))
print("Validate dataset: {}%".format((X_val.shape[0] / data.shape[0]) * 100))
print("Test dataset: {}%".format((X_test.shape[0] / data.shape[0]) * 100))

Train dataset: 59.997385734826366%
Validate dataset: 19.999128578275457%
Test dataset: 20.003485686898173%


In [210]:
model.fit(X_train, y_train)
predictions = model.predict(X_val)

fpr, tpr, thresholds = roc_curve(y_val, predictions)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.58      0.23      0.33      1867
           1       0.63      0.89      0.73      2723

    accuracy                           0.62      4590
   macro avg       0.61      0.56      0.53      4590
weighted avg       0.61      0.62      0.57      4590



### Prediction after the game

In [86]:
data = game_data_recent[["home_team_id", "visitor_team_id", "home_TSP", "home_timeout", "visitor_TSP", "visitor_timeout", "home_win"]].copy()
data["home_team_id"] = label_encoder.fit_transform(game_data_recent["home_team_id"])
data["visitor_team_id"] = label_encoder.fit_transform(game_data_recent["visitor_team_id"])
data[["home_TSP", "home_timeout", "visitor_TSP", "visitor_timeout"]] = scaler.fit_transform(data[["home_TSP", "home_timeout", "visitor_TSP", "visitor_timeout"]])
data

Unnamed: 0,home_team_id,visitor_team_id,home_TSP,home_timeout,visitor_TSP,visitor_timeout,home_win
0,15,18,0.294900,0.400000,0.750004,0.117647,0
1,14,2,0.255317,0.466667,0.399886,0.294118,0
2,16,27,0.440890,0.266667,0.475567,0.352941,1
3,0,29,0.271960,0.266667,0.887694,0.235294,0
4,24,28,0.280250,0.400000,0.397672,0.235294,0
...,...,...,...,...,...,...,...
22960,12,23,0.297271,0.333333,0.376546,0.117647,0
22961,22,5,0.386106,0.333333,0.235184,0.294118,1
22962,6,13,0.306490,0.266667,0.260060,0.352941,1
22963,9,25,0.512595,0.333333,0.460439,0.352941,1


In [88]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["home_win"]), data["home_win"], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

print("Train dataset: {}%".format((X_train.shape[0] / data.shape[0]) * 100))
print("Validate dataset: {}%".format((X_val.shape[0] / data.shape[0]) * 100))
print("Test dataset: {}%".format((X_test.shape[0] / data.shape[0]) * 100))

Train dataset: 60.0%
Validate dataset: 20.0%
Test dataset: 20.0%


In [89]:
model.fit(X_train, y_train)
predictions = model.predict(X_val)

fpr, tpr, thresholds = roc_curve(y_val, predictions)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1842
           1       0.89      0.91      0.90      2751

    accuracy                           0.88      4593
   macro avg       0.88      0.87      0.87      4593
weighted avg       0.88      0.88      0.88      4593

