In [66]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
import sys
sys.path.insert(0, "./../../")
from src.data_loader import add_recent_stats
pd.set_option('display.max_columns', None)

%reload_ext autoreload
%autoreload 2

### Save new feature to pickle file

In [51]:
game_data_pickle = pd.read_pickle("../../data/processed/load_data_games_arr_v2_zan.pkl")

path_lineups = "/home/matej/Documents/Projects/nba-data-mining/data/processed/lineups-all-seasons.csv"
path_game_data = "/home/matej/Documents/Projects/nba-data-mining/data/processed/game-data-extended.csv"
path_players = "/home/matej/Documents/Projects/nba-data-mining/data/raw/player-data/player_info.csv"

lineups = pd.read_csv(path_lineups)
game_data = pd.read_csv(path_game_data)
players = pd.read_csv(path_players)
players = players[players["Season"] != "2019-20"]

# convert string to int -> season_name
game_data["season_name"] = game_data["season_name"].str.split("-").str[0].astype(int)
game_data_pickle["season_name"] = game_data_pickle["season_name"].str.split("-").str[0].astype(int)

# added starting lineup
game_data["home_team_lineup"] = game_data[game_data.columns[63:68]].apply(lambda x: ",".join(x.astype(str)), axis=1)
game_data["visitor_team_lineup"] = game_data[game_data.columns[68:73]].apply(lambda x: ",".join(x.astype(str)), axis=1)
game_data["home_team_common_lineup"] = np.nan
game_data["visitor_team_common_lineup"] = np.nan

df_home_lineups = game_data.groupby(["season_name", "home_team_id"])["home_team_lineup"].agg(pd.Series.mode).astype(str)
df_visitor_lineups = game_data.groupby(["season_name", "visitor_team_id"])["visitor_team_lineup"].agg(pd.Series.mode).astype(str)

# iterate through starting lineups
for i, row in game_data.iterrows():
    game_data.at[i, "home_team_common_lineup"] = np.asarray(df_home_lineups.iloc[(df_home_lineups.index.get_level_values("season_name") == row["season_name"]) & (df_home_lineups.index.get_level_values("home_team_id") == row["home_team_id"])] == row["home_team_lineup"])
    game_data.at[i, "visitor_team_common_lineup"] = np.asarray(df_visitor_lineups.iloc[(df_visitor_lineups.index.get_level_values("season_name") == row["season_name"]) & (df_visitor_lineups.index.get_level_values("visitor_team_id") == row["visitor_team_id"])] == row["visitor_team_lineup"])

# convert feature type to int
game_data["home_team_common_lineup"] = game_data["home_team_common_lineup"].astype(int)
game_data["visitor_team_common_lineup"] = game_data["visitor_team_common_lineup"].astype(int)

# merge files
game_data.rename(columns = {'game_id':'GAME_ID'}, inplace = True)
to_merge = game_data[["GAME_ID", "home_team_common_lineup", "visitor_team_common_lineup"]]
to_merge.set_index("GAME_ID", inplace=True)
merged = pd.concat([game_data_pickle, to_merge], axis=1)

# save pickle file
merged.to_pickle('../../data/processed/load_data_games_arr_v2_zan.pkl')

### Data preprocessing

In [67]:
game_data = pd.read_pickle("../../data/processed/load_data_games_arr_v2_zan.pkl")
game_data["season_name"] = game_data["season_name"].str.split("-").str[0].astype(int)
game_data["home_tip_off"] = (game_data["tip_off_winner"] == "HOME_PLAYER").astype(int)
game_data.drop(columns=["tip_off_winner"], inplace=True)


le = LabelEncoder()
game_data["home_label"] = le.fit_transform(game_data["home_team_id"])
game_data["visitor_label"] = le.fit_transform(game_data["visitor_team_id"])
game_data

Unnamed: 0_level_0,play_count,home_team_id,visitor_team_id,home_record_wins,home_record_losses,season_name,visitor_team_city,visitor_team_nickname,home_final_score,visitor_final_score,home_win,home_team_city,home_team_nickname,periods,minutes_played,visitor_players_deployed,home_players_deployed,visitor_fg_made,visitor_fg_missed,visitor_3PT_made,visitor_3PT_missed,home_fg_made,home_fg_missed,home_3PT_made,home_3PT_missed,visitor_ft_made,visitor_ft_missed,home_ft_made,home_ft_missed,visitor_rebound,home_rebound,visitor_team_rebound,home_team_rebound,visitor_turnover,home_turnover,visitor_team_turnover,home_team_turnover,visitor_foul,home_foul,visitor_subs,home_subs,visitor_timeout,home_timeout,visitor_jump_balls_won,home_jump_balls_won,visitor_ejection,home_ejection,visitor_team_ejection,home_team_ejection,home_scoring_leader,home_scoring_leader_points,visitor_scoring_leader,visitor_scoring_leader_points,home_made_max_shot_distance,visitor_made_max_shot_distance,home_made_min_shot_distance,visitor_made_min_shot_distance,home_made_mean_shot_distance,visitor_made_mean_shot_distance,visitor_record_wins,visitor_record_losses,home_common_lineup,visitor_common_lineup,home_tip_off,home_label,visitor_label
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
20000001,429,1610612752,1610612755,0,1,2000,Philadelphia,76ers,72,101,0,New York,Knicks,4,48,12,12,38,28,3,5,25,45,3,8,22,8,19,5,37,37,2,3,13,22,1,0,25,30,20,17,3,7,1,2,0,0,0,0,275,21,947,25,26,26,0,0,10.720000,9.263158,1,0,1,0,1,15,18
20000002,510,1610612751,1610612739,0,1,2000,Cleveland,Cavaliers,82,86,0,New Jersey,Nets,4,48,11,10,32,46,2,5,31,54,3,7,20,12,17,9,52,47,6,6,19,12,0,3,28,31,24,21,6,8,1,1,0,0,0,0,1425,20,441,17,27,27,0,0,8.129032,9.500000,1,0,0,1,1,14,2
20000003,478,1610612753,1610612764,1,0,2000,Washington,Wizards,97,86,1,Orlando,Magic,4,48,12,10,33,39,4,3,34,45,6,10,16,8,23,10,44,37,5,7,26,15,1,0,28,24,28,18,7,5,0,1,0,0,0,0,1503,32,1732,16,25,25,0,0,10.794118,7.696970,0,1,0,0,1,16,27
20000004,448,1610612737,1610612766,0,1,2000,Charlotte,Hornets,82,106,0,Atlanta,Hawks,4,48,11,11,35,27,5,4,30,51,6,9,31,9,16,5,44,29,3,7,17,13,0,0,22,32,17,27,5,5,1,1,0,0,0,0,673,23,469,23,26,31,0,0,9.133333,8.657143,1,0,0,1,1,0,29
20000005,505,1610612761,1610612765,0,1,2000,Detroit,Pistons,95,104,0,Toronto,Raptors,4,48,12,11,44,49,3,9,35,58,5,12,13,5,20,11,45,48,9,13,12,15,0,0,27,21,30,18,5,7,1,0,0,0,0,0,1713,26,711,44,26,26,0,0,12.400000,9.454545,1,0,1,0,0,24,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21801226,474,1610612749,1610612760,60,22,2018,Oklahoma City,Thunder,116,127,0,Milwaukee,Bucks,4,48,11,8,48,51,23,31,43,57,15,31,8,5,15,6,53,53,1,7,12,14,1,1,20,14,23,17,3,6,1,0,0,0,0,0,204025,29,203471,32,28,28,1,1,12.418605,14.000000,49,33,0,0,0,12,23
21801227,443,1610612759,1610612742,48,34,2018,Dallas,Mavericks,105,94,1,San Antonio,Spurs,4,48,10,13,37,54,11,26,41,47,8,16,9,3,15,3,42,53,4,5,7,10,0,1,19,14,23,28,6,6,0,2,0,0,0,0,200746,34,1717,20,27,28,1,1,12.341463,14.513514,33,49,0,0,1,22,5
21801228,434,1610612743,1610612750,54,28,2018,Minnesota,Timberwolves,99,95,1,Denver,Nuggets,4,48,10,9,39,52,13,19,39,48,10,23,4,2,11,6,41,53,6,5,10,12,0,1,22,13,21,19,7,5,2,1,0,0,0,0,203999,29,203952,25,26,27,1,1,11.384615,11.948718,36,46,1,0,1,6,13
21801229,555,1610612746,1610612762,48,34,2018,Utah,Jazz,143,137,1,LA,Clippers,5,50,10,13,47,59,14,21,54,52,12,16,29,4,23,7,57,52,6,3,17,12,0,0,24,27,26,24,7,6,1,1,0,0,0,0,1626149,24,1628960,40,28,28,0,1,9.555556,12.085106,50,32,0,0,0,9,25


In [68]:
useful_data = game_data.drop(columns=["play_count", "home_team_id", "visitor_team_id", "visitor_team_city", "home_team_city", "visitor_team_nickname", "home_team_nickname", "periods",
                                    "minutes_played", "visitor_players_deployed", "home_players_deployed", "visitor_subs", "home_subs",  
                                    "home_made_max_shot_distance", "visitor_made_max_shot_distance", "home_made_min_shot_distance", "visitor_made_min_shot_distance"])

useful_data["home_record"] = game_data[game_data["season_name"] > 2015]["home_record_wins"] - game_data[game_data["season_name"] < 2015]["home_record_losses"]
useful_data["visitor_record"] = game_data[game_data["season_name"] > 2015]["visitor_record_wins"] - game_data[game_data["season_name"] < 2015]["visitor_record_losses"]

useful_data = useful_data[useful_data["season_name"] >= 2015]

In [69]:
# LESS FEATURES
useful_data = game_data.drop(columns=["play_count", "visitor_team_city", "home_team_city", "visitor_team_nickname", "home_team_nickname", "periods", "minutes_played", 
                                    "visitor_players_deployed", "home_players_deployed", "visitor_subs", "home_subs", "visitor_timeout", "home_timeout",
                                    "visitor_jump_balls_won", "home_jump_balls_won", "visitor_ejection", "home_ejection", "visitor_team_ejection", "home_team_ejection",
                                    "home_made_max_shot_distance", "visitor_made_max_shot_distance", "home_made_min_shot_distance", "visitor_made_min_shot_distance",
                                    "visitor_team_turnover", "home_team_turnover", "home_team_id", "visitor_team_id",
                                    "visitor_3PT_made", "visitor_3PT_missed", "home_3PT_made", "home_3PT_missed"]) # TO VRSTICO SI TESTIRAL

useful_data["home_record"] = useful_data["home_record_wins"] - useful_data["home_record_losses"]
useful_data["visitor_record"] = useful_data["visitor_record_wins"] - useful_data["visitor_record_losses"]

useful_data.drop(columns=["visitor_record_wins", "visitor_record_losses", "home_record_wins", "home_record_losses"], inplace=True)

useful_data = useful_data[useful_data["season_name"] >= 2016]



In [70]:
useful_data

Unnamed: 0_level_0,season_name,home_final_score,visitor_final_score,home_win,visitor_fg_made,visitor_fg_missed,home_fg_made,home_fg_missed,visitor_ft_made,visitor_ft_missed,home_ft_made,home_ft_missed,visitor_rebound,home_rebound,visitor_team_rebound,home_team_rebound,visitor_turnover,home_turnover,visitor_foul,home_foul,home_scoring_leader,home_scoring_leader_points,visitor_scoring_leader,visitor_scoring_leader_points,home_made_mean_shot_distance,visitor_made_mean_shot_distance,home_common_lineup,visitor_common_lineup,home_tip_off,home_label,visitor_label,home_record,visitor_record
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
21600001,2016,117,88,1,32,55,45,49,15,5,14,5,42,51,7,10,18,14,23,22,202681,29,2546,19,10.022222,12.593750,1,1,0,2,15,1,-1
21600002,2016,113,104,1,40,42,39,36,16,0,22,0,31,34,8,5,11,12,19,20,203081,39,2207,29,14.897436,10.725000,0,0,0,20,25,1,-1
21600003,2016,100,129,0,47,51,40,45,23,3,13,5,55,35,8,3,13,16,19,20,201142,27,202695,35,10.000000,11.765957,1,0,0,7,22,-1,1
21600004,2016,96,108,0,47,50,34,55,10,6,22,6,52,45,8,4,10,11,25,15,203095,20,202355,18,12.441176,6.042553,0,0,0,16,11,-1,1
21600005,2016,130,121,1,45,59,47,46,13,5,26,8,49,52,6,5,15,16,27,23,1626167,30,101114,25,10.255319,14.088889,1,1,1,17,5,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21801226,2018,116,127,0,48,51,43,57,8,5,15,6,53,53,1,7,12,14,20,14,204025,29,203471,32,12.418605,14.000000,0,0,0,12,23,38,16
21801227,2018,105,94,1,37,54,41,47,9,3,15,3,42,53,4,5,7,10,19,14,200746,34,1717,20,12.341463,14.513514,0,0,1,22,5,14,-16
21801228,2018,99,95,1,39,52,39,48,4,2,11,6,41,53,6,5,10,12,22,13,203999,29,203952,25,11.384615,11.948718,1,0,1,6,13,26,-10
21801229,2018,143,137,1,47,59,54,52,29,4,23,7,57,52,6,3,17,12,24,27,1626149,24,1628960,40,9.555556,12.085106,0,0,0,9,25,14,18


## Data preprocessing - Sliding window

In [35]:
game_data_recent = add_recent_stats(game_data)

helloE


In [69]:
useful_data_recent = game_data_recent[["home_team_id", "visitor_team_id", "season_name", "visitor_fg_made", "home_fg_made","visitor_recent_fg_made", "home_recent_fg_made",
                                        "visitor_timeout", "home_timeout", "home_win", "home_common_lineup", "visitor_common_lineup"]].copy()

useful_data_recent["home_recent_record"] = game_data_recent["home_record_wins"] - game_data_recent["home_record_losses"]
useful_data_recent["visitor_recent_record"] = game_data_recent["visitor_record_wins"] - game_data_recent["visitor_record_losses"]

In [81]:
useful_data_recent = useful_data_recent[(useful_data_recent["season_name"] >= 2016)]


In [82]:
useful_data_recent

Unnamed: 0,home_team_id,visitor_team_id,season_name,visitor_fg_made,home_fg_made,visitor_recent_fg_made,home_recent_fg_made,visitor_timeout,home_timeout,home_win,home_common_lineup,visitor_common_lineup,home_recent_record,visitor_recent_record
19275,1610612739,1610612752,2016,32,45,36,39,6,5,1,1,1,1,-1
19276,1610612757,1610612762,2016,40,39,35,40,7,5,1,0,0,1,-1
19277,1610612744,1610612759,2016,47,40,36,42,6,4,0,1,0,-1,1
19278,1610612753,1610612748,2016,47,34,41,41,5,6,0,0,0,-1,1
19279,1610612754,1610612742,2016,45,47,32,42,11,7,1,1,1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22960,1610612749,1610612760,2018,48,43,44,45,3,6,0,0,0,38,16
22961,1610612759,1610612742,2018,37,41,43,42,6,6,1,0,0,14,-16
22962,1610612743,1610612750,2018,39,39,43,41,7,5,1,1,0,26,-10
22963,1610612746,1610612762,2018,47,54,41,41,7,6,1,0,0,14,18


In [83]:
X_train = useful_data_recent[useful_data_recent["season_name"] < 2018].drop(columns=["home_win", "season_name"])
y_train = useful_data_recent[useful_data_recent["season_name"] < 2018]["home_win"]
X_val, X_test, y_val, y_test = train_test_split(useful_data_recent[useful_data_recent["season_name"] == 2018].drop(columns=["home_win", "season_name"]), useful_data_recent[useful_data_recent["season_name"] == 2018]["home_win"], test_size=0.5, shuffle=False)

In [84]:
model_recent = LogisticRegression()
model_recent.fit(X_train, y_train)

predictions = model_recent.predict(X_val)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       242
           1       0.61      1.00      0.76       373

    accuracy                           0.61       615
   macro avg       0.30      0.50      0.38       615
weighted avg       0.37      0.61      0.46       615



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Golden State predictions

- Team ID = 1610612744

In [13]:
team_id = 1610612744

In [14]:
golden_state_games = useful_data[(useful_data["home_team_id"] == team_id) | (useful_data["visitor_team_id"] == team_id)].copy()

In [27]:
golden_state_games["warriors_win"] = (((golden_state_games["home_team_id"] == team_id) & (golden_state_games["home_win"] == 1))
                                    | ((golden_state_games["visitor_team_id"] == team_id) & (golden_state_games["home_win"] == 0)))

golden_state_games.drop(columns=["home_win"], inplace=True)

In [28]:
# GOLDEN STATE GAMES
X_train = golden_state_games.drop(columns=["warriors_win"])[golden_state_games["season_name"] < 2018]
y_train = golden_state_games[golden_state_games["season_name"] < 2018]["warriors_win"]

X_val, X_test, y_val, y_test = train_test_split(golden_state_games.drop(columns=["warriors_win"])[golden_state_games["season_name"] == 2018], golden_state_games[golden_state_games["season_name"] == 2018]["warriors_win"], test_size=0.5, shuffle=False)

In [29]:
model_gs = LogisticRegression()
model_gs.fit(X_train, y_train)

predictions = model_gs.predict(X_val)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        14
        True       0.66      1.00      0.79        27

    accuracy                           0.66        41
   macro avg       0.33      0.50      0.40        41
weighted avg       0.43      0.66      0.52        41




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Other predictions

In [71]:
# ALL GAMES...
X_train = useful_data.drop(columns=["home_win"])[useful_data["season_name"] < 2018]
y_train = useful_data[useful_data["season_name"] < 2018]["home_win"]

X_val, X_test, y_val, y_test = train_test_split(useful_data.drop(columns=["home_win"])[useful_data["season_name"] == 2018], useful_data[useful_data["season_name"] == 2018]["home_win"], test_size=0.5, shuffle=False)

In [72]:
model_1 = LogisticRegression()
model_1.fit(X_train, y_train)

predictions = model_1.predict(X_val)
fpr, tpr, thresholds = roc_curve(y_val, predictions)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.50      0.23      0.31       242
           1       0.63      0.85      0.72       373

    accuracy                           0.60       615
   macro avg       0.56      0.54      0.52       615
weighted avg       0.58      0.60      0.56       615



In [73]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

### Correlations

In [52]:
#Selecting highly (positive) correlated features
cor = game_data_recent.corr()
cor_target = abs(cor["home_win"])
relevant_features = cor_target[cor_target>0.2]


In [53]:
relevant_features

home_final_score                 0.416214
visitor_final_score              0.413958
home_win                         1.000000
visitor_fg_made                  0.338052
visitor_fg_missed                0.328124
home_fg_made                     0.347027
home_fg_missed                   0.326205
visitor_rebound                  0.233071
home_rebound                     0.238655
visitor_timeout                  0.285202
home_timeout                     0.422791
visitor_scoring_leader_points    0.236186
Name: home_win, dtype: float64