In [40]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from nba_api.stats.endpoints import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
from nba_api.stats.library.http import NBAStatsHTTP
import joblib

# Set the timeout
NBAStatsHTTP.timeout = 120  # Set to 30 seconds; adjust as needed

In [2]:
def player_id(player_name):
    players = leaguedashplayerbiostats.LeagueDashPlayerBioStats()
    players_data = players.get_data_frames()[0]
    player_id = players_data[players_data['PLAYER_NAME'] == player_name].get('PLAYER_ID')
    return player_id
player_id('Luka Dončić')

333    1629029
Name: PLAYER_ID, dtype: int64

In [4]:
def team_id(team_abbrev):
    teams = teamgamelogs.TeamGameLogs()
    teams_data = teams.get_data_frames()[0]
    team_id = teams_data.loc[teams_data['TEAM_ABBREVIATION'] == team_abbrev, 'TEAM_ID'].iloc[0]
    return team_id
# Example usage, should return: 1610612747
team_id('LAL')

1610612747

In [5]:
#function to get player gamelog based on Player Name rather than ID

def player_gamelog(player_name):
    id = player_id(player_name)
    gamelog = playergamelog.PlayerGameLog(player_id= id)
    return gamelog.get_data_frames()[0]


In [6]:
def playerVplayer(player_name, vs_player_name):
    id = player_id(player_name)
    vid = player_id(vs_player_name)
    gamelog = playervsplayer.PlayerVsPlayer(vs_player_id=vid, player_id=id)
    return gamelog.get_data_frames()

# playerVplayer('Nikola Jokic', 'Rudy Gobert')

In [7]:
# Will add a Home_Away column to the gamelog so that the ML model can use that as a variable
def homeOrAway(player_gamelog):
    player_gamelog['Home_Away'] = player_gamelog['MATCHUP'].apply(lambda x: 0 if '@' in x else 1)
    return player_gamelog

In [9]:
def extract_opponent(matchup):
    # The opponent's team abbreviation is after "vs. " or "@ "
    if " vs. " in matchup:
        # Split the string at " vs. " and take the second part
        return matchup.split(" vs. ")[1]
    elif " @ " in matchup:
        # Split the string at " @ " and take the second part
        return matchup.split(" @ ")[1]
    else:
        return "Invalid Matchup Format"
extract_opponent('LAL @ NYK')

'NYK'

In [11]:
def againstThisTeam(player_gamelog, home_team_abbrev, opp_team_abbrev):
    awaySearchString = home_team_abbrev + ' vs. ' + opp_team_abbrev
    homeSearchString = home_team_abbrev + ' @ ' + opp_team_abbrev
    newGameLog = player_gamelog.loc[player_gamelog.MATCHUP.isin([homeSearchString, awaySearchString])]
    return newGameLog

#Test
kingStats = player_gamelog('LeBron James')
print(againstThisTeam(kingStats, 'LAL', 'ATL'))

  SEASON_ID  Player_ID     Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM  \
0     22024       2544  0022400477  JAN 03, 2025  LAL vs. ATL  W   30   13   
8     22024       2544  0022400334  DEC 06, 2024    LAL @ ATL  L   43   14   

   FGA  FG_PCT  ...  DREB  REB  AST  STL  BLK  TOV  PF  PTS  PLUS_MINUS  \
0   20    0.65  ...     3    3    8    0    0    3   3   30           4   
8   25    0.56  ...    10   10   11    2    3    4   3   39           3   

   VIDEO_AVAILABLE  
0                1  
8                1  

[2 rows x 27 columns]


In [19]:
def boxStatsAgainstThisTeam(player_gamelog, home_team_abbrev, opp_team_abbrev):
    new_log = againstThisTeam(player_gamelog, home_team_abbrev, opp_team_abbrev)
    new_log = new_log.loc[:, ['GAME_DATE', 'MATCHUP', 'REB', 'AST', 'STL', 'BLK', 'PTS', 'MIN']]
    print(new_log)
#test
kingStats = player_gamelog('LeBron James')
boxStatsAgainstThisTeam(kingStats, 'LAL', 'ATL')

katStats = player_gamelog("De'Aaron Fox")
boxStatsAgainstThisTeam(katStats, 'SAC', 'MEM')

      GAME_DATE      MATCHUP  REB  AST  STL  BLK  PTS  MIN
0  JAN 03, 2025  LAL vs. ATL    3    8    0    0   30   30
8  DEC 06, 2024    LAL @ ATL   10   11    2    3   39   43
       GAME_DATE      MATCHUP  REB  AST  STL  BLK  PTS  MIN
0   JAN 03, 2025  SAC vs. MEM    3    5    3    0   23   40
12  DEC 05, 2024    SAC @ MEM    6    3    6    0   18   38


In [18]:
def variancePTS(player_gamelog):
    pts = player_gamelog['MIN']
    print(pts.describe())

#test
dearonStats = player_gamelog("De'Aaron Fox")
variancePTS(dearonStats)

count    35.000000
mean     37.342857
std       3.086430
min      31.000000
25%      36.000000
50%      37.000000
75%      39.000000
max      44.000000
Name: MIN, dtype: float64


In [20]:
def getTotalAllowedPTS(team_id):
    team_stats = leaguestandingsv3.LeagueStandingsV3()
    df_teamstats = team_stats.get_data_frames()[0]
    team_row = df_teamstats[df_teamstats['TeamID'] == team_id]
    if not team_row.empty:
        totalPTSA = team_row.iloc[0]['OppTotalPoints']
        return totalPTSA
    else:
        print("No data found for the specified city")
        return None
def getPPG_allowed(team_id):
    totalPTSA = getTotalAllowedPTS(team_id)
    team_gamelog = teamgamelog.TeamGameLog(team_id)
    df_teamgamelog = team_gamelog.get_data_frames()[0]
    games_played = df_teamgamelog.shape[0]
    ppg_allowed = totalPTSA / games_played
    return ppg_allowed
# Example Usage, Output: 113.70588235294117
getPPG_allowed(team_id('LAL'))

113.70588235294117

In [38]:
def importantStats(player_name):
    player_gameLog = player_gamelog(player_name)
    player_gameLog = player_gameLog.iloc[::-1].reset_index(drop=True)
    window_size = 5
    stats_to_average = ['PTS', 'FGM', 'FGA', 'STL', 'BLK', 'TOV', 'FG_PCT', 'MIN']

    for stat in stats_to_average:
        player_gameLog[f'MA_{stat}'] = player_gameLog[stat].rolling(window=window_size, min_periods=1).mean().shift(1)
    player_gameLog['average_pts'] = player_gameLog['PTS'].expanding().mean().shift(1)

    # Extract opponent team abbreviation
    player_gameLog['OPP_TEAM'] = player_gameLog['MATCHUP'].apply(extract_opponent)
    player_gameLog['OPP_TEAMID'] = player_gameLog['OPP_TEAM'].apply(team_id)
    # Fetch the PPG allowed by the opponent team
    player_gameLog['OPP_PPG_ALLOWED'] = player_gameLog['OPP_TEAMID'].apply(getPPG_allowed)

    columns_to_keep = ['MATCHUP', 'GAME_DATE', 'PTS' 'average_pts', 'OPP_PPG_ALLOWED'] + [f'MA_{stat}' for stat in stats_to_average]
    log_cleaned = player_gameLog[columns_to_keep]
    homeOrAway(log_cleaned)
    return log_cleaned
# Example Usage, runtime ~18s
importantStats('Trae Young')

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [42]:
df = pd.read_csv('stats.csv')
df = df.drop(df.index[0]).reset_index(drop=True)
# Create a binary target variable where 'PTS' above average = 1, else = 0
# This requires you to know the average beforehand or compute it
df['target'] = (df['PTS'] > df['average_pts']).astype(int)

categorical_features = ['MATCHUP', 'Home_Away']
one_hot = OneHotEncoder()

# Combine one-hot encoder and model in a pipeline
classifier = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
        remainder='passthrough')),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Select features and target
X = df[['MATCHUP', 'OPP_PPG_ALLOWED', 'MA_PTS', 'MA_FGM', 'MA_FGA', 'MA_STL', 'MA_BLK', 'MA_TOV', 'MA_FG_PCT', 'MA_MIN', 'Home_Away']]  # Features
y = df['target']                # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Print the classification report and accuracy score
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

joblib.dump(classifier, 'nba_pred_v1.pkl')



              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.75      0.75      0.75         4

    accuracy                           0.71         7
   macro avg       0.71      0.71      0.71         7
weighted avg       0.71      0.71      0.71         7

Accuracy: 0.7142857142857143


['nba_pred_v1.pkl']