In [49]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguedashplayerbiostats
from nba_api.stats.endpoints import playerdashboardbygeneralsplits
from nba_api.stats.endpoints import playervsplayer
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.static import teams
from nba_api.stats.endpoints import teamgamelog, boxscoresummaryv2, boxscoretraditionalv2
import time

In [5]:
def player_id(player_name):
    players = leaguedashplayerbiostats.LeagueDashPlayerBioStats()
    players_data = players.get_data_frames()[0]
    player_id = players_data[players_data['PLAYER_NAME'] == player_name].get('PLAYER_ID')
    return player_id

In [7]:
#function to get player gamelog based on Player Name rather than ID

def player_gamelog(player_name):
    id = player_id(player_name)
    gamelog = playergamelog.PlayerGameLog(player_id= id)
    return gamelog.get_data_frames()[0]

In [13]:
def playerVplayer(player_name, vs_player_name):
    id = player_id(player_name)
    vid = player_id(vs_player_name)
    gamelog = playervsplayer.PlayerVsPlayer(vs_player_id=vid, player_id=id)
    return gamelog.get_data_frames()

# playerVplayer('Nikola Jokic', 'Rudy Gobert')

In [22]:
# Will add a Home_Away column to the gamelog so that the ML model can use that as a variable
def homeOrAway(player_gamelog):
    player_gamelog['Home_Away'] = player_gamelog['MATCHUP'].apply(lambda x: 0 if '@' in x else 1)
    return player_gamelog

In [36]:
def get_nba_teams():
    nba_teams = teams.get_teams()  # Fetch data
    # Filter to get only active NBA teams and return as DataFrame
    return pd.DataFrame(nba_teams)

nba_teams = get_nba_teams()
print(nba_teams[['id', 'abbreviation', 'full_name']])

            id abbreviation               full_name
0   1610612737          ATL           Atlanta Hawks
1   1610612738          BOS          Boston Celtics
2   1610612739          CLE     Cleveland Cavaliers
3   1610612740          NOP    New Orleans Pelicans
4   1610612741          CHI           Chicago Bulls
5   1610612742          DAL        Dallas Mavericks
6   1610612743          DEN          Denver Nuggets
7   1610612744          GSW   Golden State Warriors
8   1610612745          HOU         Houston Rockets
9   1610612746          LAC    Los Angeles Clippers
10  1610612747          LAL      Los Angeles Lakers
11  1610612748          MIA              Miami Heat
12  1610612749          MIL         Milwaukee Bucks
13  1610612750          MIN  Minnesota Timberwolves
14  1610612751          BKN           Brooklyn Nets
15  1610612752          NYK         New York Knicks
16  1610612753          ORL           Orlando Magic
17  1610612754          IND          Indiana Pacers
18  16106127

In [55]:
def fetch_team_defensive_stats(team_abbreviation, season='2022-23'):
    # Fetch NBA teams to find the corresponding team ID
    nba_teams = teams.get_teams()
    team = next((team for team in nba_teams if team['abbreviation'] == team_abbreviation), None)
    if not team:
        return f"No team found with abbreviation {team_abbreviation}"

    # Fetch league dashboard team stats specifically filtering by the team's ID
    try:
        stats = leaguedashteamstats.LeagueDashTeamStats(team_id_nullable=team['id'],
                                                        season=season,
                                                        per_mode_detailed='PerGame')
        df_stats = stats.get_data_frames()[0]
        
        # Debug: Print available columns to check for the expected data
        print("Available columns in the DataFrame:", df_stats.columns.tolist())

        # If 'OPP_PTS' is not the correct column, we may need to use another indicator
        if 'OPP_PTS' in df_stats.columns:
            pts_against_per_game = df_stats.loc[df_stats['TEAM_ID'] == team['id'], 'OPP_PTS'].iloc[0]
        else:
            # Alternatively, calculate it manually if the direct column isn't available
            if 'PTS_AGAINST' in df_stats.columns:
                total_pts_against = df_stats.loc[df_stats['TEAM_ID'] == team['id'], 'PTS_AGAINST'].iloc[0]
                games_played = df_stats.loc[df_stats['TEAM_ID'] == team['id'], 'GP'].iloc[0]
                pts_against_per_game = total_pts_against / games_played
            else:
                pts_against_per_game = "Data not available"

        return pd.DataFrame({
            'Team': [team_abbreviation],
            'Season': [season],
            'PTS_AGAINST_PER_GAME': [pts_against_per_game]
        })

    except Exception as e:
        return f"Error retrieving data: {e}"

# Example usage
lakers_def_stats = fetch_team_defensive_stats('LAL', '2022-23')
print(lakers_def_stats)


Available columns in the DataFrame: ['TEAM_ID', 'TEAM_NAME', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK']
  Team   Season PTS_AGAINST_PER_GAME
0  LAL  2022-23   Data not available


In [40]:
def calculate_team_ppg_allowed_by_date(team_abbreviation, season, specific_date):
    # Convert specific_date to pandas datetime for comparison
    specific_date = pd.to_datetime(specific_date)
    
    # Fetch NBA teams and find the team ID based on the abbreviation
    nba_teams = teams.get_teams()
    team_data = next(team for team in nba_teams if team['abbreviation'] == team_abbreviation)
    team_id = team_data['id']

    # Fetch game logs for the team for the specified season
    gamelog = teamgamelog.TeamGameLog(team_id=team_id, season=season)
    df_gamelog = gamelog.get_data_frames()[0]

    # Convert 'GAME_DATE' to datetime for filtering
    df_gamelog['GAME_DATE'] = pd.to_datetime(df_gamelog['GAME_DATE'])

    # Filter games up to the specific date
    df_filtered = df_gamelog[df_gamelog['GAME_DATE'] <= specific_date]

    # Calculate cumulative points allowed if there are any games before the specified date
    if not df_filtered.empty:
        cumulative_pts_allowed = df_filtered['PTS'].cumsum().iloc[-1] / len(df_filtered)
    else:
        cumulative_pts_allowed = None  # No games before the specific date

    # Return a formatted result
    result = {
        'TEAM_ABB': team_abbreviation,
        'CUM_PTS_AGAINST': cumulative_pts_allowed,
        'UP_TO_DATE': specific_date.strftime('%Y-%m-%d')
    }

    return result

# Example usage: Get cumulative PPG allowed by Los Angeles Lakers up to December 22, 2024
lakers_ppg_by_date = calculate_team_ppg_allowed_by_date('LAL', '2024-25', '2024-12-22')
print(lakers_ppg_by_date)


{'TEAM_ABB': 'LAL', 'CUM_PTS_AGAINST': 111.07142857142857, 'UP_TO_DATE': '2024-12-22'}


In [24]:
# Testing accuracy for different ML models on point predictions
kingStats = player_gamelog('LeBron James')
print(kingStats)
avgPTS = kingStats['PTS'].mean()
#Add Home_Away column
homeOrAway(kingStats)
# target
y = kingStats.PTS
# Features should be considered carefully, I just threw in ones I think make sense
# Obviously some of these like REB, AST, STL, etc probably don't make sense since we can't know these things before the game, although we might know what his average stats are for those categories before the game
kingStats_features = ['PTS', 'Home_Away', ]
X = kingStats[kingStats_features]
# Splitting data to test for accuracy
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# DecisionTreeRegressor test
kingStats_model = DecisionTreeRegressor()
kingStats_model.fit(train_X, train_y)
val_predictions = kingStats_model.predict(val_X)
# ~2.833 MAE, <- haven't really put anything in so would not trust that
print(mean_absolute_error(val_y, val_predictions))


   SEASON_ID  Player_ID     Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM  \
0      22024       2544  0022400372  DEC 19, 2024    LAL @ SAC  W   34    8   
1      22024       2544  0022401220  DEC 15, 2024  LAL vs. MEM  W   34    7   
2      22024       2544  0022400334  DEC 06, 2024    LAL @ ATL  L   43   14   
3      22024       2544  0022400321  DEC 04, 2024    LAL @ MIA  L   29   12   
4      22024       2544  0022400318  DEC 02, 2024    LAL @ MIN  L   31    4   
5      22024       2544  0022400311  DEC 01, 2024    LAL @ UTA  W   36   12   
6      22024       2544  0022400048  NOV 29, 2024  LAL vs. OKC  L   34    5   
7      22024       2544  0022400296  NOV 27, 2024    LAL @ SAS  W   33    8   
8      22024       2544  0022400039  NOV 26, 2024    LAL @ PHX  L   35    7   
9      22024       2544  0022400270  NOV 23, 2024  LAL vs. DEN  L   35    7   
10     22024       2544  0022400263  NOV 21, 2024  LAL vs. ORL  L   37   12   
11     22024       2544  0022400026  NOV 19, 2024  L