
## Introduction:

In this analysis, we identify the young players with the highest potential. We use current data to define what counts as a young player, to measure each player's efficiency relative to the rest of the league, and to apply a few additional criteria. Our goal is to single out the players with the highest potential and analyze their performance across several aspects of the game.

Let's get started!


Source:   
This notebook was heavily inspired by: https://www.kaggle.com/code/vivovinco/nba-rising-stars-2022-2023

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

: 

In [None]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import teamestimatedmetrics

: 

#

## Fetch stats of players per match

In [None]:
def fetch_nba_player_stats(season, existing_player_ids=None):
    """Download the per-game log of every active NBA player for one season.

    Args:
        season: Season identifier forwarded to ``PlayerGameLog``,
            e.g. ``'2023-24'``.
        existing_player_ids: Optional iterable of player IDs to skip (for
            instance players whose logs are already stored). ``None`` means
            that no player is skipped.

    Returns:
        One DataFrame concatenating the game logs of all fetched players, or
        an empty DataFrame when no log could be fetched.
    """
    # Build the skip-set once: constant-time membership tests, and no mutable
    # default argument shared between calls.
    skipped_ids = set() if existing_player_ids is None else set(existing_player_ids)

    all_players_stats_list = []

    for player in players.get_active_players():
        player_id = player['id']
        # Only active players that are not already in the existing data.
        if not player['is_active'] or player_id in skipped_ids:
            continue
        try:
            gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=season)
            all_players_stats_list.append(gamelog.get_data_frames()[0])
        except Exception as e:
            # Best effort: one failing request must not abort the whole download.
            print(f"Erreur lors de la récupération des données pour le joueur {player['full_name']} (ID: {player_id}): {e}")

    # pd.concat raises ValueError on an empty list (every player skipped or failed).
    if not all_players_stats_list:
        return pd.DataFrame()

    return pd.concat(all_players_stats_list, ignore_index=True)

: 

In [None]:
today = datetime.now().strftime("%Y-%m-%d")

file_path_stats_per_match = f'../dataset/nba_player_stats_{today}.csv'

# Check whether today's dataset already exists
if os.path.exists(file_path_stats_per_match):
    existing_data = pd.read_csv(file_path_stats_per_match)
    # Player identifiers are stored in 'Player_ID'; 'Game_ID' identifies
    # games, so it cannot be used to know which players were already fetched.
    existing_player_ids = existing_data['Player_ID'].unique()
else:
    existing_data = pd.DataFrame()
    existing_player_ids = []

: 

In [None]:
# Season whose per-game player logs are downloaded
season = '2023-24'
# NOTE(review): `existing_player_ids` computed in the previous cell is not
# passed here, so every active player is fetched again — confirm this is intended.
player_stats = fetch_nba_player_stats(season)

: 

In [None]:
# Inspect the columns of the downloaded game logs
player_stats.columns

: 

In [None]:
# Parse the textual game date (abbreviated month, day, four-digit year) into datetimes
player_stats['GAME_DATE'] = pd.to_datetime(player_stats['GAME_DATE'], format='%b %d, %Y')

# Print the resulting dtype to confirm the conversion
col_type = player_stats["GAME_DATE"].dtypes
print(col_type)

: 

In [None]:
# Remove the 'VIDEO_AVAILABLE' column in place
player_stats.drop('VIDEO_AVAILABLE', axis=1, inplace=True)

: 

### Save to disk

In [None]:
player_stats.to_csv(file_path_stats_per_match, index=False) # the file is overwritten if it already exists

: 

#

## Get IDs of active players

In [None]:
def fetch_active_nba_players():
    """Return a dict mapping each active player's ID to the player's full name.

    The player list comes from ``players.get_active_players()``.
    """
    return {
        entry['id']: entry['full_name']
        for entry in players.get_active_players()
    }

# Build the ID -> full name lookup of active players
nba_players_dict = fetch_active_nba_players()
# Print every active player's full name, one per line
for player_name in nba_players_dict.values():
    print(f"{player_name}")

: 

#

## Get global stats of active players in 2023-24

In [None]:
def fetch_stats_players_for_2023_24(player_dict, season='2023-24'):
    """Collect the career-stats rows of one season for the given players.

    Args:
        player_dict: Mapping of player ID -> player full name.
        season: Value matched against the ``SEASON_ID`` column
            (defaults to ``'2023-24'``).

    Returns:
        A DataFrame with the matching season rows of every player, with the
        player's name in a ``full_name`` column placed in second position.
        An empty DataFrame is returned when ``player_dict`` is empty.
    """
    per_player_frames = []

    for player_id, player_name in player_dict.items():
        career = playercareerstats.PlayerCareerStats(player_id=player_id)
        career_df = career.get_data_frames()[0]
        # Keep only the rows of the requested season
        season_stats = career_df[career_df['SEASON_ID'] == season]
        # assign() returns a new frame, so the player's name is added without
        # writing into a filtered slice (which triggers SettingWithCopyWarning).
        per_player_frames.append(season_stats.assign(full_name=player_name))

    # pd.concat raises ValueError on an empty list, and 'full_name' would be
    # missing anyway: return early when there was no player to fetch.
    if not per_player_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per player
    all_players_stats = pd.concat(per_player_frames, ignore_index=True)

    # Make sure 'full_name' is the second column
    cols = list(all_players_stats.columns)
    cols.insert(1, cols.pop(cols.index('full_name')))

    return all_players_stats.loc[:, cols]

: 

In [None]:
# Fetch the season rows of every active player (one API request per player)
players_stats_2023_24 = fetch_stats_players_for_2023_24(nba_players_dict)

: 

In [None]:
# Preview the first rows of the season stats
players_stats_2023_24.head()

: 

In [None]:
# Number of rows and columns of the season stats
players_stats_2023_24.shape

: 

In [None]:
# Drop both identifier columns in a single in-place call
players_stats_2023_24.drop(columns=["SEASON_ID", "LEAGUE_ID"], inplace=True)

: 

In [None]:
# Check the remaining columns after the drop
players_stats_2023_24.columns

: 

### Keep only the season total (TOT) row for traded players

In [None]:
def filter_players_to_tot(df):
    """Keep only the 'TOT' row of players that have several rows including 'TOT'.

    Players whose ``PLAYER_ID`` appears more than once and who have a row with
    ``TEAM_ABBREVIATION == 'TOT'`` are reduced to that 'TOT' row. All rows of
    the other players are kept unchanged.

    Args:
        df: DataFrame with ``PLAYER_ID`` and ``TEAM_ABBREVIATION`` columns.

    Returns:
        A new DataFrame with a fresh index: the retained 'TOT' rows first,
        followed by the rows of all remaining players.
    """
    # Rows whose player appears more than once (several teams in the season)
    appears_several_times = df.duplicated(subset=['PLAYER_ID'], keep=False)
    is_tot_row = df['TEAM_ABBREVIATION'] == 'TOT'

    # IDs of multi-row players that have a 'TOT' row
    traded_ids = df.loc[appears_several_times & is_tot_row, 'PLAYER_ID'].unique()
    is_traded = df['PLAYER_ID'].isin(traded_ids)

    # 'TOT' rows of those players, then every row of the other players
    return pd.concat(
        [df[is_traded & is_tot_row], df[~is_traded]],
        ignore_index=True,
    )

: 

In [None]:
# Collapse players that have several rows including 'TOT' to their 'TOT' row
players_stats_2023_24 = filter_players_to_tot(players_stats_2023_24)

: 

In [None]:
# Preview the filtered season stats
players_stats_2023_24.head() 

: 

In [None]:
# Columns of the filtered season stats
players_stats_2023_24.columns

: 

In [None]:
today = datetime.now().strftime("%Y-%m-%d")

file_path_global_stats = f'../dataset/nba_player_global_stats_{today}.csv'

# Check whether today's dataset already exists
if os.path.exists(file_path_global_stats):
    existing_data = pd.read_csv(file_path_global_stats)
    # The season stats are keyed by 'PLAYER_ID'. They have no 'Game_ID'
    # column, so reading it raised KeyError whenever the file already existed.
    existing_player_ids = existing_data['PLAYER_ID'].unique()
else:
    existing_data = pd.DataFrame()
    existing_player_ids = []


players_stats_2023_24.to_csv(file_path_global_stats, index=False) # the file is overwritten if it already exists

: 

#

## Get global stats from teams

In [None]:
today = datetime.now().strftime("%Y-%m-%d")

file_path_teams_stats = f'../dataset/nba_teams_global_stats_{today}.csv'

# Check whether today's dataset already exists
if os.path.exists(file_path_teams_stats):
    existing_data = pd.read_csv(file_path_teams_stats)
    # Team metrics are keyed by 'TEAM_ID'. They have no 'Game_ID' column, so
    # reading it raised KeyError whenever the file already existed.
    existing_team_ids = existing_data['TEAM_ID'].unique()
else:
    existing_data = pd.DataFrame()
    existing_team_ids = []

# Name kept so that code relying on the variable set by this cell still works.
existing_player_ids = existing_team_ids


# Request the estimated metrics of the 2023-24 season (no team filter is applied)
stat_team_finder = teamestimatedmetrics.TeamEstimatedMetrics(season = "2023-24")
# The first DataFrame of those returned is what we want.
stat_team = stat_team_finder.get_data_frames()[0]
stat_team.to_csv(file_path_teams_stats, index=False)

: 

## Get team stats for each game

In [None]:
from nba_api.stats.endpoints import teamgamelogs

# Request the team game logs of the 2023-24 season (no team filter is applied)
stat_team_game = teamgamelogs.TeamGameLogs(season_nullable= "2023-24")
# The first DataFrame of those returned is what we want.
stat_team_game = stat_team_game.get_data_frames()[0]
stat_team_game  

: 

In [None]:
# Drop both columns in a single in-place call
stat_team_game.drop(columns=["SEASON_YEAR", "MIN"], inplace=True)

: 

In [None]:
# Check the remaining columns of the team game logs
stat_team_game.columns

: 

In [None]:
# Reuses the `today` date string computed in an earlier cell
file_path_game_stats = f'../dataset/nba_games_stats_{today}.csv'

stat_team_game.to_csv(file_path_game_stats, index=False) # the file is overwritten if it already exists

: 

In [None]:
# Convert the game date to datetimes (format inferred by pandas)
stat_team_game['GAME_DATE'] = pd.to_datetime(stat_team_game['GAME_DATE'])

# Print the resulting dtype to confirm the conversion
col_type = stat_team_game["GAME_DATE"].dtypes
print(col_type)

: 

In [None]:
# Review the dtype of every column of the team game logs
stat_team_game.dtypes

: 

## Create target column Win/Lose

In [None]:
# Binary target: 1 when the 'WL' column equals 'W', 0 for any other value
stat_team_game['Win/Lose'] = np.where(stat_team_game['WL'].eq('W'), 1, 0)

# Remove the raw outcome and the identifier columns in one in-place call
stat_team_game.drop(
    columns=["WL", "TEAM_ID", "TEAM_NAME", "GAME_ID"],
    inplace=True,
)

stat_team_game.head()

: 

: 

In [None]:
# Review the dtype of every column of the player game logs
player_stats.dtypes

: 

In [None]:
# Preview the first ten rows of the player game logs
player_stats.head(10)

: 

In [None]:
# Exemple hypothétique basé sur la structure de l'API
# Hypothetical example based on the structure of the API.
# NOTE(review): no notebook-level `career` variable is defined in the visible
# cells (it only exists inside fetch_stats_players_for_2023_24) — confirm
# where it is expected to come from.
regular_season_totals = career.season_totals_regular_season.get_data_frame()
season_2023_24_stats = regular_season_totals[regular_season_totals['SEASON_ID'] == '2023-24']

: 

In [None]:
# Check the type of the filtered result
type(season_2023_24_stats)

: 

In [None]:
# Preview the first ten rows of the filtered result
season_2023_24_stats.head(10)

: 

In [None]:
# Sanity check: the dataset must not contain any missing value.
# The message is displayed when the check FAILS, i.e. when nulls are present.
# NOTE(review): in the visible cells `players` is the nba_api `players` module,
# not a DataFrame — confirm which DataFrame this cell is meant to validate.
assert sum(players.isnull().sum()) == 0, "There are null values in the dataset"


: 

In [None]:
# Efficiency rating: points + rebounds + assists + steals + blocks,
# minus missed field goals (FGA - FG), missed free throws (FTA - FT) and turnovers
players["EFF"] = players.PTS + players.TRB + players.AST + players.STL + players.BLK - (players.FGA - players.FG) - (players.FTA - players.FT) - players.TOV

: 

In [None]:
# Swarm plot of the efficiency rating for each value of the 'Pos' column
plt.figure(figsize=(14,6))
sns.swarmplot(
    x=players["Pos"],
    y=players["EFF"]
)

: 

In [None]:
# Summary statistics rounded to one decimal
ages = players.Age.describe().round(decimals=1) # its 25% quantile is used later to define what a young player is
points = players.PTS.describe().round(decimals=1)

: 

In [None]:
# Distribution of points over all players (trailing ';' hides the cell's text output)
sns.boxplot(data=players, y="PTS");

: 

In [None]:
# Distribution of points for each age
plt.figure(figsize=(14,6))
sns.boxplot(data=players, x="Age", y="PTS");

: 

The plot above shows that among players younger than 23 (our definition of a young player), scoring more than 15 points is exceptional. These thresholds are then used to filter the current player base and keep only the standout players.

In [None]:
# Upper age bound of a "young" player: the 25% quantile of the age distribution
young_age = ages["25%"]
# Selection criteria: efficiency of at least 12, at least 15 points, and young
futur_super_star_def = f"(EFF >= 12) & (PTS >= 15) & (Age <= {young_age})"
# A single sort is enough: the previous preliminary sort by descending EFF was
# entirely overridden by this one. Resulting order: Age ascending, then EFF ascending.
# NOTE(review): if the best players of each age should come first, use
# ascending=[True, False] — confirm the intended order.
players.query(futur_super_star_def).sort_values(["Age", "EFF"], ascending=True)

: 