In [None]:
#imports
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import teamdetails, teamplayerdashboard, commonallplayers, playergamelog, commonteamroster, playercareerstats
from nba_api.stats.static import players, teams
from datetime import timedelta
from plotnine import *
import time

In [None]:
#code from Tuukka to get list of players
def get_team_ids():
    """Return a dict mapping each team's full name to its id.

    The teams come from ``teams.get_teams()``; each entry's ``"full_name"``
    becomes a key and its ``'id'`` the value.
    """
    return {entry["full_name"]: entry['id'] for entry in teams.get_teams()}

def get_list_players(year):
    """Collect every team's roster for one season into a single DataFrame.

    Args:
        year: Calendar year in which the season starts; 2017 is turned into
            the season string '2017-18'.

    Returns:
        A DataFrame stacking the first data frame returned by
        ``CommonTeamRoster`` for each team id from ``get_team_ids()``, with a
        fresh 0..n-1 index. An empty DataFrame if there are no teams.
    """
    season = str(year) + '-' + str(year + 1)[2:4]

    # Gather the per-team frames and concatenate once: growing a DataFrame
    # inside the loop re-copies all earlier rows on every iteration, and
    # seeding the concat with an empty frame can disturb column dtypes.
    rosters = []
    for team_id in get_team_ids().values():
        roster = commonteamroster.CommonTeamRoster(team_id, season)
        rosters.append(roster.get_data_frames()[0])
        # Pause between requests (presumably to stay under the API's rate limit).
        time.sleep(1)

    if not rosters:
        return pd.DataFrame()
    return pd.concat(rosters, ignore_index=True)

In [None]:
# Show up to 500 columns when a DataFrame is displayed
pd.options.display.max_columns = 500

In [None]:
#Player List for last seven seasons
# Fetch one roster table per season start year (2017..2023) and stack them.
# Each season's frame is kept in players_list so that `w` ends up holding all
# seven seasons instead of being overwritten on every pass of the loop.
players_list = [get_list_players(season_start) for season_start in range(2017, 2024)]
w = pd.concat(players_list, ignore_index=True)
w

In [None]:
#importing dataset that has birthdays for players that are in season
bdays = pd.read_csv("NBA_DOB_InSeason.csv")

# Parse the birthdate once and derive every piece from the parsed value.
# Re-parsing a bare 'MM-DD' string falls back to a non-leap default year,
# which silently turns February 29 birthdays into missing values.
birthdates = pd.to_datetime(bdays['Birthdate'])

bdays['Birthdate'] = birthdates.astype(str)

bdays['Birth Year'] = birthdates.dt.strftime('%Y')

bdays['Birth Day'] = birthdates.dt.strftime('%m-%d')

# Keep players born in 1975 or later; rows with a missing birthdate compare
# as False and are dropped.
bdays = bdays[birthdates.dt.year >= 1975]

help_df = bdays.copy().reset_index()

help_df = help_df[['id', 'Birth Day']]

In [None]:
#subsetting to only get players that played in the last seven seasons
# Inner join: keep birthday rows whose 'id' matches a roster 'PLAYER_ID'
in_bdays = pd.merge(bdays, w, how='inner', left_on='id', right_on='PLAYER_ID')

In [None]:
#function to find the difference in stats between two separate one-row dataframes
def calculate_differences(df1, df2):
    """Subtract the first row of ``df2`` from the first row of ``df1``, column by column.

    Args:
        df1: DataFrame whose first row is the minuend. Must contain a
            'Player_ID' column.
        df2: DataFrame whose first row is the subtrahend.

    Returns:
        A one-row DataFrame with 'Player_ID' (copied from ``df1``) followed by
        one column per remaining column of ``df1`` holding:
          * ``df1 - df2`` when the column is numeric in both frames,
          * the string 'Not numerical' when it is non-numeric in either frame,
          * the string 'Column not found in df2' when ``df2`` lacks it.
    """

    def is_number_column(series):
        # Accept every integer/float width, not only int64/float64. Booleans
        # are excluded because subtracting them is not a meaningful difference.
        return (pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series))

    # Seed 'Player_ID' first so it is the leading column of the result.
    differences = {'Player_ID': []}

    for col in df1.columns:
        if col not in df2.columns:
            differences[col] = ['Column not found in df2']
        elif is_number_column(df1[col]) and is_number_column(df2[col]):
            differences[col] = [df1[col].iloc[0] - df2[col].iloc[0]]
        else:
            # Checking both sides keeps a text column in df2 from raising
            # a TypeError during the subtraction.
            differences[col] = ['Not numerical']

    # The id is carried over as-is rather than differenced.
    differences['Player_ID'] = [df1['Player_ID'].iloc[0]]

    return pd.DataFrame(differences)

In [None]:
#function using calculate_differences to get, for one player and season, the difference between their average stats in the week after their birthday and in the 45 days leading up to it
def _month_day_window(game_days, start, end):
    """Boolean mask of 'MM-DD' strings inside the inclusive window [start, end].

    'MM-DD' strings sort chronologically within one calendar year. When
    ``start`` sorts after ``end`` the window crosses New Year, so it is the
    union of the end of one year and the beginning of the next.
    """
    if start <= end:
        return (game_days >= start) & (game_days <= end)
    return (game_days >= start) | (game_days <= end)


def get_stat_differences(player_id, season, birthday):
    """Average stats after a player's birthday minus the average before it.

    The "after" window runs from 1 to 8 days after the birthday and the
    "before" window from 45 days before the birthday up to the birthday
    itself (both inclusive). Games are matched on month and day only.

    Args:
        player_id: Player id passed to ``PlayerGameLog``.
        season: Season string passed to ``PlayerGameLog`` (e.g. '2017-18');
            also written to the 'Season' column of the result.
        birthday: Birthday as an 'MM-DD' string.

    Returns:
        A one-row DataFrame with one difference column per stat, followed by
        'Player_id' (string) and 'Season'. Differences are NaN when the
        "before" window has no games. Returns None when the birthday cannot
        be parsed, the game log request raises KeyError, the game log is
        empty, or no game falls in the "after" window.
    """
    stat_columns = ['MIN', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'STL',
                    'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']

    # A bare 'MM-DD' is parsed against a non-leap default year, so '02-29'
    # (like a missing birthday) yields NaT. Bail out before spending an API
    # call instead of failing later when the window bounds are formatted.
    birthday = pd.to_datetime(birthday, format='%m-%d', errors='coerce')
    if pd.isna(birthday):
        return None

    try:
        x = playergamelog.PlayerGameLog(player_id=player_id, season=season, season_type_all_star='Regular Season').get_data_frames()[0]
    except KeyError:
        return None

    if x.empty:
        return None

    # Format month-day straight from the parsed date; re-parsing 'MM-DD'
    # strings would drop games played on February 29.
    game_days = pd.to_datetime(x['GAME_DATE']).dt.strftime('%m-%d')
    # Restrict to the stat columns and force them numeric so mean() never
    # has to deal with text columns such as 'WL' or 'Game_ID'.
    stats = x[stat_columns].apply(pd.to_numeric, errors='coerce')

    birthday_md = birthday.strftime('%m-%d')
    day_after = (birthday + timedelta(1)).strftime('%m-%d')
    week_after = (birthday + timedelta(8)).strftime('%m-%d')
    month_hbefore = (birthday - timedelta(45)).strftime('%m-%d')

    after_mask = _month_day_window(game_days, day_after, week_after)
    # End the "before" window on the birthday so that no game is counted in
    # both windows.
    before_mask = _month_day_window(game_days, month_hbefore, birthday_md)

    # Without any game after the birthday there is nothing to compare.
    if not after_mask.any():
        return None

    games_after = stats[after_mask].mean().to_frame().T
    games_after['Player_ID'] = str(x['Player_ID'].iloc[0])
    games_before = stats[before_mask].mean().to_frame().T

    differences = calculate_differences(games_after, games_before)
    # Move the id to the end under its output name, then tag the season.
    differences['Player_id'] = differences.pop('Player_ID')
    differences['Season'] = season
    return differences

In [None]:
#seasons we want to look at
# '2017-18' through '2023-24': start year plus the two-digit following year
seasons = [f"{start}-{(start + 1) % 100:02d}" for start in range(2017, 2024)]

In [None]:
#Function to run through each row of a df with a birthday and player id column and return the differences for that player each season
import time
def all_diffs(birthday_df, seasons):
    """Collect birthday stat differences for every player and season.

    Args:
        birthday_df: DataFrame with an 'id' column (player id) and a
            'Birth Day' column. When an id appears more than once, the
            birthday of its first row is used and the player is processed once.
        seasons: Iterable of season values handed to ``get_stat_differences``.

    Returns:
        One DataFrame stacking every non-empty result of
        ``get_stat_differences``, ordered by player (first appearance) and
        then season, or None if nothing was produced.
    """
    # One birthday per player: repeated ids would only repeat identical
    # requests and produce duplicate rows.
    birthdays = birthday_df.drop_duplicates(subset='id').set_index('id')['Birth Day']

    all_differences = []
    for player_id, birthday in birthdays.items():
        for season in seasons:
            try:
                df = get_stat_differences(player_id, season, birthday)
            except Exception as exc:
                # Best effort: one failing player/season must not abort the
                # whole run, but report it so missing rows can be explained.
                print(f"Skipping player {player_id}, season {season}: {exc!r}")
                continue
            finally:
                # Pause between requests whether or not the call succeeded.
                time.sleep(.2)

            if df is not None and not df.empty:
                all_differences.append(df)

    if all_differences:
        return pd.concat(all_differences, ignore_index=True)
    return None

In [None]:
#df to run through the all_diffs function
# Only the player id, the 'MM-DD' birthday and the roster season are kept
birthday_columns = ['id', 'Birth Day', 'SEASON']
help_df = in_bdays[birthday_columns]

In [None]:
#All differences for every player for the last seven seasons
diffs = all_diffs(help_df, seasons)
# all_diffs returns None when no player/season produced a row; stop here with
# a clear message instead of an AttributeError on None.copy().
if diffs is None:
    raise ValueError("all_diffs returned no rows; check the birthday data and the API responses")
# Work on a copy so the raw result stays available in `diffs`
based = diffs.copy()

In [None]:
#dropping duplicates (edge cases)
# Keeps the first row for each (Player_id, Season) pair and modifies `based`
# in place, so re-running this cell is harmless but the dropped rows are gone
# from `based` (the untouched result is still available in `diffs`).
based.drop_duplicates(subset=['Player_id', 'Season'], keep='first', inplace=True)

In [None]:
#getting season stats for all players to filter the based df by a condition
# Each career-stats request is filtered down to the seasons of interest, so one
# request per player is enough; `based` has one row per player and season and
# would otherwise trigger the same request several times.
season_stats = []
for career_player_id in based['Player_id'].unique():
    career = playercareerstats.PlayerCareerStats(per_mode36 = 'PerGame', player_id = career_player_id).get_data_frames()[0]
    season_stats.append(career[career['SEASON_ID'].isin(seasons)])
    # Pause between requests
    time.sleep(.2)
player_stats = pd.concat(season_stats, ignore_index=True)

In [None]:
# Display only: the sorted frame is not stored
player_stats.sort_values(by=['PLAYER_ID', 'SEASON_ID'])

In [None]:
#If players played for multiple teams in a season, wanted to average those rows
# numeric_only=True averages just the numeric columns; without it, recent
# pandas versions raise on the text columns instead of skipping them.
player_stats_grouped = (
    player_stats
    .groupby(['PLAYER_ID', 'SEASON_ID'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
#Dropping duplicate rows (edge cases)
# Keep the first row of every (PLAYER_ID, SEASON_ID) pair
season_key = ['PLAYER_ID', 'SEASON_ID']
player_stats_grouped = player_stats_grouped.drop_duplicates(subset=season_key, keep='first')

In [None]:
#subsetting this df to only have players that started at least 50 percent of the games that they played in
# Share of played games (GP) that the player started (GS); strictly above one half qualifies
started_share = player_stats_grouped['GS'] / player_stats_grouped['GP']
high_min = player_stats_grouped[started_share > .5][['PLAYER_ID', 'SEASON_ID']]

In [None]:
# Display only: the sorted frame is not stored
based.sort_values(by=['Player_id', 'Season'])

In [None]:
#making sure dtypes are the same for merge
based['Player_id'] = based['Player_id'].astype(str)
# high_min was sliced out of player_stats_grouped; build a new frame with
# assign() rather than writing into that slice, which would raise pandas'
# SettingWithCopyWarning.
high_min = high_min.assign(PLAYER_ID=high_min['PLAYER_ID'].astype(str))

In [None]:
#Only include difference data for players that started more than half of the games that they played in (important players)
# Inner join on (player, season): rows of `based` without a match in `high_min` are dropped
merged = pd.merge(
    based,
    high_min,
    how='inner',
    left_on=['Player_id', 'Season'],
    right_on=['PLAYER_ID', 'SEASON_ID'],
)

In [None]:
# Display only: the sorted frame is not stored
merged.sort_values(by=['PLAYER_ID', 'SEASON_ID'])

In [None]:
#for loop to make many histograms of each numerical column at the same time
difference_columns = merged.drop(['Player_id',  'Season', 'SEASON_ID', 'PLAYER_ID'], axis=1).columns
for i in difference_columns:
    # Series.max()/min() skip missing values; the builtin max()/min() give an
    # order-dependent answer as soon as the column contains a NaN.
    value_range = merged[i].max() - merged[i].min()

    # Narrow ranges get fine orange bins, wide ranges coarser light-blue bins
    if value_range < 20:
        binwidth, fill = .1, 'orange'
    else:
        binwidth, fill = .5, 'lightblue'

    plot = (ggplot(merged) +
            aes(x=i) +
            geom_histogram(binwidth=binwidth, color='black', fill=fill) +
            labs(title='Histogram of Difference of ' + i, x='Value', y='Frequency') +
            theme_bw())
    print(plot)

In [1]:
#Finding the median of each column in the differences table for plot
# Identifier columns and the percentage stats are left out; despite its name,
# column_means holds per-column medians rounded to 4 decimals.
excluded_columns = ['Player_id', 'Season', 'SEASON_ID', 'PLAYER_ID', "FG_PCT", "FG3_PCT", "FT_PCT"]
column_means = merged.drop(columns=excluded_columns).median().round(4)
column_means

NameError: name 'merged' is not defined

In [None]:
#Barplot showing column averages on the same axis
# One row per stat: its name and the summary value computed in column_means
plot_data = pd.DataFrame({
    'column': column_means.index,
    'average_value': column_means.to_numpy(),
})

column_average_plot = (
    ggplot(plot_data, aes(x='column', y='average_value', fill = 'column'))
    + geom_bar(stat='identity', position='identity', color='black')
    + labs(title='Barplot of Column Averages')
    + xlab('Column')
    + ylab('Average Value')
    + theme_minimal()
    + theme(figure_size=(8, 6), axis_text_x=element_text(angle=45))
    + ylim(-.25, .5)
)
column_average_plot

In [None]:
# Average difference per season for the stats shown in the season trend plot.
# numeric_only=True leaves out the id/season text columns, and errors='ignore'
# lets the drop succeed when a listed column is absent ('Unnamed: 0' only
# exists if `merged` was re-read from a CSV that was saved with its index).
by_year = merged.groupby('Season').mean(numeric_only=True).reset_index()
by_year = by_year.drop(columns=['Unnamed: 0', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'PF'], errors='ignore')
by_year

In [None]:
#Melting the df for a facet wrap plot
# Long format: one row per (Season, stat); stat abbreviations are then spelled out
stat_labels = {
    'MIN': 'MINUTES PLAYED',
    'AST': 'ASSISTS',
    'PTS': 'POINTS',
    'STL': 'STEALS',
    'BLK': 'BLOCKS',
    'PLUS_MINUS': 'PLUS MINUS',
    'REB': 'REBOUNDS',
    'TOV': 'TURNOVERS',
}
year_melted = by_year.melt(id_vars='Season', var_name='variable')
year_melted['variable'] = year_melted['variable'].replace(stat_labels)

In [None]:
#Line plot of each difference in statistic over the seasons
# One facet per stat with its own y scale; the dashed line marks "no difference"
season_trend_plot = (
    ggplot(year_melted, aes(x='Season', y='value', group='variable', color='variable'))
    + geom_line()
    + geom_hline(yintercept=0, linetype='dashed', color='black')
    + facet_wrap('~ variable', scales='free_y', ncol=2)
    + labs(
        x='Season',
        y='Difference',
        title='Difference in Statistics Over the Last Seven Seasons (Avg of 7 days after bday - Avg of 45 days before)',
    )
    + theme_minimal()
    + theme(
        figure_size=(12, 6),
        plot_title=element_text(hjust=0.5),
        axis_text_x=element_text(angle=45),
    )
)
season_trend_plot