# Import Packages and Download Player Data

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np

In [2]:
# Download 2024/2025 NBA Player Data from basketball-reference.com
url = 'https://www.basketball-reference.com/leagues/NBA_2025_per_game.html#per_game_stats'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Preview HTML output
print(soup)

# Find and gather the applicable table sections of the webpage

In [4]:
# Search for data table headers and labels
soup.find_all('th')

[<th aria-label="Rk" class="ranker poptip center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>,
 <th aria-label="Player" class="poptip sort_default_asc center" data-stat="name_display" scope="col">Player</th>,
 <th aria-label="Age" class="poptip center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>,
 <th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_name_abbr" scope="col">Team</th>,
 <th aria-label="Pos" class="poptip center" data-stat="pos" data-tip="Position" scope="col">Pos</th>,
 <th aria-label="G" class="poptip center" data-stat="games" data-tip="Games" scope="col">G</th>,
 <th aria-label="GS" class="poptip center" data-stat="games_started" data-tip="Games Started" scope="col">GS</th>,
 <th aria-label="MP" class="poptip hide_non_quals center" data-filter="1" data-name="" data-stat="mp_per_g" data-tip="Minutes Played" scope="col">MP</th>,
 <th aria-label="FG" class="poptip hide_non_quals center" data-fi

In [5]:
# Search for data values
soup.find_all('td')

[<td class="left" csk="Gilgeous-Alexander Shai-1" data-append-csv="gilgesh01" data-stat="name_display"><a href="/players/g/gilgesh01.html">Shai Gilgeous-Alexander</a></td>,
 <td class="right" data-stat="age">26</td>,
 <td class="left" data-stat="team_name_abbr"><a href="/teams/OKC/2025.html">OKC</a></td>,
 <td class="center" data-stat="pos">PG</td>,
 <td class="right" data-stat="games">76</td>,
 <td class="right" data-stat="games_started">76</td>,
 <td class="right" csk="34.1842105263" data-stat="mp_per_g">34.2</td>,
 <td class="right" csk="11.3157894737" data-stat="fg_per_g">11.3</td>,
 <td class="right" csk="21.7894736842" data-stat="fga_per_g"><strong>21.8</strong></td>,
 <td class="right" csk="0.5193236715" data-stat="fg_pct">.519</td>,
 <td class="right" csk="2.1447368421" data-stat="fg3_per_g">2.1</td>,
 <td class="right" csk="5.7236842105" data-stat="fg3a_per_g">5.7</td>,
 <td class="right" csk="0.3747126437" data-stat="fg3_pct">.375</td>,
 <td class="right" csk="9.1710526316" d

# Scraping the data from the website

In [6]:
# Narrow down search to collect only the entries where the scope = col, this will collect the column labels and set to variable "nba_stat_titles"
nba_stat_titles = soup.find_all('th', attrs={'scope': 'col'})

In [7]:
# Preview output
print(nba_stat_titles)

[<th aria-label="Rk" class="ranker poptip center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>, <th aria-label="Player" class="poptip sort_default_asc center" data-stat="name_display" scope="col">Player</th>, <th aria-label="Age" class="poptip center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>, <th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_name_abbr" scope="col">Team</th>, <th aria-label="Pos" class="poptip center" data-stat="pos" data-tip="Position" scope="col">Pos</th>, <th aria-label="G" class="poptip center" data-stat="games" data-tip="Games" scope="col">G</th>, <th aria-label="GS" class="poptip center" data-stat="games_started" data-tip="Games Started" scope="col">GS</th>, <th aria-label="MP" class="poptip hide_non_quals center" data-filter="1" data-name="" data-stat="mp_per_g" data-tip="Minutes Played" scope="col">MP</th>, <th aria-label="FG" class="poptip hide_non_quals center" data-filter="1"

In [8]:
# Collect only the "text" value of each entry and set the output to variable, "set" is used here to remove duplicates
stat_label_list = set([title.text.strip() for title in nba_stat_titles])

# Preview label outputs
print(stat_label_list)

{'2P%', '3P%', 'FGA', 'FG%', 'MP', 'BLK', 'G', 'TOV', 'FG', 'TRB', 'STL', 'Awards', 'Team', 'eFG%', '3P', 'Player', 'ORB', 'GS', 'AST', 'Pos', 'PTS', '2P', 'FTA', 'DRB', 'PF', 'Age', '3PA', 'FT', '2PA', 'Rk', 'FT%'}


In [9]:
df = pd.DataFrame(columns = stat_label_list)

df

Unnamed: 0,2P%,3P%,FGA,FG%,MP,BLK,G,TOV,FG,TRB,...,2P,FTA,DRB,PF,Age,3PA,FT,2PA,Rk,FT%


In [10]:
# Reorder the label names to match the data from table and preview
df_reordered = df.loc[:, ['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards']]

df = df_reordered
df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards


In [11]:
# Search for the table data to insert to dataframe using "tr" elements
soup.find_all('tr')

[<tr> <th aria-label="Rk" class="ranker poptip center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th> <th aria-label="Player" class="poptip sort_default_asc center" data-stat="name_display" scope="col">Player</th> <th aria-label="Age" class="poptip center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th> <th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_name_abbr" scope="col">Team</th> <th aria-label="Pos" class="poptip center" data-stat="pos" data-tip="Position" scope="col">Pos</th> <th aria-label="G" class="poptip center" data-stat="games" data-tip="Games" scope="col">G</th> <th aria-label="GS" class="poptip center" data-stat="games_started" data-tip="Games Started" scope="col">GS</th> <th aria-label="MP" class="poptip hide_non_quals center" data-filter="1" data-name="" data-stat="mp_per_g" data-tip="Minutes Played" scope="col">MP</th> <th aria-label="FG" class="poptip hide_non_quals center" data-filter="1" da

In [12]:
# Set data to variable
column_data = soup.find_all('tr')

In [13]:
#loop to find only the individual data inside the "tr" elements using the "td" element and use loop to insert data into the dataframe
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    
    # Check if the number of elements matches the DataFrame columns
    if len(individual_row_data) == len(df.columns):
        #enter each individual row data into the dataframe from before while looping through the rows
        length = len(df)
        df.loc[length] = individual_row_data
    else:
        # Option 1: Skip rows that don't match
        print(f"Skipping row with {len(individual_row_data)} elements (expected {len(df.columns)})")

Skipping row with 0 elements (expected 30)


In [14]:
df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,.519,...,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7,"MVP-1,DPOY-10,CPOY-8,AS,NBA1"
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,.601,...,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4,"MVP-3,DPOY-8,AS,NBA1"
2,Nikola JokiÄ,29,DEN,C,70,70,36.7,11.2,19.5,.576,...,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6,"MVP-2,CPOY-2,AS,NBA1"
3,Luka DonÄiÄ,25,2TM,PG,50,50,35.4,9.2,20.5,.450,...,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2,
4,Luka DonÄiÄ,25,DAL,PG,22,22,35.7,9.8,21.2,.464,...,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,P.J. Tucker,39,NYK,SF,1,0,4.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
952,Jordan Walsh,20,BOS,PF,5,0,3.0,0.0,0.2,.000,...,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,
953,Cam Whitmore,20,HOU,SF,3,0,1.7,0.0,0.7,.000,...,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.0,
954,Jeenathan Williams,25,HOU,SG,3,0,1.7,0.0,0.3,.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.3,0.0,


# Cleaning data

In [15]:
# Drop row with "league average"
df = df.drop(df[df['Player'] == 'League Average'].index)

df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,.519,...,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7,"MVP-1,DPOY-10,CPOY-8,AS,NBA1"
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,.601,...,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4,"MVP-3,DPOY-8,AS,NBA1"
2,Nikola JokiÄ,29,DEN,C,70,70,36.7,11.2,19.5,.576,...,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6,"MVP-2,CPOY-2,AS,NBA1"
3,Luka DonÄiÄ,25,2TM,PG,50,50,35.4,9.2,20.5,.450,...,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2,
4,Luka DonÄiÄ,25,DAL,PG,22,22,35.7,9.8,21.2,.464,...,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,Reed Sheppard,20,HOU,PG,3,0,3.3,0.0,0.7,.000,...,0.0,0.3,0.3,0.3,0.7,0.3,0.3,0.3,0.0,
951,P.J. Tucker,39,NYK,SF,1,0,4.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
952,Jordan Walsh,20,BOS,PF,5,0,3.0,0.0,0.2,.000,...,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,
953,Cam Whitmore,20,HOU,SF,3,0,1.7,0.0,0.7,.000,...,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.0,


In [16]:
#Check for duplicates
duplicate = df[df.duplicated('Player')]
duplicate

#Leaving duplicates as they contain team specific data for players who were traded, but in some cases they should be removed or cleaned

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
4,Luka DonÄiÄ,25,DAL,PG,22,22,35.7,9.8,21.2,.464,...,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,
5,Luka DonÄiÄ,25,LAL,PG,28,28,35.1,8.8,20.0,.438,...,0.9,7.2,8.1,7.5,1.6,0.4,3.7,2.4,28.2,
17,Anthony Davis,31,LAL,C,42,42,34.3,9.5,18.0,.528,...,2.8,9.0,11.9,3.4,1.3,2.1,2.2,2.0,25.7,
18,Anthony Davis,31,DAL,PF,9,9,29.6,7.8,16.9,.461,...,1.7,8.4,10.1,4.4,0.6,2.2,2.2,1.8,20.0,
32,De'Aaron Fox,27,SAC,PG,45,45,37.0,9.2,19.6,.469,...,1.0,4.0,5.0,6.1,1.5,0.4,3.0,2.6,25.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,Reed Sheppard,20,HOU,PG,3,0,3.3,0.0,0.7,.000,...,0.0,0.3,0.3,0.3,0.7,0.3,0.3,0.3,0.0,
951,P.J. Tucker,39,NYK,SF,1,0,4.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,
952,Jordan Walsh,20,BOS,PF,5,0,3.0,0.0,0.2,.000,...,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,
953,Cam Whitmore,20,HOU,SF,3,0,1.7,0.0,0.7,.000,...,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.0,


In [17]:
# Show all columns
pd.set_option('display.max_columns', None)

# Show all rows
pd.set_option('display.max_rows', None)

# Prevent column width cutoff
pd.set_option('display.max_colwidth', None)

pd.set_option('display.width', None)

In [18]:
#Use Fuzz to fix incorrectly formatted names
target_players = ['Nikola JokiÄ', 'Luka DonÄiÄ', 'Kristaps PorziÅÄ£is', 'Alperen ÅengÃ¼n', 'Nikola VuÄeviÄ', 'Dennis SchrÃ¶der', 'Bogdan BogdanoviÄ', 'Jonas ValanÄiÅ«nas', 'Nikola JoviÄ', 'Jusuf NurkiÄ', 'Vasilije MiciÄ', 'Karlo MatkoviÄ', 'Lester QuiÃ±ones', 'Tidjane SalaÃ¼n', 'Moussa DiabatÃ©', 'Dario Å ariÄ', 'Armel TraorÃ©', 'Skal LabissiÃ¨re', 'Vlatko ÄanÄar']
valid_players = ['Nikola Jokic', 'Luka Doncic', 'Kristaps Porzingis', 'Alperen Sengun', 'Nikola Vucevic', 'Dennis Schroder', 'Bogdan Bogdanovic', 'Jonas Valanciunas', 'Nikola Jovic', 'Jusuf Nurkic', 'Vasilije Micic', 'Karlo Matkovic', 'Lester Quinones', 'Tidjane Salaun', 'Moussa Diabate', 'Dario Saric', 'Armel Traore', 'Skal Labissiere', 'Vlatko Cancar']

from fuzzywuzzy import process

# Function to apply fuzzy fix
def fuzzy_fix_player(val, valid_list, threshold=85):
    match, score = process.extractOne(val, valid_list)
    return match if score >= threshold else val

# Apply fix ONLY to rows where 'player' is in target_players
df['Player'] = df['Player'].apply(
    lambda x: fuzzy_fix_player(x, valid_players) if x in target_players else x
)

In [19]:
#Fix player names where the fuzz is not working
name_mapping = {
    "Nikola Joki": "Nikola Jokic",
    "Vasilije Mici": "Vasilije Micic",
    "Dario Å": "Dario Saric"
}

# Loop through each mapping and update rows that start with the key
for start_str, new_name in name_mapping.items():
    mask = df["Player"].str.startswith(start_str, na=False)
    df.loc[mask, "Player"] = new_name

In [20]:
# Preview dataframe

df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,0.519,2.1,5.7,0.375,9.2,16.1,0.571,0.569,7.9,8.8,0.898,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7,"MVP-1,DPOY-10,CPOY-8,AS,NBA1"
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,0.601,0.2,0.9,0.222,11.6,18.7,0.62,0.607,6.5,10.6,0.617,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4,"MVP-3,DPOY-8,AS,NBA1"
2,Nikola Jokic,29,DEN,C,70,70,36.7,11.2,19.5,0.576,2.0,4.7,0.417,9.3,14.8,0.627,0.627,5.2,6.4,0.8,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6,"MVP-2,CPOY-2,AS,NBA1"
3,Luka Doncic,25,2TM,PG,50,50,35.4,9.2,20.5,0.45,3.5,9.6,0.368,5.7,10.9,0.522,0.536,6.2,7.9,0.782,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2,
4,Luka Doncic,25,DAL,PG,22,22,35.7,9.8,21.2,0.464,3.4,9.6,0.354,6.4,11.5,0.555,0.544,5.1,6.6,0.767,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,
5,Luka Doncic,25,LAL,PG,28,28,35.1,8.8,20.0,0.438,3.6,9.6,0.379,5.1,10.4,0.493,0.53,7.0,8.9,0.791,0.9,7.2,8.1,7.5,1.6,0.4,3.7,2.4,28.2,
6,Anthony Edwards,23,MIN,SG,79,79,36.3,9.1,20.4,0.447,4.1,10.3,0.395,5.1,10.1,0.501,0.547,5.3,6.3,0.837,0.8,4.9,5.7,4.5,1.2,0.6,3.2,1.9,27.6,"MVP-7,CPOY-3,AS,NBA2"
7,Jayson Tatum,26,BOS,PF,72,72,36.4,9.2,20.3,0.452,3.5,10.1,0.343,5.7,10.2,0.559,0.537,5.0,6.1,0.814,0.7,8.0,8.7,6.0,1.1,0.5,2.9,2.2,26.8,"MVP-4,CPOY-10,AS,NBA1"
8,Kevin Durant,36,PHO,PF,62,62,36.5,9.5,18.1,0.527,2.6,6.0,0.43,7.0,12.1,0.574,0.598,4.9,5.8,0.839,0.4,5.7,6.0,4.2,0.8,1.2,3.1,1.7,26.6,AS
9,Tyrese Maxey,24,PHI,PG,52,52,37.7,9.2,21.0,0.437,3.1,9.2,0.337,6.1,11.8,0.515,0.511,4.9,5.6,0.879,0.3,3.1,3.3,6.1,1.8,0.4,2.4,2.2,26.3,CPOY-10


In [21]:
#Check for special charcters in "Players"
pattern = r'[^a-zA-Z0-9 .-]'

# Create a mask to identify rows with special characters
mask = df['Player'].str.contains(pattern, regex=True)

# Display rows that contain special characters
special_char_rows = df[mask]
special_char_rows

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
31,De'Aaron Fox,27,2TM,PG,62,62,36.1,8.7,18.8,0.463,1.9,6.1,0.31,6.8,12.6,0.538,0.514,4.2,5.1,0.827,0.9,3.9,4.8,6.3,1.5,0.4,2.8,2.6,23.5,
32,De'Aaron Fox,27,SAC,PG,45,45,37.0,9.2,19.6,0.469,2.0,6.4,0.322,7.2,13.3,0.539,0.521,4.5,5.5,0.829,1.0,4.0,5.0,6.1,1.5,0.4,3.0,2.6,25.0,
33,De'Aaron Fox,27,SAS,PG,17,17,34.0,7.4,16.5,0.446,1.5,5.6,0.274,5.8,10.9,0.535,0.493,3.5,4.2,0.819,0.5,3.8,4.3,6.8,1.5,0.3,2.4,2.7,19.7,
88,De'Andre Hunter,27,2TM,SF,64,9,27.2,5.6,11.9,0.47,2.5,6.1,0.405,3.1,5.8,0.537,0.573,3.4,4.0,0.846,0.7,3.3,4.0,1.4,0.8,0.2,1.3,2.4,17.0,6MOY-4
89,De'Andre Hunter,27,ATL,SF,37,4,28.8,6.1,13.3,0.461,2.6,6.7,0.393,3.5,6.6,0.531,0.56,4.1,4.8,0.858,0.7,3.2,3.9,1.5,0.8,0.1,1.6,2.4,19.0,
90,De'Andre Hunter,27,CLE,SF,27,5,25.0,4.9,10.0,0.485,2.2,5.2,0.426,2.6,4.8,0.55,0.596,2.4,2.9,0.821,0.6,3.6,4.2,1.3,0.7,0.3,0.9,2.5,14.3,
150,D'Angelo Russell,28,2TM,PG,58,36,25.5,4.3,11.0,0.39,1.9,6.2,0.314,2.3,4.8,0.489,0.479,2.1,2.5,0.834,0.3,2.4,2.8,5.1,1.0,0.4,1.9,2.0,12.6,
151,D'Angelo Russell,28,LAL,PG,29,10,26.3,4.4,10.7,0.415,1.9,5.8,0.333,2.5,4.9,0.51,0.505,1.6,1.8,0.849,0.3,2.5,2.8,4.7,0.8,0.1,1.7,1.8,12.4,
152,D'Angelo Russell,28,BRK,PG,29,26,24.7,4.1,11.3,0.367,2.0,6.6,0.297,2.2,4.7,0.467,0.454,2.6,3.2,0.826,0.4,2.4,2.8,5.6,1.1,0.7,2.0,2.1,12.9,
213,De'Anthony Melton,26,GSW,SG,6,2,20.2,3.7,9.0,0.407,2.2,5.8,0.371,1.5,3.2,0.474,0.528,0.8,1.3,0.625,1.0,2.3,3.3,2.8,1.2,0.3,1.7,3.0,10.3,


In [22]:
#Fill empty entries with NaN(Null) values
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

#Remove "Awards" column
df = df.drop(columns=['Awards'])

In [23]:
# Preview Dataframe
df

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,0.519,2.1,5.7,0.375,9.2,16.1,0.571,0.569,7.9,8.8,0.898,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,0.601,0.2,0.9,0.222,11.6,18.7,0.62,0.607,6.5,10.6,0.617,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4
2,Nikola Jokic,29,DEN,C,70,70,36.7,11.2,19.5,0.576,2.0,4.7,0.417,9.3,14.8,0.627,0.627,5.2,6.4,0.8,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6
3,Luka Doncic,25,2TM,PG,50,50,35.4,9.2,20.5,0.45,3.5,9.6,0.368,5.7,10.9,0.522,0.536,6.2,7.9,0.782,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2
4,Luka Doncic,25,DAL,PG,22,22,35.7,9.8,21.2,0.464,3.4,9.6,0.354,6.4,11.5,0.555,0.544,5.1,6.6,0.767,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1
5,Luka Doncic,25,LAL,PG,28,28,35.1,8.8,20.0,0.438,3.6,9.6,0.379,5.1,10.4,0.493,0.53,7.0,8.9,0.791,0.9,7.2,8.1,7.5,1.6,0.4,3.7,2.4,28.2
6,Anthony Edwards,23,MIN,SG,79,79,36.3,9.1,20.4,0.447,4.1,10.3,0.395,5.1,10.1,0.501,0.547,5.3,6.3,0.837,0.8,4.9,5.7,4.5,1.2,0.6,3.2,1.9,27.6
7,Jayson Tatum,26,BOS,PF,72,72,36.4,9.2,20.3,0.452,3.5,10.1,0.343,5.7,10.2,0.559,0.537,5.0,6.1,0.814,0.7,8.0,8.7,6.0,1.1,0.5,2.9,2.2,26.8
8,Kevin Durant,36,PHO,PF,62,62,36.5,9.5,18.1,0.527,2.6,6.0,0.43,7.0,12.1,0.574,0.598,4.9,5.8,0.839,0.4,5.7,6.0,4.2,0.8,1.2,3.1,1.7,26.6
9,Tyrese Maxey,24,PHI,PG,52,52,37.7,9.2,21.0,0.437,3.1,9.2,0.337,6.1,11.8,0.515,0.511,4.9,5.6,0.879,0.3,3.1,3.3,6.1,1.8,0.4,2.4,2.2,26.3


In [24]:
#Check df datatypes
print(df.dtypes)

Player    object
Age       object
Team      object
Pos       object
G         object
GS        object
MP        object
FG        object
FGA       object
FG%       object
3P        object
3PA       object
3P%       object
2P        object
2PA       object
2P%       object
eFG%      object
FT        object
FTA       object
FT%       object
ORB       object
DRB       object
TRB       object
AST       object
STL       object
BLK       object
TOV       object
PF        object
PTS       object
dtype: object


In [25]:
#correct the df datatypes
df = df.astype({
    'Player': 'object',
    'Age': 'int',
    'Team': 'object',
    'Pos': 'object',
    'G': 'int',
    'GS': 'int',
    'MP': 'float64',
    'FG': 'float64',
    'FGA': 'float64',
    'FG%': 'float64',
    '3P': 'float64',
    '3PA': 'float64',
    '3P%': 'float64',
    '2P': 'float64',
    '2PA': 'float64',
    '2P%': 'float64',
    'eFG%': 'float64',
    'FT': 'float64',
    'FTA': 'float64',
    'FT%': 'float64',
    'ORB': 'float64',
    'DRB': 'float64',
    'TRB': 'float64',
    'AST': 'float64',
    'STL': 'float64',
    'BLK': 'float64',
    'TOV': 'float64',
    'PF': 'float64',
    'PTS': 'float64'
})

In [26]:
#Check df datatypes
print(df.dtypes)

Player     object
Age         int64
Team       object
Pos        object
G           int64
GS          int64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object


# Data Manipulation

In [27]:
# Preview Datadframe

df.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,0.519,2.1,5.7,0.375,9.2,16.1,0.571,0.569,7.9,8.8,0.898,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,0.601,0.2,0.9,0.222,11.6,18.7,0.62,0.607,6.5,10.6,0.617,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4
2,Nikola Jokic,29,DEN,C,70,70,36.7,11.2,19.5,0.576,2.0,4.7,0.417,9.3,14.8,0.627,0.627,5.2,6.4,0.8,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6
3,Luka Doncic,25,2TM,PG,50,50,35.4,9.2,20.5,0.45,3.5,9.6,0.368,5.7,10.9,0.522,0.536,6.2,7.9,0.782,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2
4,Luka Doncic,25,DAL,PG,22,22,35.7,9.8,21.2,0.464,3.4,9.6,0.354,6.4,11.5,0.555,0.544,5.1,6.6,0.767,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1


In [28]:
#Create Total columns, and round to zero decimals
#use .astype() to to assign new columns as integer

df = df.assign(
    total_PTS = (df['PTS'] * df['G']).round(0).astype(int),
    total_RBS = (df['TRB'] * df['G']).round(0).astype(int),
    total_AST = (df['AST'] * df['G']).round(0).astype(int),
    total_MP = (df['MP'] * df['G']).round(0).astype(int),
    total_BLK = (df['BLK'] * df['G']).round(0).astype(int),
    total_STL = (df['STL'] * df['G']).round(0).astype(int),
    total_3PT = (df['3P'] * df['G']).round(0).astype(int),
    total_2P = (df['2P'] * df['G']).round(0).astype(int)
)

df.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,total_PTS,total_RBS,total_AST,total_MP,total_BLK,total_STL,total_3PT,total_2P
0,Shai Gilgeous-Alexander,26,OKC,PG,76,76,34.2,11.3,21.8,0.519,2.1,5.7,0.375,9.2,16.1,0.571,0.569,7.9,8.8,0.898,0.9,4.1,5.0,6.4,1.7,1.0,2.4,2.2,32.7,2485,380,486,2599,76,129,160,699
1,Giannis Antetokounmpo,30,MIL,PF,67,67,34.2,11.8,19.7,0.601,0.2,0.9,0.222,11.6,18.7,0.62,0.607,6.5,10.6,0.617,2.2,9.7,11.9,6.5,0.9,1.2,3.1,2.3,30.4,2037,797,436,2291,80,60,13,777
2,Nikola Jokic,29,DEN,C,70,70,36.7,11.2,19.5,0.576,2.0,4.7,0.417,9.3,14.8,0.627,0.627,5.2,6.4,0.8,2.9,9.9,12.7,10.2,1.8,0.6,3.3,2.3,29.6,2072,889,714,2569,42,126,140,651
3,Luka Doncic,25,2TM,PG,50,50,35.4,9.2,20.5,0.45,3.5,9.6,0.368,5.7,10.9,0.522,0.536,6.2,7.9,0.782,0.8,7.4,8.2,7.7,1.8,0.4,3.6,2.5,28.2,1410,410,385,1770,20,90,175,285
4,Luka Doncic,25,DAL,PG,22,22,35.7,9.8,21.2,0.464,3.4,9.6,0.354,6.4,11.5,0.555,0.544,5.1,6.6,0.767,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,618,183,172,785,9,44,75,141


# Export NBA player data via GitHub API or local file save

Please choose either method 1 or 2 to export data

## 1. Export via GitHub API

In [42]:
import base64

# ==== CONFIGURATION ====
#For security reasons, please enter your personal github repository information to export NBA25_output.csv

GITHUB_TOKEN = "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN"  #Enter Token
OWNER = "your-username-or-org"       #Enter Github Username 
REPO = "your-repo-name"              #Enter Repository Name
BRANCH = "main"                      #Enter Branch
FILE_PATH = "NBA25_output.csv"

# Convert DataFrame to CSV string
csv_data = df.to_csv(index=False)

# ==== PREPARE API REQUEST ====
api_url = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{FILE_PATH}"
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Check if file exists to decide between update or create
res = requests.get(api_url, headers=headers, params={"ref": BRANCH})

if res.status_code == 200:
    # File exists → update
    sha = res.json()["sha"]
    message = "Update CSV from Python script"
    payload = {
        "message": message,
        "branch": BRANCH,
        "content": base64.b64encode(csv_data.encode()).decode(),
        "sha": sha
    }
    r = requests.put(api_url, headers=headers, json=payload)

elif res.status_code == 404:
    # File does not exist → create
    message = "Add CSV from Python script"
    payload = {
        "message": message,
        "branch": BRANCH,
        "content": base64.b64encode(csv_data.encode()).decode()
    }
    r = requests.put(api_url, headers=headers, json=payload)

else:
    raise Exception(f"Error checking file: {res.status_code}, {res.text}")

# ==== RESULT ====
if r.status_code in (200, 201):
    print(f"✅ CSV uploaded to GitHub at {FILE_PATH}")
else:
    print(f"❌ Error: {r.status_code}, {r.text}")

✅ CSV uploaded to GitHub at NBA25_output.csv


## 2. Export via local file save

In [None]:
# Enter path to desired file download location
df.to_csv('{ENTER_PATH_TO_FOLDER}/NBA25_output.csv', index=False)