# Import Packages and Download Player Data

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np

In [None]:
# Download 2024/2025 NBA per-game player data from basketball-reference.com
url = 'https://www.basketball-reference.com/leagues/NBA_2025_per_game.html#per_game_stats'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Stop here on an HTTP error response instead of parsing an error page,
# which would leave every later cell working on a table with no rows
response.raise_for_status()
# NOTE(review): response.text is decoded with the charset requests infers from
# the HTTP headers. The garbled player names repaired in the cleaning section
# suggest that inference is wrong for this page -- consider setting
# response.encoding explicitly, but only together with the name clean-up cells,
# which currently match on the garbled spellings.
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Preview the beginning of the parsed HTML. Printing the whole document floods
# the cell output and bloats the saved notebook, so only a slice is shown.
print(str(soup)[:2000])

# Find and gather the applicable table sections of the webpage

In [None]:
# Search for data table headers and labels; limit the preview to the first
# few matches so the cell output stays readable
soup.find_all('th', limit=10)

In [None]:
# Search for data values; limit the preview to the first few matches so the
# cell output stays readable
soup.find_all('td', limit=10)

# Scraping the data from the website

In [None]:
# Keep only the <th> elements whose "scope" attribute equals "col"; these are
# the column-label cells, stored in "nba_stat_titles"
nba_stat_titles = soup.find_all('th', scope='col')

In [None]:
# Preview the matched column-label <th> elements
print(nba_stat_titles)

In [None]:
# Collect the stripped text of each header cell. dict.fromkeys removes
# duplicate labels while keeping first-seen order; a set would also
# deduplicate, but it yields an arbitrary label order that can differ
# between runs.
stat_label_list = list(dict.fromkeys(title.text.strip() for title in nba_stat_titles))

# Preview label outputs
print(stat_label_list)

In [None]:
# Start with a row-less DataFrame whose columns are the scraped labels
df = pd.DataFrame(data=None, columns=stat_label_list)

df

In [None]:
# Column order of the values in each table row. Selecting with this list
# reorders the scraped labels and leaves out any label that is not listed.
table_column_order = [
    'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
    'PF', 'PTS', 'Awards',
]
df_reordered = df.loc[:, table_column_order]

# Continue with the reordered frame and preview it
df = df_reordered
df

In [None]:
# Search for the table rows ("tr" elements) that hold the data to insert into
# the dataframe; limit the preview to the first few rows so the cell output
# stays readable
soup.find_all('tr', limit=5)

In [None]:
# Keep every <tr> element of the page for the row-extraction loop
column_data = soup.find_all(name='tr')

In [None]:
# Extract the stripped text of every <td> cell in each <tr> (the first <tr> is
# skipped) and keep only the rows whose cell count matches the DataFrame's
# columns; other rows are reported and skipped.
expected_width = len(df.columns)
player_rows = []
for row in column_data[1:]:
    individual_row_data = [data.text.strip() for data in row.find_all('td')]

    if len(individual_row_data) == expected_width:
        player_rows.append(individual_row_data)
    else:
        print(f"Skipping row with {len(individual_row_data)} elements (expected {expected_width})")

# Build the frame once from the collected rows. Appending with
# df.loc[len(df)] copies the frame on every insert and, when the cell is run
# again, stacks a second copy of every row on top of the first.
df = pd.DataFrame(player_rows, columns=df.columns)

In [None]:
# Preview the DataFrame after the table rows have been inserted
df

# Cleaning data

In [None]:
# Remove every row whose Player value is "League Average"
league_average_rows = df.index[df['Player'] == 'League Average']
df = df.drop(index=league_average_rows)

df

In [None]:
# Check for duplicates: rows whose Player value already appeared in an earlier row
repeated_name_mask = df.duplicated(subset='Player')
duplicate = df[repeated_name_mask]
duplicate

# The duplicates are kept because they hold team-specific data for players who
# were traded; in some analyses they should be removed or cleaned instead

In [None]:
# Lift pandas' display limits so previews are never truncated:
#   display.max_columns  -> show all columns
#   display.max_rows     -> show all rows
#   display.max_colwidth -> prevent column width cutoff
#   display.width        -> no fixed output width
unlimited_display_options = (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
    'display.width',
)
for option_name in unlimited_display_options:
    pd.set_option(option_name, None)

In [None]:
# Fix incorrectly decoded (garbled) player names
# target_players holds the garbled spellings; valid_players holds the plain
# ASCII spelling for the same player at the same position in the list.
target_players = ['Nikola JokiÄ', 'Luka DonÄiÄ', 'Kristaps PorziÅÄ£is', 'Alperen ÅengÃ¼n', 'Nikola VuÄeviÄ', 'Dennis SchrÃ¶der', 'Bogdan BogdanoviÄ', 'Jonas ValanÄiÅ«nas', 'Nikola JoviÄ', 'Jusuf NurkiÄ', 'Vasilije MiciÄ', 'Karlo MatkoviÄ', 'Lester QuiÃ±ones', 'Tidjane SalaÃ¼n', 'Moussa DiabatÃ©', 'Dario Å ariÄ', 'Armel TraorÃ©', 'Skal LabissiÃ¨re', 'Vlatko ÄanÄar']
valid_players = ['Nikola Jokic', 'Luka Doncic', 'Kristaps Porzingis', 'Alperen Sengun', 'Nikola Vucevic', 'Dennis Schroder', 'Bogdan Bogdanovic', 'Jonas Valanciunas', 'Nikola Jovic', 'Jusuf Nurkic', 'Vasilije Micic', 'Karlo Matkovic', 'Lester Quinones', 'Tidjane Salaun', 'Moussa Diabate', 'Dario Saric', 'Armel Traore', 'Skal Labissiere', 'Vlatko Cancar']

import unicodedata

from fuzzywuzzy import process


def fuzzy_fix_player(val, valid_list, threshold=85):
    """Return the closest entry of ``valid_list`` to ``val``.

    Parameters:
        val: the name to look up.
        valid_list: candidate replacement names.
        threshold: minimum match score required to accept the candidate.

    Returns:
        The best-scoring candidate when its score is at least ``threshold``,
        otherwise ``val`` unchanged.
    """
    match, score = process.extractOne(val, valid_list)
    return match if score >= threshold else val


def _repair_and_fold(name):
    """Undo a UTF-8-read-as-Latin-1 decoding of ``name`` and strip diacritics.

    Non-string values are returned unchanged. If ``name`` cannot be
    round-tripped through Latin-1 -> UTF-8 it is treated as already decoded
    correctly and only the diacritic stripping is applied.
    """
    if not isinstance(name, str):
        return name
    try:
        name = name.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII encode with errors='ignore' then drops the combining marks.
    decomposed = unicodedata.normalize('NFKD', name)
    return decomposed.encode('ascii', 'ignore').decode('ascii')


def _fix_player_name(name):
    """Map one Player value to its entry in ``valid_players`` where possible."""
    # Deterministic path: the repaired, accent-free spelling is accepted only
    # when it is one of the curated valid names, so all other players are left
    # exactly as scraped.
    candidate = _repair_and_fold(name)
    if candidate in valid_players:
        return candidate
    # Original path: fuzzy match, applied ONLY to names listed in
    # target_players. On its own this exact-match gate appears to miss some
    # names (a later cell patches several by prefix), hence the path above.
    if name in target_players:
        return fuzzy_fix_player(name, valid_players)
    return name


df['Player'] = df['Player'].apply(_fix_player_name)

In [None]:
# Fallback for names the fuzzy fix did not correct: each key is the beginning
# of a Player value, each value is the full name to store in its place
name_mapping = {
    "Nikola Joki": "Nikola Jokic",
    "Vasilije Mici": "Vasilije Micic",
    "Dario Å": "Dario Saric"
}

# For every prefix, overwrite the Player value of the rows that start with it
# (missing values never match)
for prefix, corrected_name in name_mapping.items():
    has_prefix = df["Player"].str.startswith(prefix, na=False)
    df["Player"] = df["Player"].mask(has_prefix, corrected_name)

In [None]:
# Preview the dataframe after the player-name fixes

df

In [None]:
# Check for special characters in "Player"
# Letters, digits, spaces, periods, hyphens and apostrophes count as ordinary
# name characters; without the apostrophe, names such as "O'Neal" would be
# reported as if they were still garbled.
pattern = r"[^a-zA-Z0-9 .'-]"

# Create a mask to identify rows with special characters; na=False keeps the
# mask strictly boolean so a missing name cannot break the row selection
mask = df['Player'].str.contains(pattern, regex=True, na=False)

# Display rows that contain special characters
special_char_rows = df[mask]
special_char_rows

In [None]:
# Turn empty or whitespace-only entries into NaN (null) values, then remove
# the "Awards" column
df = (
    df
    .replace(r'^\s*$', np.nan, regex=True)
    .drop(columns=['Awards'])
)

In [None]:
# Preview the DataFrame after blank entries became NaN and "Awards" was removed
df

In [None]:
# Check the column datatypes before converting them
print(df.dtypes)

In [None]:
# Correct the df datatypes: text columns stay object, whole-number columns
# become int, and all remaining statistics become float64
text_columns = ['Player', 'Team', 'Pos']
whole_number_columns = ['Age', 'G', 'GS']
decimal_columns = [
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
    'PF', 'PTS',
]

dtype_by_column = {
    **dict.fromkeys(text_columns, 'object'),
    **dict.fromkeys(whole_number_columns, 'int'),
    **dict.fromkeys(decimal_columns, 'float64'),
}
df = df.astype(dtype_by_column)

In [None]:
# Check the column datatypes again after the conversion
print(df.dtypes)

# Data Manipulation

In [None]:
# Preview the first rows of the DataFrame

df.head()

In [None]:
# Create the total columns: each one is a per-game statistic multiplied by
# games played ("G"), rounded to zero decimals and stored as an integer.
# Keys are the new column names, values the per-game columns they come from.
total_sources = {
    'total_PTS': 'PTS',
    'total_RBS': 'TRB',
    'total_AST': 'AST',
    'total_MP': 'MP',
    'total_BLK': 'BLK',
    'total_STL': 'STL',
    'total_3PT': '3P',
    'total_2P': '2P',
}

season_totals = {
    total_name: (df[per_game_name] * df['G']).round(0).astype(int)
    for total_name, per_game_name in total_sources.items()
}
df = df.assign(**season_totals)

df.head()

# Export NBA player data via GitHub API or local file save

Choose either method 1 or method 2 below to export the data; you do not need to run both.

## 1. Export via GitHub API

In [None]:
import base64
import os

# ==== CONFIGURATION ====
# For security reasons, enter your own GitHub repository information to export
# NBA25_output.csv. The token is taken from the GITHUB_TOKEN environment
# variable when it is set, so it does not have to be typed into (and saved
# with) the notebook; otherwise replace the placeholder below.

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN")  #Enter Token
OWNER = "your-username-or-org"       #Enter Github Username 
REPO = "your-repo-name"              #Enter Repository Name
BRANCH = "main"                      #Enter Branch
FILE_PATH = "NBA25_output.csv"
REQUEST_TIMEOUT = 30                 #Seconds to wait for each GitHub API call

# Convert DataFrame to CSV string
csv_data = df.to_csv(index=False)

# ==== PREPARE API REQUEST ====
api_url = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{FILE_PATH}"
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Check if file exists to decide between update or create
res = requests.get(api_url, headers=headers, params={"ref": BRANCH}, timeout=REQUEST_TIMEOUT)

# Any status other than "found" (200) or "not found" (404) is an error
if res.status_code not in (200, 404):
    raise Exception(f"Error checking file: {res.status_code}, {res.text}")

file_exists = res.status_code == 200
message = "Update CSV from Python script" if file_exists else "Add CSV from Python script"

# The request body is the same for create and update, except that an update
# also carries the "sha" returned by the existence check
payload = {
    "message": message,
    "branch": BRANCH,
    "content": base64.b64encode(csv_data.encode()).decode(),
}
if file_exists:
    sha = res.json()["sha"]
    payload["sha"] = sha

r = requests.put(api_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)

# ==== RESULT ====
if r.status_code in (200, 201):
    print(f"✅ CSV uploaded to GitHub at {FILE_PATH}")
else:
    print(f"❌ Error: {r.status_code}, {r.text}")

## 2. Export via local file save

In [None]:
# Replace the {ENTER_PATH_TO_FOLDER} placeholder with the folder the CSV
# should be written to
output_csv_path = '{ENTER_PATH_TO_FOLDER}/NBA25_output.csv'
df.to_csv(output_csv_path, index=False)