<a href="https://colab.research.google.com/github/b-harr/dmcb/blob/colab/DMCB_Python_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## scrape_bbref.py

Run this script as often as necessary to get live* statistics from BBRef

*How often the statistics are refreshed is determined by the third party (Basketball Reference), not by this script.

In [None]:
import requests
import re
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup

# Build a normalized player key that can be used to join player records across sites.
def make_player_key(name):
    """Return a lowercase, hyphen-separated ASCII key for a player's name.

    Steps, in order: accents are decomposed and any non-ASCII remainder is
    dropped, the text is lowercased and stripped of surrounding whitespace,
    runs of whitespace become single hyphens, every character that is not a
    word character or hyphen is removed (e.g., periods and apostrophes), and a
    trailing generational suffix ("-jr", "-sr", "-ii" ... "-vii") is stripped.
    """
    # Decompose accented letters so the base letter survives the ASCII encode
    decomposed = unicodedata.normalize("NFD", name)
    ascii_name = decomposed.encode("ascii", "ignore").decode("utf-8")

    key = ascii_name.lower().strip()
    # Substitutions are applied in this exact order; the suffix rule relies on
    # punctuation such as the period in "Jr." having been removed already
    substitutions = (
        (r"\s+", "-"),
        (r"[^\w-]", ""),
        (r"-(sr|jr|ii|iii|iv|v|vi|vii)$", ""),
    )
    for pattern, replacement in substitutions:
        key = re.sub(pattern, replacement, key)
    return key

# Define the URL
url = "https://www.basketball-reference.com/leagues/NBA_2025_totals.html"

# Send a GET request to the URL with a browser-like User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# The timeout keeps the script from waiting forever on an unresponsive server
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to fetch data: {response.status_code}")
    # exit() is an interactive-shell helper injected by the site module; raise
    # SystemExit directly, with a non-zero status so the failure is detectable
    raise SystemExit(1)

# Parse the HTML
soup = BeautifulSoup(response.content, "html.parser")

# Find the stats table
table = soup.find("table", {"id": "totals_stats"})
if not table:
    print("Table not found. Ensure the page structure has not changed.")
    raise SystemExit(1)

# Extract the table headers
# NOTE: from here on `headers` holds the table's column names, not the HTTP headers above
headers = [th.get_text() for th in table.find("thead").find_all("th")]
headers = headers[1:]  # Remove the first blank column header

# Extract the rows
rows = table.find("tbody").find_all("tr")
data = []
for row in rows:
    # Skip rows without data (e.g., separator rows)
    if row.find("td"):
        row_data = [td.get_text() for td in row.find_all("td")]
        data.append(row_data)

# Create a DataFrame
df = pd.DataFrame(data, columns=headers)

# Filter out 'League Average' from the 'Player' column.
# .copy() makes the result an independent frame so the column assignment below
# does not trigger pandas' SettingWithCopyWarning
df = df[df["Player"] != "League Average"].copy()

# Add 'Player Key' column by applying the make_player_key function to the 'Player' column
df["Player Key"] = df["Player"].apply(make_player_key)

# Sort by 'Player Key' and 'Team' columns
df = df.sort_values(by=["Player Key", "Team"])

# Save to CSV (quoting=1 is csv.QUOTE_ALL: every field is quoted)
output_csv = "bbref_data.csv"
df.to_csv(output_csv, index=False, quoting=1)

# Get the current datetime in the local timezone
import pytz
import datetime
timezone = pytz.timezone("America/Chicago")  # Replace with your local timezone
current_time = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S %Z%z")

# Print the completion message with timestamp and timezone
print(f"Data saved to {output_csv} at {current_time}")


## scrape_salary.py

Run this script as often as needed to get multi-year salary data for all 30 NBA teams from Spotrac.

In [None]:
import requests
import re
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup

# Spotrac URL slugs for the NBA teams whose salary tables are scraped.
# A slug is inserted into the team URL, e.g. "https://www.spotrac.com/nba/atlanta-hawks/yearly".
# Teams are processed in the order listed here.
teams = [
    "atlanta-hawks",
    "brooklyn-nets",
    "boston-celtics",
    "charlotte-hornets",
    "cleveland-cavaliers",
    "chicago-bulls",
    "dallas-mavericks",
    "denver-nuggets",
    "detroit-pistons",
    "golden-state-warriors",
    "houston-rockets",
    "indiana-pacers",
    "la-clippers",
    "los-angeles-lakers",
    "memphis-grizzlies",
    "miami-heat",
    "milwaukee-bucks",
    "minnesota-timberwolves",
    "new-york-knicks",
    "new-orleans-pelicans",
    "oklahoma-city-thunder",
    "orlando-magic",
    "philadelphia-76ers",
    "phoenix-suns",
    "portland-trail-blazers",
    "san-antonio-spurs",
    "sacramento-kings",
    "toronto-raptors",
    "utah-jazz",
    "washington-wizards",
]

# Build a normalized player key that can be used to join player records across sites.
def make_player_key(name):
    """Return a lowercase, hyphen-separated ASCII key for a player's name.

    Steps, in order: accents are decomposed and any non-ASCII remainder is
    dropped, the text is lowercased and stripped of surrounding whitespace,
    runs of whitespace become single hyphens, every character that is not a
    word character or hyphen is removed (e.g., periods and apostrophes), and a
    trailing generational suffix ("-jr", "-sr", "-ii" ... "-vii") is stripped.
    """
    # Decompose accented letters so the base letter survives the ASCII encode
    decomposed = unicodedata.normalize("NFD", name)
    ascii_name = decomposed.encode("ascii", "ignore").decode("utf-8")

    key = ascii_name.lower().strip()
    # Substitutions are applied in this exact order; the suffix rule relies on
    # punctuation such as the period in "Jr." having been removed already
    substitutions = (
        (r"\s+", "-"),
        (r"[^\w-]", ""),
        (r"-(sr|jr|ii|iii|iv|v|vi|vii)$", ""),
    )
    for pattern, replacement in substitutions:
        key = re.sub(pattern, replacement, key)
    return key

# Turn a Spotrac team URL into a display name,
# e.g. ".../nba/san-antonio-spurs/yearly" -> "San Antonio Spurs".
def clean_team_name(url):
    """Return a human-readable team name derived from a Spotrac team URL.

    The team identifier is taken from the second-to-last "/"-separated segment
    of the URL, split on hyphens, and each piece is formatted:
    "la" becomes "LA" (as in "LA Clippers"), purely alphabetic pieces are
    capitalized, and anything else (e.g., "76ers") is kept unchanged.
    """
    team_slug = url.split("/")[-2]

    words = []
    for piece in team_slug.split("-"):
        if piece.lower() == "la":
            # The abbreviation is written fully in upper case
            words.append(piece.upper())
        elif piece.isalpha():
            words.append(piece.capitalize())
        else:
            # Pieces containing digits or other characters are left as they are
            words.append(piece)
    return " ".join(words)

# Rows collected across all teams during scraping (one list per player row)
all_data = []

# Destination CSV file for the scraped salary data
output_csv = "salary_data.csv"

# Function to extract dynamic season headers from a team's salary table
# This ensures the script captures season columns dynamically
def extract_season_headers(teams, timeout=30):
    """Return the season column headers from the first team page that provides them.

    Team pages are tried in the order given. A page is skipped when the request
    fails, the status code is not 200, no table is present, the table has no
    row, or none of the header cells in its first row looks like a season.

    Args:
        teams: Iterable of Spotrac team identifiers (e.g., "atlanta-hawks").
        timeout: Seconds to wait for each HTTP response before giving up on
            that team and moving to the next one.

    Returns:
        A list of header strings in "YYYY-YY" format, in table order, or an
        empty list when no team page yielded any.
    """
    # Compiled once; applied to every header cell of every page tried
    season_pattern = re.compile(r"^\d{4}-\d{2}$")

    for team in teams:
        url = f"https://www.spotrac.com/nba/{team}/yearly"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # A network problem with one team should not abort the search;
            # fall through to the next team exactly like a non-200 response
            print(f"Request failed for {url}: {exc}")
            continue
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.select_one("table")  # First table in the page
        if not table:
            continue
        header_row = table.find("tr")  # First row of that table
        if not header_row:
            continue

        header_texts = (th.get_text(strip=True) for th in header_row.find_all("th"))
        season_headers = [text for text in header_texts if season_pattern.match(text)]
        if season_headers:
            print(f"Season headers extracted from team: {clean_team_name(url)}")
            return season_headers

    print("Failed to extract season headers. Please check the team URLs or table structure.")
    return []  # No team page produced season headers

# Discover the season columns from the live team pages; stop if none can be found
season_headers = extract_season_headers(teams)
if not season_headers:
    raise ValueError("Season headers could not be determined. Check table structure or team data.")

# Column layout of the output file: fixed player/team columns first, then one column per season
headers = [
    "Player",
    "Player Link",
    "Player Key",
    "Team",
    "Team Link",
    "Position",
    "Age",
    *season_headers,
]
# Start the output file fresh with only the header row (quoting=1 is csv.QUOTE_ALL)
header_only_frame = pd.DataFrame(columns=headers)
header_only_frame.to_csv(output_csv, index=False, mode="w", encoding="utf-8", quoting=1)

# Loop through each team to scrape data
total_teams = len(teams)
# Matches dollar amounts such as "$1,234,567" or "$1,234,567.00"; compiled once for all cells
salary_pattern = re.compile(r"\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?")

for idx, team in enumerate(teams):
    url = f"https://www.spotrac.com/nba/{team}/yearly"  # Construct the team's URL
    team_name = clean_team_name(url)  # Extract and clean the team name
    progress = f"{idx + 1}/{total_teams} teams ({((idx + 1) / total_teams) * 100:.2f}%)"

    # Fetch the team page. The timeout prevents an unresponsive server from
    # stalling the run, and a failed request skips this team instead of
    # aborting the remaining ones.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Skipped {progress}: {team_name} (request failed: {exc})")
        continue
    if response.status_code != 200:
        # Report the failure rather than announcing the team as processed
        print(f"Skipped {progress}: {team_name} (HTTP {response.status_code})")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.select_one("table")  # Locate the salary table (first table in the page)
    if not table:
        print(f"Skipped {progress}: {team_name} (no table found)")
        continue

    for row in table.find_all("tr")[1:]:  # Skip the header row
        cols = row.find_all("td")  # Extract all columns for the row
        if not cols:
            continue  # No cells, so there is no player to record

        player_name_tag = cols[0].find("a")  # Find the player link in the first column
        if not player_name_tag:
            continue
        player_name = player_name_tag.get_text(strip=True)
        if not player_name:
            continue  # Only rows with a player name are saved

        # .get avoids a KeyError on an anchor without an href attribute
        player_link = player_name_tag.get("href", "")
        player_key = make_player_key(player_name)  # Generate the player key
        position = cols[1].get_text(strip=True) if len(cols) > 1 else ""
        age = cols[2].get_text(strip=True) if len(cols) > 2 else ""

        # Extract salary data from the remaining columns.
        # NOTE(review): a cell with neither a keyword nor a dollar amount adds
        # nothing, so later values would shift one season to the left; confirm
        # against the live table that such cells only occur at the end of a row.
        season_values = []
        for col in cols[3:]:
            cell_text = col.get_text(strip=True)
            if "Two-Way" in cell_text:
                season_values.append("Two-Way")
            elif "UFA" in cell_text:
                season_values.append("UFA")
            elif "RFA" in cell_text:
                season_values.append("RFA")
            else:  # Extract numeric salary values
                season_values.extend(salary_pattern.findall(cell_text))

        # Combine all collected data into a single row
        record = [player_name, player_link, player_key, team_name, url, position, age] + season_values
        record += [""] * (len(headers) - len(record))  # Pad short rows to the header length

        # Keep the row in memory for the final sort and append it to the CSV
        # immediately so partial results survive an interrupted run
        all_data.append(record)
        pd.DataFrame([record], columns=headers).to_csv(output_csv, index=False, mode="a", header=False, encoding="utf-8", quoting=1)

    print(f"Processed {progress}: {team_name}")

# Order the collected rows by player key (third field of each row), ignoring case
sorted_data = list(all_data)
sorted_data.sort(key=lambda record: record[2].lower())
# Replace the incrementally written CSV with the sorted version
sorted_frame = pd.DataFrame(sorted_data, columns=headers)
sorted_frame.to_csv(output_csv, index=False, mode="w", encoding="utf-8", quoting=1)

# Build a timestamp in the configured local timezone
import pytz
import datetime
timezone = pytz.timezone("America/Chicago")  # Replace with your local timezone
local_now = datetime.datetime.now(timezone)
current_time = local_now.strftime("%Y-%m-%d %H:%M:%S %Z%z")

# Report where the data was written and when
print(f"Data saved to {output_csv} at {current_time}")


## scrape_signed.py

Run this script infrequently, on a local machine. It loops through all active players and scrapes each player's individual page for details on how they were signed.

In [None]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

# Load the salary data from the input CSV into a pandas DataFrame
input_csv = "salary_data.csv"
salary_data = pd.read_csv(input_csv)

# Keep only rows whose 2024-25 value is neither "Two-Way" nor "-"
# NOTE(review): the season column name is hard-coded; confirm it matches the
# season columns present in the input file
current_season = salary_data["2024-25"]
is_two_way = current_season == "Two-Way"
is_dash = current_season == "-"
active_data = salary_data[~(is_two_way | is_dash)]

# One link per distinct (player link, player key) pair, ordered by player key
deduplicated = active_data.drop_duplicates(subset=["Player Link", "Player Key"])
ordered_by_key = deduplicated.sort_values(by="Player Key")
unique_links = ordered_by_key["Player Link"].tolist()

# Minor words that are written in lowercase unless they are the first word of the phrase
minor_words = {"and", "or", "the", "in", "at", "for", "to", "by", "with", "a", "an", "of", "on"}

# Title-cases a "Signed Using" value and restores the hyphen after the prefixes Non/Mid/Bi
def format_signed(text):
    """Return *text* in title case with normalized hyphenation.

    The text is split on single hyphens and whitespace characters, and the
    pieces are rejoined with spaces, so hyphens are dropped except where they
    are restored below:

    * a standalone "Non", "Mid" or "Bi" is re-attached to the following word
      with a hyphen (e.g., "Non-Bird", "Mid-Level", "Bi-Annual");
    * "Sign and Trade" is rewritten as "Sign-and-Trade".

    Minor words (see ``minor_words``) are lowercased unless they are the first
    word; every other word has its first letter capitalized and the rest
    lowercased.

    Args:
        text: The raw value, or None.

    Returns:
        The formatted string, or None when *text* is None.
    """
    if text is None:
        return None

    formatted_words = []
    for i, word in enumerate(re.split(r"[-\s]", text)):
        lowered = word.lower()
        if i > 0 and lowered in minor_words:
            # Lowercasing (rather than keeping the input casing) lets inputs
            # such as "Sign-And-Trade" reach the special case below
            formatted_words.append(lowered)
        else:
            # Covers the first word, the Non/Mid/Bi prefixes and all other words
            formatted_words.append(word.capitalize())

    # Join the formatted words into a single string
    formatted = " ".join(formatted_words)

    # Replace the space after a standalone "Non", "Mid" or "Bi" with a hyphen
    formatted = re.sub(r"(Non|Mid|Bi)\s", r"\1-", formatted)

    # Special case: "Sign and Trade" is written with hyphens
    formatted = formatted.replace("Sign and Trade", "Sign-and-Trade")

    return formatted

# Function to scrape player data from the player's individual page
def scrape_player_data(player_link, player_key, player_name, timeout=30):
    """Fetch a player's page and return their "Signed Using" contract detail.

    Scraping is best-effort: any error while fetching or parsing the page is
    reported on stdout and results in a "Signed Using" value of None, so a
    single bad page does not stop a long run.

    Args:
        player_link: URL of the player's individual page.
        player_key: Player key, copied into the result unchanged.
        player_name: Player name, copied into the result unchanged.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys "Player", "Player Link", "Player Key" and
        "Signed Using". "Signed Using" is the value formatted by
        format_signed, or None when it could not be found or an error occurred.
    """
    # CSS selector to find the "Signed Using" label in the contract details
    signed_using_selector = "#contracts > div > div > div.contract-wrapper.mb-5 > div.contract-details.row.m-0 > div:nth-child(5) > div.label"

    cleaned_value = None
    try:
        # The timeout keeps one unresponsive page from stalling the whole run
        page = requests.get(player_link, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")  # Parse the HTML content of the page

        signed_using_element = soup.select_one(signed_using_selector)
        # The actual value is in the element that follows the label
        value_element = signed_using_element.find_next_sibling() if signed_using_element else None
        if value_element is not None:
            cleaned_value = format_signed(value_element.get_text().strip())
    except Exception as exc:
        # Deliberately broad to keep the run going, but say why the value is
        # missing instead of failing silently
        print(f"Warning: could not scrape 'Signed Using' for {player_name} ({player_link}): {exc}")
        cleaned_value = None

    return {
        "Player": player_name,
        "Player Link": player_link,
        "Player Key": player_key,
        "Signed Using": cleaned_value
    }

# Destination file for the scraped contract details
output_csv = "signed_data.csv"
# Start the file with only the header row; player rows are appended one at a time below
header_frame = pd.DataFrame(columns=["Player", "Player Link", "Player Key", "Signed Using"])
header_frame.to_csv(output_csv, index=False, mode="w", encoding="utf-8", quoting=1)

# Visit each unique player link, scrape it, and append the result to the CSV
total_links = len(unique_links)
for count, link in enumerate(unique_links, start=1):
    # The first active_data row with this link supplies the player's key and name
    matching_rows = active_data[active_data["Player Link"] == link]
    player_key = matching_rows["Player Key"].values[0]
    player_name = matching_rows["Player"].values[0]

    # Scrape the contract detail and write it out immediately (no header row)
    scraped_row = scrape_player_data(link, player_key, player_name)
    row_frame = pd.DataFrame([scraped_row])
    row_frame.to_csv(output_csv, mode="a", header=False, index=False, encoding="utf-8", quoting=1)

    # Report progress after each player
    print(f"Processed {count}/{total_links} players ({(count / total_links) * 100:.2f}%): {player_name}")

# Build a timestamp in the configured local timezone
import pytz
import datetime
timezone = pytz.timezone("America/Chicago")  # Replace with your local timezone
local_now = datetime.datetime.now(timezone)
current_time = local_now.strftime("%Y-%m-%d %H:%M:%S %Z%z")

# Report where the data was written and when
print(f"Data saved to {output_csv} at {current_time}")
