<h3>Importing libraries for web scraping and data processing</h3>

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


<h3>Get HTML from URL using requests</h3>

In [6]:
def get_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error fetching {url}: {response.status_code}")
        return None

<h3>Scraping top NBA players and their profile links from Basketball Reference</h3>

In [38]:
players_ranked = []
player_links = []
max_players = 60
for year in range(2020, 2026):
    current_rank = 1
    unique_names = set()
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_totals.html"
    print(url)
    page = get_html(url)
    soup = BeautifulSoup(page, "html.parser")

    season = f"{year - 1}-{str(year)[-2:]}"

    players = soup.select("th+ .left a")

    for tag in players:
        if current_rank > max_players:
            break
        name = tag.text.strip()
        if name in unique_names:
            continue

        link = "https://www.basketball-reference.com" + tag['href']

        rank = {
            "rank": current_rank,
            "name": name,
            "season": season
        }
        players_ranked.append(rank)
        player_links.append(link)
        unique_names.add(name)
        current_rank += 1

print(player_links)
print(players_ranked)
players_ranked_df = pd.DataFrame(players_ranked)
players_ranked_df.to_csv("players_ranked.csv")

https://www.basketball-reference.com/leagues/NBA_2020_totals.html
https://www.basketball-reference.com/leagues/NBA_2021_totals.html
https://www.basketball-reference.com/leagues/NBA_2022_totals.html
https://www.basketball-reference.com/leagues/NBA_2023_totals.html
https://www.basketball-reference.com/leagues/NBA_2024_totals.html
https://www.basketball-reference.com/leagues/NBA_2025_totals.html
['https://www.basketball-reference.com/players/h/hardeja01.html', 'https://www.basketball-reference.com/players/l/lillada01.html', 'https://www.basketball-reference.com/players/b/bookede01.html', 'https://www.basketball-reference.com/players/a/antetgi01.html', 'https://www.basketball-reference.com/players/y/youngtr01.html', 'https://www.basketball-reference.com/players/d/doncilu01.html', 'https://www.basketball-reference.com/players/b/bealbr01.html', 'https://www.basketball-reference.com/players/j/jamesle01.html', 'https://www.basketball-reference.com/players/m/mitchdo01.html', 'https://www.basket

<h3>Scraping NBA champions and their rosters from Basketball Reference</h3>

In [39]:
url = "https://www.basketball-reference.com/leagues/"
page = get_html(url)
soup = BeautifulSoup(page, "html.parser")
season_tag = soup.select("th a")[1:]
Champion_tag = soup.select(".left:nth-child(3) a")

champions = []
champion_links = []

for season_el, Champion_el in zip(season_tag, Champion_tag):
    season = season_el.text.strip()
    if season == "2018-19":
        break
    players_name = []
    Champion_name = Champion_el.text.strip()
    Champion_link = "https://www.basketball-reference.com" + Champion_el['href']
    page2 = get_html(Champion_link)
    soup2 = BeautifulSoup(page2, "html.parser")
    players = soup2.select(".center+ .left a")
    for tag in players:
        name = tag.text.strip()
        link = "https://www.basketball-reference.com" + tag['href']
        players_name.append(name)
        player_links.append(link)
    champion_dict = {"season": season, "Champion": Champion_name, "players": players_name}
    champions.append(champion_dict)
    champion_links.append(Champion_link)
print(champions)
champions_df = pd.DataFrame(champions)
champions_df.to_csv("champions.csv", index=False)

[{'season': '2024-25', 'Champion': 'Oklahoma City Thunder', 'players': ['Branden Carlson', 'Alex Caruso', 'Ousmane Dieng', 'Luguentz Dort', 'Alex Ducas', 'Adam Flagler', 'Shai Gilgeous-Alexander', 'Isaiah Hartenstein', 'Chet Holmgren', 'Isaiah Joe', 'Dillon Jones', 'Malevy Leons', 'Ajay Mitchell', 'Alex Reese', 'Cason Wallace', 'Aaron Wiggins', 'Jalen Williams', 'Jaylin Williams', 'Kenrich Williams']}, {'season': '2023-24', 'Champion': 'Boston Celtics', 'players': ['Dalano Banton', 'Oshae Brissett', 'Jaylen Brown', 'JD Davison', 'Sam Hauser', 'Jrue Holiday', 'Al Horford', 'Luke Kornet', 'Svi Mykhailiuk', 'Drew Peterson', 'Kristaps PorziÅ\x86Ä£is', 'Payton Pritchard', 'Neemias Queta', 'Jaden Springer', 'Lamar Stevens', 'Jayson Tatum', 'Xavier Tillman Sr.', 'Jordan Walsh', 'Derrick White']}, {'season': '2022-23', 'Champion': 'Denver Nuggets', 'players': ['Christian Braun', 'Bruce Brown', 'Thomas Bryant', 'Kentavious Caldwell-Pope', 'Vlatko Ä\x8canÄ\x8dar', 'Aaron Gordon', 'Jeff Green', '

In [40]:
player_links = list(set(player_links))
len(player_links)

215

<h3>Extracting player profile data from Basketball Reference</h3>

In [110]:
def extract_player_info(url):
    country_map = {
        "us": "United States", "ca": "Canada", "fr": "France", "de": "Germany", "it": "Italy",
        "es": "Spain", "br": "Brazil", "ar": "Argentina", "au": "Australia", "cn": "China",
        "jp": "Japan", "ir": "Iran", "ru": "Russia", "gb": "United Kingdom", "nl": "Netherlands",
        "tr": "Turkey", "gr": "Greece", "mx": "Mexico", "pt": "Portugal", "se": "Sweden",
        "ch": "Switzerland", "pl": "Poland", "fi": "Finland", "dk": "Denmark", "no": "Norway",
        "be": "Belgium", "ie": "Ireland", "cz": "Czech Republic", "at": "Austria", "hu": "Hungary",
        "za": "South Africa", "ro": "Romania", "bg": "Bulgaria", "rs": "Serbia", "ua": "Ukraine",
        "sk": "Slovakia", "si": "Slovenia", "hr": "Croatia", "lt": "Lithuania", "lv": "Latvia",
        "ee": "Estonia", "nz": "New Zealand", "in": "India", "pk": "Pakistan", "id": "Indonesia",
        "my": "Malaysia", "ph": "Philippines", "kr": "South Korea", "sa": "Saudi Arabia",
        "ae": "United Arab Emirates", "eg": "Egypt", "ma": "Morocco", "tn": "Tunisia",
        "dz": "Algeria", "ng": "Nigeria", "ke": "Kenya", "gh": "Ghana", "th": "Thailand",
        "vn": "Vietnam", "bd": "Bangladesh", "il": "Israel", "iq": "Iraq", "sy": "Syria",
        "af": "Afghanistan", "np": "Nepal", "lk": "Sri Lanka", "kz": "Kazakhstan",
        "uz": "Uzbekistan", "ge": "Georgia", "az": "Azerbaijan", "am": "Armenia", "cm": "Cameroon",
        "ba": "Bosnia and Herzegovina"
    }

    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    meta = soup.select_one("#meta")
    paragraphs = meta.select("p") if meta else []

    name_tag = meta.select_one("h1 span") if meta else None
    name = name_tag.text.strip() if name_tag else None

    birth_dt = None
    death_dt = None
    age = None
    status = "Alive"
    height = None
    weight = None
    positions = None
    shoots = None
    team = "Free Agent"
    nationality = None
    experience = None
    pts = None

    for p in paragraphs:
        text = p.text.strip()

        birth_tag = p.select_one("#necro-birth")
        if birth_tag:
            birth = birth_tag.get("data-birth")
            if birth:
                try:
                    birth_dt = datetime.strptime(birth, "%Y-%m-%d")
                except:
                    birth_dt = None

        death_tag = p.select_one("#necro-death")
        if death_tag:
            death = death_tag.get("data-death")
            if death:
                try:
                    death_dt = datetime.strptime(death, "%Y-%m-%d")
                    status = "Deceased"
                except:
                    death_dt = None

        if birth_dt:
            help_date = death_dt if death_dt else datetime.today()
            age = help_date.year - birth_dt.year - ((help_date.month, help_date.day) < (birth_dt.month, birth_dt.day))

        match = re.search(r"\((\d+)cm,\s*(\d+)kg\)", text)
        if match:
            height = int(match.group(1))
            weight = int(match.group(2))

        if "Position:" in text and "Shoots:" in text:
            p_match = re.search(r"Position:\s*(.+?)\s*▪", text)
            if p_match:
                raw = p_match.group(1).replace(" and ", ",")
                positions = [x.strip() for x in raw.split(",")]
            shoot_match = re.search(r"Shoots:\s*(\w+)", text)
            if shoot_match:
                shoots = shoot_match.group(1)

        if "Team" in text:
            a_tag = p.find("a")
            if a_tag:
                team = a_tag.text.strip()

        n_link = p.find("a", href=re.compile(r"country="))
        if n_link:
            href = n_link.get("href", "")
            match = re.search(r"country=([A-Za-z]+)", href)
            if match:
                code = match.group(1).lower()
                nationality = country_map.get(code, code.capitalize())

        if "Experience" in text:
            exp_match = re.search(r"Experience:\s*(\d+)\s*year", text)
            if exp_match:
                experience = int(exp_match.group(1))

        if "Career Length" in text and experience is None:
            text = text.replace("\xa0", " ")
            career_match = re.search(r"Career Length:\s*(\d+)\s*year", text)
            if career_match:
                experience = int(career_match.group(1))

    stats_section = soup.select_one(".stats_pullout")
    if stats_section:
        pts_block = stats_section.find("strong", string="PTS")
        if pts_block:
            parent_div = pts_block.find_parent("div")
            if parent_div:
                p_tags = parent_div.find_all("p")
                if len(p_tags) >= 2:
                    try:
                        pts = float(p_tags[1].text.strip())  # مقدار دوم یعنی Career
                    except:
                        pts = None

    return {
        "Name": name,
        "Birth_date": birth_dt.strftime("%Y-%m-%d") if birth_dt else None,
        "Death_date": death_dt.strftime("%Y-%m-%d") if death_dt else None,
        "Age": age,
        "Status": status,
        "Height": height,
        "Weight": weight,
        "Nationality": nationality,
        "Positions": positions,
        "Shoots": shoots,
        "Current_team": team,
        "Experience": experience,
        "PTS": pts
    }

In [116]:
players_info = []
driver = webdriver.Chrome()
for li in player_links:
    player_info = extract_player_info(li)
    players_info.append(player_info)
    print(player_info)
players_info_df = pd.DataFrame(players_info)

{'Name': 'Tobias Harris', 'Birth_date': '1992-07-15', 'Death_date': None, 'Age': 33, 'Status': 'Alive', 'Height': 203, 'Weight': 102, 'Nationality': 'United States', 'Positions': ['Power Forward', 'Small Forward'], 'Shoots': 'Right', 'Current_team': 'Detroit Pistons', 'Experience': 14, 'PTS': 16.1}
{'Name': 'Joel Embiid', 'Birth_date': '1994-03-16', 'Death_date': None, 'Age': 31, 'Status': 'Alive', 'Height': 213, 'Weight': 127, 'Nationality': 'United States', 'Positions': ['Center'], 'Shoots': 'Right', 'Current_team': 'Philadelphia 76ers', 'Experience': 9, 'PTS': 27.7}
{'Name': 'Norman Powell', 'Birth_date': '1993-05-25', 'Death_date': None, 'Age': 32, 'Status': 'Alive', 'Height': 190, 'Weight': 97, 'Nationality': 'United States', 'Positions': ['Shooting Guard', 'Small Forward'], 'Shoots': 'Right', 'Current_team': 'Miami Heat', 'Experience': 10, 'PTS': 13.2}
{'Name': 'Jordan Walsh', 'Birth_date': '2004-03-03', 'Death_date': None, 'Age': 21, 'Status': 'Alive', 'Height': 201, 'Weight': 9

In [117]:
players_info_df.to_csv("players_info.csv", index=False)

<h3>Players on the Michael Jordan Award List at Basketball Reference</h3>

In [99]:
Michael_Jordan_Trophy = []
for year in range(2020, 2025):
    url = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    print(url)
    page = get_html(url)
    soup = BeautifulSoup(page, "html.parser")

    season = f"{year - 1}-{str(year)[-2:]}"

    players = soup.select("#mvp th+ .left a")
    current_rank = 1
    for tag in players:
        name = tag.text.strip()

        link = "https://www.basketball-reference.com" + tag['href']

        pl = {
            "rank": current_rank,
            "name": name,
            "season": season
        }
        Michael_Jordan_Trophy.append(pl)
        current_rank += 1

print(Michael_Jordan_Trophy)
Michael_Jordan_Trophy_df = pd.DataFrame(Michael_Jordan_Trophy)
Michael_Jordan_Trophy_df.to_csv("Michael_Jordan_Trophy.csv")

https://www.basketball-reference.com/awards/awards_2020.html
https://www.basketball-reference.com/awards/awards_2021.html
https://www.basketball-reference.com/awards/awards_2022.html
https://www.basketball-reference.com/awards/awards_2023.html
https://www.basketball-reference.com/awards/awards_2024.html
[{'rank': 1, 'name': 'Giannis Antetokounmpo', 'season': '2019-20'}, {'rank': 2, 'name': 'LeBron James', 'season': '2019-20'}, {'rank': 3, 'name': 'James Harden', 'season': '2019-20'}, {'rank': 4, 'name': 'Luka DonÄ\x8diÄ\x87', 'season': '2019-20'}, {'rank': 5, 'name': 'Kawhi Leonard', 'season': '2019-20'}, {'rank': 6, 'name': 'Anthony Davis', 'season': '2019-20'}, {'rank': 7, 'name': 'Chris Paul', 'season': '2019-20'}, {'rank': 8, 'name': 'Damian Lillard', 'season': '2019-20'}, {'rank': 9, 'name': 'Nikola JokiÄ\x87', 'season': '2019-20'}, {'rank': 10, 'name': 'Pascal Siakam', 'season': '2019-20'}, {'rank': 11, 'name': 'Jimmy Butler', 'season': '2019-20'}, {'rank': 12, 'name': 'Jayson Ta

<h3>Extracting basic info of NBA teams from Basketball Reference</h3>

In [113]:
def extract_basic_team_info_from_url(url):
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    meta = soup.select_one("#meta")
    paragraphs = meta.select("p") if meta else []

    team_name = None
    location = None
    founded = None
    championships = None
    playoff_appearances = None

    name_tag = meta.select_one("h1 span") if meta else None
    if name_tag:
        team_name = name_tag.text.strip()

    for p in paragraphs:
        text = p.text.strip()

        if "Location:" in text:
            location = text.replace("Location:", "").strip()

        if "Seasons:" in text:
            match = re.search(r"(\d{4}-\d{2})", text)
            if match:
                founded = match.group(1)

        if "Championships:" in text:
            match = re.search(r"Championships:\s*(\d+)", text)
            if match:
                championships = int(match.group(1))

        if "Playoff Appearances:" in text:
            match = re.search(r"Playoff Appearances:\s*(\d+)", text)
            if match:
                playoff_appearances = int(match.group(1))

    return {
        "Team": team_name,
        "Location": location,
        "Founded": founded,
        "Championships": championships,
        "Playoff_Appearances": playoff_appearances
    }

In [114]:
def get_team_links():
    url = "https://www.basketball-reference.com/teams/"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    team_tags = soup.select("#teams_active a")
    team_links = ["https://www.basketball-reference.com" + tag.get("href") for tag in team_tags if tag.get("href")]

    return team_links

In [115]:
driver = webdriver.Chrome()
team_links = get_team_links()

team_infos = []
for link in team_links:
    info = extract_basic_team_info_from_url(link)
    team_infos.append(info)
    print(info)
team_infos_df = pd.DataFrame(team_infos)
team_infos_df.to_csv("team_infos.csv")
driver.quit()

{'Team': 'Atlanta Hawks', 'Location': 'Atlanta, Georgia', 'Founded': '1949-50', 'Championships': 1, 'Playoff_Appearances': 49}
{'Team': 'Boston Celtics', 'Location': 'Boston, Massachusetts', 'Founded': '1946-47', 'Championships': 18, 'Playoff_Appearances': 62}
{'Team': 'Brooklyn Nets', 'Location': 'Brooklyn, New York', 'Founded': '1967-68', 'Championships': 2, 'Playoff_Appearances': 31}
{'Team': 'Charlotte Hornets', 'Location': 'Charlotte, North Carolina', 'Founded': '1988-89', 'Championships': 0, 'Playoff_Appearances': 10}
{'Team': 'Chicago Bulls', 'Location': 'Chicago, Illinois', 'Founded': '1966-67', 'Championships': 6, 'Playoff_Appearances': 36}
{'Team': 'Cleveland Cavaliers', 'Location': 'Cleveland, Ohio', 'Founded': '1970-71', 'Championships': 1, 'Playoff_Appearances': 25}
{'Team': 'Dallas Mavericks', 'Location': 'Dallas, Texas', 'Founded': '1980-81', 'Championships': 1, 'Playoff_Appearances': 25}
{'Team': 'Denver Nuggets', 'Location': 'Denver, Colorado', 'Founded': '1967-68', 'C