In [16]:
import pandas as pd
import time
import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Player data

In [None]:
# Start Microsoft Edge through the msedgedriver binary at a fixed local path.
# The path is a raw string: "\W", "\S" and "\m" are not valid escape sequences,
# so a plain literal only keeps its backslashes by accident and triggers a
# SyntaxWarning on Python 3.12+.
driverService = webdriver.EdgeService(r"C:\Windows\System32\msedgedriver.exe")
driver = webdriver.Edge(service = driverService)

In [8]:
# Open the NBA.com traditional player stats page (query string: PerMode=PerGame, sort=PTS, dir=-1).
driver.get(url="https://www.nba.com/stats/players/traditional?PerMode=PerGame&sort=PTS&dir=-1")

In [112]:
# The button inside the first "Crom_base__f0niE" container is clicked before the
# glossary entries are read (presumably it expands the glossary - confirm).
crom_containers = driver.find_elements(By.CLASS_NAME, "Crom_base__f0niE")
glossaryButton = crom_containers[0].find_element(By.TAG_NAME, "button")
glossaryButton.click()

# Terms and their descriptions are collected as plain text, in page order.
acronyms = [
    term.text
    for term in driver.find_elements(By.CLASS_NAME, "StatsTableGlossary_dt__GPooh")
]
definitions = [
    description.text
    for description in driver.find_elements(By.CLASS_NAME, "StatsTableGlossary_dd__zcr38")
]

# One row per glossary entry; "Type" is the constant "Player" on every row.
glossary = pd.DataFrame(
    {
        "Type": "Player",
        "Variable": acronyms,
        "Definition": definitions,
    }
)

In [3]:
def readTable(tableObj):
    """Convert a rendered HTML table element into a DataFrame of cell text.

    The first ``<tr>`` supplies the column names: every ``<th>`` whose stripped
    text is non-empty. Each remaining ``<tr>`` supplies one data row, built from
    its ``<td>`` cells after skipping the first cell and dropping cells whose
    text is empty.

    Each element's ``.text`` is read exactly once and reused for both the
    emptiness check and the stored value, so every cell costs one lookup
    instead of two.

    Args:
        tableObj: Object exposing ``find_elements(By.TAG_NAME, ...)``, e.g. a
            Selenium WebElement for the table.

    Returns:
        pandas.DataFrame holding the cell text as strings, one row per body row.

    Raises:
        IndexError: If the table contains no ``<tr>`` elements at all.
    """
    rows = tableObj.find_elements(By.TAG_NAME, "tr")
    headerRow, bodyRows = rows[0], rows[1:]

    # Read each header text once, then keep only the non-blank ones.
    headerTexts = [th.text for th in headerRow.find_elements(By.TAG_NAME, "th")]
    colNames = [text for text in headerTexts if len(text.strip()) > 0]

    tableData = []
    for row in tqdm.tqdm(bodyRows):
        # [1:] skips the first <td> of every row; empty-text cells are dropped.
        cellTexts = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")[1:]]
        tableData.append([text for text in cellTexts if len(text) > 0])

    return pd.DataFrame(tableData, columns=colNames)

In [None]:
# Scrape one player table per option of the season dropdown (the first
# "DropDown_select__4pIg9" element on the page).
season_select_dd = Select(driver.find_elements(By.CLASS_NAME, "DropDown_select__4pIg9")[0])
seasons = [option.text for option in season_select_dd.options]

season_tables = []
for season in seasons:
    # Re-locate the season dropdown on every pass, as the team-stats loop in this
    # notebook does: a handle captured before the page re-renders can go stale.
    season_select_dd = Select(driver.find_elements(By.CLASS_NAME, "DropDown_select__4pIg9")[0])
    season_select_dd.select_by_visible_text(season)

    time.sleep(10)  # fixed wait for the selected season to load

    # The last dropdown with this class is used as the page selector; its first
    # option is chosen (presumably "All", so every row is shown - confirm).
    page_select_dd = Select(driver.find_elements(By.CLASS_NAME, "DropDown_select__4pIg9")[-1])
    page_select_dd.select_by_index(0)

    time.sleep(10)  # fixed wait for the table to re-render after the page change

    # Look the table up only after the wait, so the element that gets read is the
    # one that exists once the page has settled rather than a pre-render handle.
    player_table_element = driver.find_element(By.CLASS_NAME, "Crom_table__p1iZz")

    player_table = readTable(player_table_element)
    season_tables.append(player_table)

In [372]:
def create_season_column(df, season):
    """Return a copy of ``df`` with a ``Season`` column set to ``season``.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes the frames it was given.

    Args:
        df: pandas.DataFrame to label.
        season: Value stored in every row of the new ``Season`` column.

    Returns:
        A new pandas.DataFrame with ``Season`` added (or replaced, if present).
    """
    return df.assign(Season=season)

In [93]:
# Label every scraped table with the start year of the season it was scraped for.
# `seasons` holds the season dropdown labels collected by the scraping cell, in
# the same order as `season_tables`, so the labels come from the data itself
# instead of a hard-coded year range that has to match the scrape by luck.
# The text before the first "-" of each label is used as the year (the same
# convention the team-stats section of this notebook applies to these labels).
season_start_years = [int(label.split("-")[0]) for label in seasons]

# zip() stops at the shorter input, so a partially completed scrape still works.
# The first column of each table is dropped before the Season column is added.
playerdf = pd.concat(
    [
        df.iloc[:, 1:].assign(Season=start_year)
        for df, start_year in zip(season_tables, season_start_years)
    ]
)
playerdf.to_csv("./data/playerTradStats.csv", index = False)

In [126]:
# quit() ends the whole WebDriver session and shuts the driver process down;
# close() would only close the current browser window.
driver.quit()

# Team Data

In [4]:
# Start a fresh Edge session for the team-stats scrape.
# The path is a raw string: "\W", "\S" and "\m" are not valid escape sequences,
# so a plain literal only keeps its backslashes by accident and triggers a
# SyntaxWarning on Python 3.12+.
driverService = webdriver.EdgeService(r"C:\Windows\System32\msedgedriver.exe")
driver = webdriver.Edge(service = driverService)
# The player stats page is opened first; the next cells read the season dropdown from it.
driver.get(url="https://www.nba.com/stats/players/traditional?PerMode=PerGame&sort=PTS&dir=-1")

In [5]:
def typeButtonClick():
    """Click the button of the third quick-nav selector, then pause one second.

    Uses the module-level ``driver``. The third element with class
    "StatsQuickNavSelector_nav__JzoME" is taken and its <button> is clicked.
    """
    navSelectors = driver.find_elements(By.CLASS_NAME, "StatsQuickNavSelector_nav__JzoME")
    thirdSelector = navSelectors[2]
    thirdSelector.find_element(By.TAG_NAME, "button").click()
    time.sleep(1)

def retTypeList(closeMenu = True):
    """Return the <li> elements of the third quick-nav list.

    The selector button is clicked first via ``typeButtonClick``; the entries
    are then read from the third element with class
    "StatsQuickNavSelector_list__nb3l1".

    Args:
        closeMenu: When true, the selector button is clicked a second time
            before returning.

    Returns:
        The list of <li> elements found in that list.
    """
    typeButtonClick()
    navLists = driver.find_elements(By.CLASS_NAME, "StatsQuickNavSelector_list__nb3l1")
    entries = navLists[2].find_elements(By.TAG_NAME, "li")
    if not closeMenu:
        return entries
    typeButtonClick()
    return entries

def pickType(typeIndex):
    """Click the quick-nav entry at position ``typeIndex``, then pause four seconds.

    The entries come from ``retTypeList(closeMenu = False)``, i.e. the menu is
    not toggled a second time before the entry is clicked.
    """
    entries = retTypeList(closeMenu = False)
    chosenEntry = entries[typeIndex]
    chosenEntry.click()
    time.sleep(4)

In [6]:
# Disabled alternative: read the stat-type names from the quick-nav menu on the page.
# typeList = retTypeList(closeMenu = False)
# typeNames = [t.text for t in typeList]
# typeButtonClick()

# Stat types to scrape. Each name is interpolated into the team-stats URL and
# into the output CSV file name by the cells below.
typeNames = ["advanced", "opponent"]

In [9]:
# Season labels are read once, from the season dropdown of the page that is
# open when this cell starts.
season_select_dd = Select(driver.find_elements(By.CLASS_NAME, "DropDown_select__4pIg9")[0])
seasons = [option.text for option in season_select_dd.options]

# allTables maps stat type -> {season label -> DataFrame of that season's team table}.
allTables = {}
for typeName in typeNames:

    driver.get(url=f"https://www.nba.com/stats/teams/{typeName}?PerMode=Totals")

    time.sleep(5)  # fixed wait for the team page to load

    season_tables = {}
    for season in seasons:
        # The dropdown is looked up again on every pass because the page has
        # been reloaded / re-rendered since the previous lookup.
        season_select_dd = Select(driver.find_elements(By.CLASS_NAME, "DropDown_select__4pIg9")[0])
        season_select_dd.select_by_visible_text(season)

        time.sleep(10)  # fixed wait for the selected season to load

        team_table_element = driver.find_element(By.CLASS_NAME, "Crom_table__p1iZz")
        season_tables[season] = readTable(team_table_element)

    allTables[typeName] = season_tables

100%|██████████| 30/30 [00:20<00:00,  1.48it/s]
100%|██████████| 30/30 [00:19<00:00,  1.56it/s]
100%|██████████| 30/30 [00:21<00:00,  1.41it/s]
100%|██████████| 30/30 [00:18<00:00,  1.60it/s]
100%|██████████| 30/30 [00:16<00:00,  1.83it/s]
100%|██████████| 30/30 [00:16<00:00,  1.80it/s]
100%|██████████| 30/30 [00:16<00:00,  1.81it/s]
100%|██████████| 30/30 [00:16<00:00,  1.77it/s]
100%|██████████| 30/30 [00:17<00:00,  1.75it/s]
100%|██████████| 30/30 [00:18<00:00,  1.62it/s]
100%|██████████| 30/30 [00:16<00:00,  1.79it/s]
100%|██████████| 30/30 [00:18<00:00,  1.66it/s]
100%|██████████| 30/30 [00:17<00:00,  1.76it/s]
100%|██████████| 30/30 [00:20<00:00,  1.46it/s]
100%|██████████| 30/30 [00:23<00:00,  1.27it/s]
100%|██████████| 30/30 [00:21<00:00,  1.37it/s]
100%|██████████| 30/30 [00:18<00:00,  1.63it/s]
100%|██████████| 30/30 [00:19<00:00,  1.55it/s]
100%|██████████| 30/30 [00:17<00:00,  1.68it/s]
100%|██████████| 30/30 [00:18<00:00,  1.61it/s]
100%|██████████| 29/29 [00:16<00:00,  1.

In [13]:
def create_season_column(df, season):
    """Return a copy of ``df`` with a ``Season`` column derived from ``season``.

    A string label such as "2022-23" is reduced to the text before its first
    "-" ("2022"). Any non-string value (e.g. an int year) is stored unchanged,
    so calls written for the earlier definition of this function in the
    notebook keep working after this one replaces it.

    The input frame is left untouched.

    Args:
        df: pandas.DataFrame to label.
        season: Season label (str) or any other value to store as-is.

    Returns:
        A new pandas.DataFrame with ``Season`` added (or replaced, if present).
    """
    seasonValue = season.split("-")[0] if isinstance(season, str) else season
    return df.assign(Season=seasonValue)

# advanceddf = pd.concat([create_season_column(df, season).iloc[:,1:] for season, df in allTables["Advanced"].items()])
# advanceddf.to_csv("./data/teamAdvStats.csv", index = False)

In [14]:
# Write one CSV per stat type, combining that type's per-season tables.
for typeName in typeNames:
    teamdf = pd.concat([create_season_column(df, season) for season, df in allTables[typeName].items()])
    teamdf.to_csv(f"./data/team{typeName}Stats.csv", index = False)

# quit() ends the whole WebDriver session and shuts the driver process down;
# close() would only close the current browser window.
driver.quit()

# All-NBA team awards

In [1]:
import requests
import bs4
import re

In [2]:
# Download the All-NBA teams history page. The timeout stops the cell from
# hanging forever, and raise_for_status() fails loudly on an HTTP error instead
# of silently parsing an error page.
awardsResponse = requests.get("https://www.nba.com/news/history-all-nba-teams", timeout=30)
awardsResponse.raise_for_status()

# The parser is named explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed (and no "guessed parser" warning is raised).
nbaAwardsPage = bs4.BeautifulSoup(awardsResponse.text, "html.parser")

In [11]:
def isSeason(strObj):
    """Return True when ``strObj`` begins with a season label such as "2022-23".

    Only the start of the string is checked (four digits, "-", two digits);
    anything may follow.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on Python 3.12+.
    return re.match(r"\d{4}-\d{2}", strObj) is not None

def isAwardTitle(strObj):
    """Return True when ``strObj`` begins with capital letters followed by " TEAM".

    Only the start of the string is checked, e.g. "FIRST TEAM" or "SECOND TEAM:".
    """
    titleMatch = re.match("[A-Z]+ TEAM", strObj)
    return titleMatch is not None

def isPlayerTeam(htmlObj):
    """Return True when ``htmlObj`` looks like a "player, team" paragraph.

    Two conditions must both hold:
      * the serialized markup (``htmlObj.decode()``) contains exactly two
        attribute-free paragraph tags matched by "</*p>" (i.e. "<p>" / "</p>");
      * the text (``htmlObj.text``) starts with an optional "<capital letter>:"
        prefix, followed by a run without parentheses that reaches a comma.
    """
    paragraphTagCount = len(re.findall("</*p>", htmlObj.decode()))
    hasPlayerTeamShape = re.match("^([A-Z]:){0,1}[^()]*,.*", htmlObj.text) is not None
    return paragraphTagCount == 2 and hasPlayerTeamShape

In [12]:
# Keep only the <h3> headings whose text begins with a season label (see isSeason).
seasons = list(filter(lambda heading: isSeason(heading.text), nbaAwardsPage.find_all("h3")))

In [13]:
# Build {season heading text -> {award title -> [player/team lines]}} by walking
# the elements that follow each season heading until the next season heading.
dataDict = {}
for season in seasons:
    seasonDict = {}

    awardTitle = None
    playerTeam = []

    for child in season.find_next_siblings():

        if isAwardTitle(child.text):
            # A new award title closes the previous award's list of lines.
            if awardTitle is not None:
                seasonDict[awardTitle] = playerTeam
                playerTeam = []
            awardTitle = child.text.replace(":", "")

        if isPlayerTeam(child):
            playerTeam.append(child.text)

        if isSeason(child.text):
            # Reached the next season's heading: this season's section is over.
            break

    # Store the award that was still open when the section ended, whether the
    # loop stopped at the next season heading or ran out of siblings. Doing this
    # once after the loop avoids using the loop variable as a "no break"
    # sentinel, which was unbound or stale when a heading had no siblings.
    # Nothing is stored when no award title and no lines were found, so no
    # empty entry keyed by None is created.
    if awardTitle is not None or playerTeam:
        seasonDict[awardTitle] = playerTeam

    dataDict[season.text] = seasonDict

In [28]:
# Flatten dataDict into one row per player/team line:
#   from_dict(orient="index")    -> one row per dataDict key (season heading text), one column per award title
#   stack()                      -> Series indexed by (season, award title) whose values are the lists of lines
#   explode()                    -> one entry per line in each list
#   str.split(",", expand=True)  -> the comma-separated pieces of each line, in integer-named columns 0, 1, ...
awardsdf = pd.DataFrame.from_dict(dataDict, orient = "index").stack().explode().str.split(",", expand = True)
# Column 0 is matched against an optional "<capital letter>:" prefix followed by the rest of the text;
# capture groups 1 (the letter) and 2 (the rest) are written to "Position" and back into column 0.
# NOTE(review): the .rename(...) on the extracted frame looks redundant, since the target column
# names are already given on the left-hand side - confirm.
awardsdf[["Position", 0]] = awardsdf[0].str.extract("(([A-Z]):){0,1}(.*)")[[1, 2]].rename({1: "Position", 2: "Player"}, axis = 1)
# Give the first two split pieces meaningful names.
awardsdf = awardsdf.rename({0: "Player", 1: "Team"}, axis = 1)
# Trim the whitespace left around the pieces by the split.
awardsdf["Player"] = awardsdf.Player.str.strip()
awardsdf.Team = awardsdf.Team.str.strip()
# index=False is not passed here, so the (season, award title) index is written to the file as well.
awardsdf.to_csv("./data/awards.csv")

# Coach data

In [1]:
import pandas as pd

In [2]:
# Load the first table found on the coaches page whose URL contains "NBA_2023".
# NOTE(review): `coachdf` is reassigned further down from pd.concat(coachdfs), so this
# looks like an exploratory preview of a single season - confirm it is still needed.
link = f"https://www.basketball-reference.com/leagues/NBA_{2023}_coaches.html#NBA_coaches"
coachdf = pd.read_html(link)[0]

In [18]:
def getTable(season):
    """Download and tidy the coaches table for one season.

    The page requested is the one whose URL contains ``NBA_{season + 1}``. The
    first table on that page is read, 14 of its columns are picked by position
    and renamed, ``P_W_Perc`` is added as ``P_W_Overall / P_G_Overall``, a
    ``Season`` column is set to ``season``, and ``P_W_Overall`` is dropped.

    Args:
        season: Integer year; the page for ``season + 1`` is requested.

    Returns:
        pandas.DataFrame with the renamed columns plus ``P_W_Perc`` and
        ``Season``, without ``P_W_Overall``.
    """
    link = f"https://www.basketball-reference.com/leagues/NBA_{season+1}_coaches.html#NBA_coaches"

    # Positions of the columns to keep, and the names they receive (same order).
    keptPositions = [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 23, 24]
    newNames = [
        "Coach", "TEAM",
        "N_Seasons_TEAM", "N_Seasons_Overall",
        "RS_G_Current", "RS_W_Current", "RS_L_Current",
        "RS_G_TEAM", "RS_W_TEAM", "RS_L_TEAM",
        "RS_G_Overall", "RS_W_Perc_Overall",
        "P_G_Overall", "P_W_Overall",
    ]

    coaches = pd.read_html(link)[0].iloc[:, keptPositions]
    coaches.columns = newNames
    coaches["P_W_Perc"] = coaches["P_W_Overall"] / coaches["P_G_Overall"]
    coaches["Season"] = season
    return coaches.drop(columns="P_W_Overall")

In [20]:
# One coaches table per season value from 1996 through 2022 (range end is exclusive).
coachdfs = list(map(getTable, range(1996, 2023)))

In [21]:
# Stack the per-season coach frames into a single frame.
# NOTE(review): ignore_index is not set, so each season's own row labels are kept
# and repeat across seasons - confirm that is intended.
coachdf = pd.concat(coachdfs)

In [96]:
# Write the combined coach table to disk; index=False is not passed, so the row index is written too.
coachdf.to_csv("./data/coachdf.csv")