In [47]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time

In [49]:
# Set up Chrome options
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

years = list(range(2004, 2005))

for year in years:
    
    # Initialize the driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = f"https://stats.wnba.com/players/advanced/?sort=GP&dir=-1&Season={year}&SeasonType=Regular%20Season"
    all_data = []
    previous_data_length = 0
    same_page_count = 0

    try:
        driver.get(url)
        print("Page loaded, waiting for content...")
        time.sleep(5)  # Initial load
    
        while True:
            # Wait for table to load
            table = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".nba-stat-table__overflow")))
        
            # Extract current page data
            rows = table.find_elements(By.CSS_SELECTOR, "tr")
            current_page_data = []
        
            for row in rows[1:]:  # Skip header
                cols = row.find_elements(By.CSS_SELECTOR, "td")
                if cols and len(cols) >= 7:
                    player = cols[1].text
                    team = cols[2].text
                    Games_Played = cols[4].text
                    Minutes = cols[7].text
                    Offensive_Rating = cols[8].text
                    Defensive_Rating = cols[9].text
                    AST_pct = cols[11].text
                    Assist_to_Turnover = cols[12].text
                    Assist_Ratio = cols[13].text
                    OREB_pct = cols[14].text
                    DREB_pct = cols[15].text
                    REB_pct = cols[16].text
                    Turnover_Ratio = cols[17].text
                    EFG_pct = cols[18].text
                    TS_pct = cols[19].text
                    USG_pct = cols[20].text
                    
                    current_page_data.append([player, team, Games_Played, Minutes, Offensive_Rating, Defensive_Rating, AST_pct,
                                             Assist_to_Turnover, Assist_Ratio, OREB_pct, DREB_pct, REB_pct, Turnover_Ratio, 
                                              EFG_pct, TS_pct, USG_pct])
        
            # Check if we're getting new data
            if len(current_page_data) == 0:
                print("No new data found - stopping")
                break
            
            if len(all_data) > 0 and current_page_data == all_data[-len(current_page_data):]:
                same_page_count += 1
                print(f"Same data detected {same_page_count} times")
                if same_page_count >= 3:
                    print("Same data repeated 3 times - assuming end of data")
                    break
            else:
                same_page_count = 0
        
            all_data.extend(current_page_data)
            print(f"Collected {len(all_data)} players so far...")
        
            # Try to find and click next page button
            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a[title='Next'], .stats-table-pagination__next")))
            
                if "disabled" in next_button.get_attribute("class"):
                    print("Next button disabled - end of data")
                    break
                
                # Scroll to and click the button
                driver.execute_script("arguments[0].scrollIntoView();", next_button)
                time.sleep(1)
                driver.execute_script("arguments[0].click();", next_button)
                print("Clicked next page button")
                time.sleep(3)  # Wait for next page to load
            
                # Additional check for duplicate data
                if len(all_data) == previous_data_length:
                    print("No new data after page turn - stopping")
                    break
                previous_data_length = len(all_data)
            
            except Exception as e:
                print(f"Error finding/clicking next button: {str(e)}")
                break
    
        # Save all collected data - corrected column names to match the data structure
        df = pd.DataFrame(all_data, columns=['Name', 'Team', 'Games_Played', 'Minutes', 'Offensive Rating', 'Defensive Rating', 'AST%',
                                             'Assist/Turnover', 'Assist Ratio', 'OREB%', 'DREB%', 'REB%', 'Turnover Ratio', 
                                              'EFG%', 'TS%', 'USG%'])
        df['Total Minutes'] = df['Games Played'].astype(float) * df['Minutes'].astype(float)
        df.to_csv(f"WNBA_players/wnba_players_{year}.csv", index=False)
        print(f"Successfully saved {len(df)} unique players to WNBA_players/wnba_players_{year}.csv")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        driver.save_screenshot("error_screenshot.png")
        print("Saved screenshot to error_screenshot.png")

    finally:
        driver.quit()

Page loaded, waiting for content...
Collected 50 players so far...
Clicked next page button
Collected 100 players so far...
Clicked next page button
Collected 150 players so far...
Clicked next page button
Collected 164 players so far...
Clicked next page button
Same data detected 1 times
Collected 178 players so far...
Clicked next page button
Same data detected 2 times
Collected 192 players so far...
Clicked next page button
Same data detected 3 times
Same data repeated 3 times - assuming end of data
Successfully saved 192 unique players to WNBA_players/wnba_players_2004.csv


In [43]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

# Set up Chrome options
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

years = list(range(2004, 2005))

for year in years:
    print(f"\nProcessing year: {year}")
    
    # Initialize the driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = f"https://stats.wnba.com/players/shooting/?Season={year}&SeasonType=Regular%20Season&DistanceRange=By%20Zone"
    all_data = []
    max_pages = 10  # Safety limit to prevent infinite looping
    current_page = 0

    try:
        driver.get(url)
        print("Page loaded, waiting for content...")
        
        # Wait longer for initial load
        time.sleep(5)
        
        # Function to check if we have real data
        def has_real_data(row):
            cols = row.find_elements(By.TAG_NAME, "td")
            return len(cols) > 0 and cols[0].text.strip() != ''

        # Function to process a page
        def process_page():
            table = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "nba-stat-table__overflow")))
            
            rows = table.find_elements(By.TAG_NAME, "tr")
            print(f"Found {len(rows)} rows in the table")
            
            page_data = []
            for row in rows:
                try:
                    if has_real_data(row):
                        cols = row.find_elements(By.TAG_NAME, "td")
                        if len(cols) >= 21:
                            player_data = [
                                cols[0].text.strip(),  # Player
                                cols[1].text.strip(),  # Team
                                cols[2].text.strip(),  # Age
                                # Restricted Area
                                cols[3].text.strip(),
                                cols[4].text.strip(),
                                cols[5].text.strip(),
                                # In The Paint (Non-RA)
                                cols[6].text.strip(),
                                cols[7].text.strip(),
                                cols[8].text.strip(),
                                # Mid-Range
                                cols[9].text.strip(),
                                cols[10].text.strip(),
                                cols[11].text.strip(),
                                # Left Corner 3
                                cols[12].text.strip(),
                                cols[13].text.strip(),
                                cols[14].text.strip(),
                                # Right Corner 3
                                cols[15].text.strip(),
                                cols[16].text.strip(),
                                cols[17].text.strip(),
                                # Above the Break 3
                                cols[18].text.strip(),
                                cols[19].text.strip(),
                                cols[20].text.strip()
                            ]
                            page_data.append(player_data)
                except Exception as e:
                    print(f"Error processing row: {str(e)}")
                    continue
            
            return page_data

        # Process first page
        page_data = process_page()
        all_data.extend(page_data)
        print(f"Initial page: Collected {len(page_data)} players")
        
        # Handle pagination with safety limits
        while current_page < max_pages:
            try:
                # Find all pagination buttons
                pagination = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "stats-table-pagination")))
                
                next_button = pagination.find_element(By.CSS_SELECTOR, ".stats-table-pagination__next")
                
                # Check if next button is disabled
                if "disabled" in next_button.get_attribute("class"):
                    print("Reached last page")
                    break
                
                # Scroll to and click the button
                driver.execute_script("arguments[0].scrollIntoView();", next_button)
                time.sleep(1)
                next_button.click()
                print("Clicked next page button")
                current_page += 1
                
                # Wait for new data to load
                time.sleep(3)
                
                # Process new page
                page_data = process_page()
                if not page_data:
                    print("No new data on this page - stopping")
                    break
                
                all_data.extend(page_data)
                print(f"Page {current_page}: Collected {len(page_data)} players (Total: {len(all_data)})")
                
                # Check if we're getting duplicate data (stuck in a loop)
                if len(all_data) > 100 and len(set([tuple(d) for d in all_data[-50:]])) < 5:
                    print("Detected duplicate data - stopping")
                    break
                
            except Exception as e:
                print(f"Error handling pagination: {str(e)}")
                break

        # Save data if we collected any
        if all_data:
            columns = [
                'Player', 'Team', 'Age',
                'RA_FGM', 'RA_FGA', 'RA_FG%',
                'ITP_FGM', 'ITP_FGA', 'ITP_FG%',
                'MR_FGM', 'MR_FGA', 'MR_FG%',
                'LC3_FGM', 'LC3_FGA', 'LC3_FG%',
                'RC3_FGM', 'RC3_FGA', 'RC3_FG%',
                'AB3_FGM', 'AB3_FGA', 'AB3_FG%'
            ]
            df = pd.DataFrame(all_data, columns=columns)
            
            # Filter out any completely empty rows
            df = df[df['Player'].notna() & (df['Player'] != '')]
            
            # Remove duplicates
            df = df.drop_duplicates()
            
            print(f"Final dataset: {len(df)} unique players")
            
            # Save to CSV
            df.to_csv(f"WNBA_shooting_location/wnba_players_{year}_shooting_location.csv", index=False)
            print(f"Saved data for {year}")
        else:
            print(f"No data collected for {year}")

    except Exception as e:
        print(f"An error occurred for year {year}: {str(e)}")
        driver.save_screenshot(f"error_screenshot_{year}.png")
        print(f"Saved screenshot to error_screenshot_{year}.png")

    finally:
        driver.quit()
        print(f"Finished processing {year}")


Processing year: 2004
Page loaded, waiting for content...
Found 156 rows in the table
Initial page: Collected 50 players
Clicked next page button
Found 156 rows in the table
Page 1: Collected 50 players (Total: 100)
Clicked next page button
Found 156 rows in the table
Page 2: Collected 50 players (Total: 150)
Clicked next page button
Found 48 rows in the table
Page 3: Collected 14 players (Total: 164)
Clicked next page button
Found 48 rows in the table
Page 4: Collected 14 players (Total: 178)
Clicked next page button
Found 48 rows in the table
Page 5: Collected 14 players (Total: 192)
Clicked next page button
Found 48 rows in the table
Page 6: Collected 14 players (Total: 206)
Clicked next page button
Found 48 rows in the table
Page 7: Collected 14 players (Total: 220)
Clicked next page button
Found 48 rows in the table
Page 8: Collected 14 players (Total: 234)
Clicked next page button
Found 48 rows in the table
Page 9: Collected 14 players (Total: 248)
Clicked next page button
Found