In [None]:
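# Maps each primary position code to the id of its full scouting-report table,
# and defines the helper that reads a player's primary position from the page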

# Position code -> HTML id of the scouting table used for that position.
# Every id is the position code prefixed with 'scout_full_'.
POSITION_TABLE_MAP = {
    code: f'scout_full_{code}'
    for code in ('FW', 'AM', 'MF', 'FB', 'CB', 'GK')
}

def get_primary_position(soup):
    """Return the first position listed after the 'Position:' label, or None.

    Looks up ``<div id="meta">`` in ``soup``, finds the ``<strong>`` whose
    text is exactly ``'Position:'`` and reads the text node that directly
    follows it. Only the part before the first comma is returned, stripped
    of surrounding whitespace.

    Args:
        soup: Parsed page (a BeautifulSoup object or anything exposing the
            same ``find`` interface).

    Returns:
        The primary position as a string, or None when the meta block, the
        label, or a non-empty text node after the label is missing.

    NOTE(review): a later cell defines this function again; whichever cell
    runs last is the definition actually used.
    """
    meta = soup.find('div', id='meta')
    if meta is None:
        return None

    pos_strong = meta.find('strong', string='Position:')
    if pos_strong is None:
        return None

    # next_sibling is None when nothing follows the label and may be a tag
    # rather than text; neither supports the string handling below.
    # bs4's NavigableString is a str subclass, so isinstance(..., str) works.
    position_node = pos_strong.next_sibling
    if not isinstance(position_node, str):
        return None

    primary_position = position_node.split(',')[0].strip()
    # Treat an empty text node the same as "no position found"
    return primary_position or None


In [None]:
# Gets the player positions for scraping
def get_primary_position(soup):
    """Extract the player's primary position from the page's meta block.

    The position is read from the text node immediately after the
    ``<strong>Position:</strong>`` label inside ``<div id="meta">``; when
    several comma-separated positions are listed, only the first is kept.

    Args:
        soup: Parsed page (a BeautifulSoup object or anything exposing the
            same ``find`` interface).

    Returns:
        The primary position string, or None if the meta block, the label,
        or a non-empty text node following the label cannot be found.

    NOTE(review): this redefines get_primary_position from an earlier cell
    and shadows it once this cell runs; consider keeping a single definition.
    """
    meta = soup.find('div', id='meta')
    if meta is None:
        return None

    pos_strong = meta.find('strong', string='Position:')
    if pos_strong is None:
        return None

    # The label may be the last node (next_sibling is None) or be followed by
    # another tag; only a text node (a str subclass in bs4) can be parsed.
    position_text = pos_strong.next_sibling
    if not isinstance(position_text, str):
        return None

    primary_position = position_text.split(',')[0].strip()

    # An empty result carries no position, so report it as "not found"
    return primary_position or None

In [1]:
# IMPORTS AND SETUPS
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Path to the MSEdgeDriver executable; scrape_player_data passes it to
# selenium's Service(executable_path=...).
# NOTE(review): this is a bare file name, so it presumably has to sit in the
# notebook's working directory — confirm before running elsewhere.
edge_driver_path = "msedgedriver.exe"


def scrape_player_data(player_url):
    """Scrape one player's full scouting table into a DataFrame.

    Opens ``player_url`` in a Selenium-driven Edge browser, detects the
    player's primary position, waits for the scouting table mapped to that
    position in POSITION_TABLE_MAP, and parses that table with pandas.

    Args:
        player_url: URL of the player's scouting-report page.

    Returns:
        The first DataFrame parsed from the scouting table, or None when the
        position has no mapped table, the table does not appear within the
        wait timeout, or the table cannot be parsed.

    The browser is always shut down before returning, including when an
    unexpected exception propagates.
    """
    # 1. INITIALIZE DRIVER
    service = Service(executable_path=edge_driver_path)
    options = webdriver.EdgeOptions()
    # options.add_argument("--headless") # Uncomment to not have browser window pop-up
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Edge(service=service, options=options)

    try:
        print(f"Opening URL with Selenium (Edge): {player_url}")
        driver.get(player_url)

        # Parse the page once up front, only to detect the player's position
        soup_selenium = BeautifulSoup(driver.page_source, 'html.parser')
        player_position = get_primary_position(soup_selenium)
        print(f"Detected position: {player_position}")

        # 2. WAIT FOR CORRECT SCOUTING TABLE TO LOAD
        table_id = POSITION_TABLE_MAP.get(player_position)
        if not table_id:
            print(f"No complete scouting table for position {player_position}")
            return None

        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, table_id))
            )
            print(f"Table element '{table_id}' found.")
            time.sleep(2)  # Ethical scraping delay
        except Exception as e:
            print(f"ERROR: Could not find table {table_id} for {player_url}. Error: {e}")
            return None

        # 3. SCRAPE DATA
        # Re-read the page source: the table may not have been present in the
        # snapshot taken before the wait.
        soup_selenium = BeautifulSoup(driver.page_source, 'html.parser')
        target_table_selenium = soup_selenium.find('table', {'id': table_id})

        # 4. PARSE TABLE INTO DATAFRAME
        if target_table_selenium is not None:
            try:
                # Wrap the markup in StringIO: passing literal HTML strings to
                # read_html is deprecated in recent pandas versions.
                df_list = pd.read_html(StringIO(str(target_table_selenium)))
            except ValueError as e:
                # read_html raises ValueError when it finds no parsable table
                print(f"ERROR: pandas could not read table {table_id}. Error: {e}")
                df_list = []

            if df_list:
                player_df = df_list[0]
                print(f"Successfully scraped DataFrame of shape: {player_df.shape}")
                return player_df

        print("ERROR: Failed to parse table into a DataFrame.")
        return None
    finally:
        # 5. QUIT DRIVER SAFELY
        # Runs on every exit path; a failing quit must not replace the result
        # (or the original exception), so it is reported instead of raised.
        try:
            driver.quit()
        except Exception as quit_error:
            print(f"WARNING: Failed to quit the Edge driver cleanly: {quit_error}")


In [5]:
# MULTI-PLAYER LOOP AND DATA AGGREGATION
# Scrapes every URL in player_urls with scrape_player_data, stacks the
# resulting tables into one DataFrame and writes it to a CSV file.

# 1. LIST OF PLAYER URLs
player_urls = [
    "https://fbref.com/en/players/1f44ac21/scout/12524/Erling-Haaland-Scouting-Report",
    "https://fbref.com/en/players/b282337d/scout/12524/Cole-Palmer-Scouting-Report",
    "https://fbref.com/en/players/e342ad68/scout/12524/Mohamed-Salah-Scouting-Report",
    # Add more player URLs here later
]

# Pause between consecutive page requests, to be polite to the website
DELAY_BETWEEN_PLAYERS_SECONDS = 10

# 2. EXECUTE THE LOOP
all_player_data = []

for index, url in enumerate(player_urls):
    # Wait before every request except the first. A failed scrape has still
    # hit the site, so the pause applies whether or not the previous player
    # succeeded, and no time is wasted after the final player.
    if index > 0:
        time.sleep(DELAY_BETWEEN_PLAYERS_SECONDS)

    df = scrape_player_data(url)

    # Adds DataFrame if scraping was successful
    if df is not None:
        all_player_data.append(df)

# 3. COMBINE AND SAVE DATA
if all_player_data:
    # Concatenate all individual player DataFrames into one DataFrame
    # NOTE(review): the frames are stacked without a column identifying the
    # source player/URL — confirm downstream analysis does not need one.
    final_scouting_df = pd.concat(all_player_data, ignore_index=True)

    # Save the combined DataFrame
    output_filename = "all_player_scouting_report.csv"
    final_scouting_df.to_csv(output_filename, index=False)

    print("\n----------------------------------------------------")
    print("Multi-Player Scraping Complete!")
    print(f"Total rows scraped: {final_scouting_df.shape[0]}")
    print(f"Combined data saved to {output_filename}")
else:
    print("No player data was successfully scraped.")

In [None]:
# Flatten the column headers of player_scout_df and coerce selected stat
# columns to numbers.
# NOTE(review): player_scout_df is not created in any visible cell; on a fresh
# kernel this cell fails with NameError until that frame is defined upstream.
player_scout_df = player_scout_df.copy()

# Join multi-level headers into single 'Level0_Level1' names. Guarded so that
# re-running the cell on already-flat string columns does not join the
# individual characters of each name with underscores.
if isinstance(player_scout_df.columns, pd.MultiIndex):
    player_scout_df.columns = ['_'.join(col).strip() for col in player_scout_df.columns.values]

# Replace missing values (NaN) with 0; non-numeric text is left untouched here
player_scout_df = player_scout_df.fillna(0)

columns_to_convert = ['Standard_Gls', 'Standard_Ast', 'Performance_SCA', 'Performance_GCA']

for col in columns_to_convert:
    # errors='coerce' turns values that cannot be parsed as numbers into NaN
    player_scout_df[col] = pd.to_numeric(player_scout_df[col], errors='coerce')

# Replace the NaN values introduced by the coercion above with 0
player_scout_df = player_scout_df.fillna(0)

In [14]:
# Load a saved single-player scouting report from disk.
# NOTE(review): no visible cell writes 'erling_haaland_scouting_report.csv'
# (the scraping loop saves 'all_player_scouting_report.csv') — confirm where
# this file comes from.
# NOTE(review): in the recorded output, row 0 holds 'Statistic / Per 90 /
# Percentile', which suggests the file has a second header line that is being
# read as data; header=[0, 1] may be what is wanted — confirm.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='erling_haaland_scouting_report.csv')

# Preview the top of the frame (head() returns five rows by default)
print(df.head(n=5))

         Standard Stats Standard Stats.1 Standard Stats.2
0             Statistic           Per 90       Percentile
1     Non-Penalty Goals             0.63             92.0
2  npxG: Non-Penalty xG             0.62             94.0
3           Shots Total             3.42             88.0
4               Assists             0.10             34.0
