# Scraping Season Stats for Players from FBREF

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Creating the scraping function
# This function returns a DataFrame
def fetch_data(url, timeout=30):
    """Download one FBREF player-stats page and return its table as a DataFrame.

    The page's two-layer table header is flattened into single column names of
    the form ``<over_header>_<header>``; dashes and spaces are replaced by
    underscores.

    Args:
        url: Address of the FBREF page to scrape.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A DataFrame of strings with one row per table row, without the
        'Matches' column and with 'Age' reduced to the part before the dash.
        'Nation' and 'Comp' are cleaned by ``extract_uppercase``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        ValueError: If the page contains no table data rows.
    """
    # Browser-like headers. No 'if-modified-since' is sent on purpose: a
    # conditional request may be answered with "304 Not Modified" and an empty
    # body, which would leave nothing to parse.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
        'cache-control': 'max-age=0',
        'priority': 'u=0, i',
        'referer': 'https://fbref.com/en/comps/Big5/Big-5-European-Leagues-Stats',
        'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    }

    # Without a timeout a stalled connection would hang the whole run.
    response = requests.get(
        url=url,
        headers=headers,
        timeout=timeout,
    )
    # Surface blocked / rate-limited / failed requests here instead of
    # failing later with a confusing parsing error.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # FBREF tables have column names in 2 layers; the upper layer is exposed
    # on each lower-layer cell as 'data-over-header'. Combine both layers
    # into a single column name.
    header_row = []
    for th in soup.select("thead tr:not(.over_header) th"):
        over_header = th.get('data-over-header', '').replace('-', '_').replace(' ', '_')
        current_header = th.get_text(strip=True).replace('-', '_').replace(' ', '_')
        if over_header:
            new_header = f"{over_header}_{current_header}"
        else:
            new_header = current_header
        header_row.append(new_header)

    # Fetching data in the rows. Rows without any <td> yield an empty list;
    # they become all-empty DataFrame rows and are dropped below.
    rows = []
    for row in soup.select("tbody tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        rows.append(cells)

    # Use the first row that actually holds data as the reference width.
    first_data_row = next((cells for cells in rows if cells), None)
    if first_data_row is None:
        raise ValueError(f"No table data rows found at URL: {url}")

    # If the header row has an extra entry, remove its first item so that the
    # header and the data cells line up.
    if len(header_row) > len(first_data_row):
        header_row.pop(0)

    # Now we have data (rows) and columns (header_row): build the DataFrame.
    df = pd.DataFrame(rows, columns=header_row)
    df.dropna(how='all', inplace=True)

    # Editing for 'Nation' and 'Comp' columns
    df = extract_uppercase(df)

    # Removing 'Matches' column; tolerate pages that do not have it.
    df = df.drop(columns=['Matches'], errors='ignore')

    # Age column is like '23-190', so we are taking just '23' in here
    if 'Age' in df.columns:
        df['Age'] = df['Age'].str.split('-', expand=True)[0]

    print(f"Done! -> URL: {url}")

    return df

# This function edits the data in the 'Nation' and 'Comp' columns
def extract_uppercase(df):
    """Clean the 'Nation' and 'Comp' columns of a scraped FBREF table.

    Both columns are modified in place when present; columns that are absent
    are left alone.

    Args:
        df: DataFrame whose 'Nation' / 'Comp' columns hold strings.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    # 'eng ENG' -> 'ENG': keep the first run of uppercase letters.
    if 'Nation' in df.columns:
        df['Nation'] = df['Nation'].str.extract(r'([A-Z]+)')[0]

    # 'es La Liga' -> 'La Liga', 'fr Ligue 1' -> 'Ligue 1'.
    # Keep everything from the first uppercase letter onwards so that digits
    # in a competition name survive, then trim surrounding whitespace.
    # Values without any uppercase letter become NaN.
    if 'Comp' in df.columns:
        df['Comp'] = df['Comp'].str.extract(r'([A-Z].*)')[0].str.strip()

    return df

# FBREF "Big 5" player pages to scrape, one per statistics category
urls = [
    'https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats',
    'https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats',
]

# Scrape every page, keeping the resulting tables in the order of `urls`
dfs = [fetch_data(url) for url in urls]

# Place the tables side by side (rows are matched on their index labels),
# then keep only the first occurrence of each repeated column name
final_df = pd.concat(dfs, axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]

# Write the combined table to disk as CSV without the index column
final_df.to_csv('fbref_player_stats.csv', index=False, encoding="utf-8-sig")

Done! -> URL: https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats
Done! -> URL: https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats
