In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

_TM_BASE_URL = "https://www.transfermarkt.us"


def _get_soup(url, headers, timeout):
    """Download ``url`` and return the parsed page.

    Raises ``requests.HTTPError`` on a 4xx/5xx reply so an error page is never
    parsed as if it were real data.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def _squad_url(team_href, season_id):
    """Build the detailed-squad ("kader") URL for a club link, or None if the href is too short."""
    parts = team_href.split('/')
    # Assumes club hrefs look like /<slug>/startseite/verein/<id>[/...], so the
    # slug is segment 1 and the club id is segment 4 — TODO confirm against live markup.
    if len(parts) < 5 or not parts[1] or not parts[4]:
        return None
    return f"{_TM_BASE_URL}/{parts[1]}/kader/verein/{parts[4]}/saison_id/{season_id}/plus/1"


def fetch_player_basic_data(headers, season_id=2023, timeout=30):
    """Scrape name, profile URL, team and position for every player in the listed competitions.

    Walks three levels of pages: the national competitions overview, each
    competition found under a "Tier" heading row, and each club's squad page.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with every request.
    season_id : int or str, optional
        Season used in the overview and squad URLs. Defaults to 2023.
    timeout : float, optional
        Per-request timeout in seconds. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per player row with the keys ``'Team'``, ``'Name'``, ``'URL'``
        and ``'Position'``.

    Raises
    ------
    requests.RequestException
        If the competitions overview page cannot be downloaded.
    ValueError
        If the overview page does not contain the expected ``items`` table.

    Notes
    -----
    Competition and squad pages that fail to download are skipped with a
    message instead of aborting the whole scrape.
    """
    # URL for the national competitions page
    country_competitions_url = (
        f"{_TM_BASE_URL}/wettbewerbe/national/wettbewerbe/189/saison_id/{season_id}"
    )
    soup = _get_soup(country_competitions_url, headers, timeout)

    competition_table = soup.find('table', class_='items')
    if competition_table is None:
        raise ValueError(
            f"No competitions table found at {country_competitions_url}; "
            "the page layout may have changed or the request was blocked."
        )

    # Find all competition links: they sit in the row right after each "Tier" heading row.
    competitions = []
    for row in competition_table.find_all('tr'):
        header = row.find('td', class_='extrarow')
        if header is None or 'Tier' not in header.text:
            continue
        links_row = row.find_next_sibling('tr')
        if links_row is None:
            continue
        for link in links_row.find_all('a', href=True, title=True):
            if 'wettbewerb' in link['href']:
                competition_url = f"{_TM_BASE_URL}{link['href']}"
                # The same competition may be linked more than once in a row.
                if competition_url not in competitions:
                    competitions.append(competition_url)

    # Loop through each competition and scrape team data
    all_player_data = []
    for competition_url in tqdm(competitions, desc="Competitions"):
        try:
            competition_soup = _get_soup(competition_url, headers, timeout)
        except requests.RequestException as exc:
            tqdm.write(f"Skipping competition {competition_url}: {exc}")
            continue

        # NOTE(review): the recorded run of this notebook scraped zero players.
        # If that persists after the squad-URL fix, verify that club links
        # still carry the 'vereinprofil_tooltip' class.
        squads = {}  # squad URL -> team name, so each club is fetched only once
        for team in competition_soup.find_all('a', href=True, class_='vereinprofil_tooltip'):
            squad_url = _squad_url(team['href'], season_id)
            if squad_url is None:
                continue
            # Keep the first non-empty link text seen for this club.
            if not squads.get(squad_url):
                squads[squad_url] = team.text.strip()

        for squad_url, team_name in squads.items():
            try:
                team_soup = _get_soup(squad_url, headers, timeout)
            except requests.RequestException as exc:
                tqdm.write(f"Skipping squad {squad_url}: {exc}")
                continue

            for player in team_soup.find_all('tr', class_=['odd', 'even']):
                # First site-relative link in the row is taken as the player's profile link.
                name_tag = player.select_one('a[href^="/"]')
                if name_tag is None:
                    continue

                # The position is read from the first <td> following the 'posrela' cell.
                position_anchor = player.find('td', class_='posrela')
                position_cell = (
                    position_anchor.find_next('td') if position_anchor is not None else None
                )
                position = (
                    position_cell.text.strip()
                    if position_cell is not None
                    else 'Position Not Found'
                )

                all_player_data.append({
                    'Team': team_name,
                    'Name': name_tag.text.strip(),
                    'URL': "https://www.transfermarkt.us" + name_tag['href'],
                    'Position': position,
                })

    return all_player_data


In [2]:
def _search_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None if nothing matches."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def fetch_detailed_player_data(player_url, headers, timeout=30):
    """Scrape the detail fields of a single player profile page.

    Parameters
    ----------
    player_url : str
        Absolute URL of the player's profile page.
    headers : dict
        HTTP headers sent with the request.
    timeout : float, optional
        Request timeout in seconds. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the columns ``Name``, ``Number``,
        ``Contract Expiry``, ``Foot``, ``Agent``, ``Outfitter``,
        ``Citizenship``, ``Contract Start Date`` and ``Birthplace``. A field
        that cannot be found on the page is ``None``.
    """
    response = requests.get(player_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # Serialise the document once; every regex below searches this same string.
    page_html = str(soup)

    # The headline holds the name on its last line; missing elements yield None.
    headline = soup.select_one('h1[class="data-header__headline-wrapper"]')
    player_name = headline.text.split('\n')[-1].strip() if headline is not None else None

    # Shirt number is rendered with a leading '#', which is stripped.
    shirt_number = soup.select_one('span[class="data-header__shirt-number"]')
    player_number = (
        shirt_number.text.replace('#', '').strip() if shirt_number is not None else None
    )

    # Label/value pairs are pulled from the raw HTML by regex.
    player_contract_expiry = _search_group(
        r"Contract expires: .*__content\">(.*?)</span>", page_html
    )
    player_foot = _search_group(
        r"Foot:</span>\s*<span class=\"info-table__content info-table__content--bold\">(.*?)</span>",
        page_html,
    )
    player_agent = _search_group(
        r"Player agent:</span>\s*<span[^>]*>\s*<a[^>]*>([^<]+)</a>", page_html
    )
    player_outfitter = _search_group(
        r"Outfitter:</span>\s*<span class=\"info-table__content info-table__content--bold\">\s*(.*?)\s*</span>",
        page_html,
    )
    player_citizenship = _search_group(
        r"Citizenship:</span>[\s\S]*?alt=\"([^\"]+)\"", page_html
    )
    player_contract_start = _search_group(
        r"Joined:</span>\s*<span[^>]*>\s*([^<]+)</span>", page_html
    )

    # Birthplace: city from the birthPlace span, plus the title of the nearest
    # preceding flag image when that image has one.
    player_birthplace = None
    birthplace_span = soup.find('span', itemprop="birthPlace")
    if birthplace_span is not None:
        city = birthplace_span.text.strip()
        country_img = birthplace_span.find_previous('img', class_="flaggenrahmen")
        if country_img is not None and country_img.has_attr('title'):
            player_birthplace = f"{city}, {country_img['title'].strip()}"
        else:
            # If no country, just leave as city
            player_birthplace = city

    # Organize data into a dictionary
    player_data = {
        'Name': player_name,
        'Number': player_number,
        'Contract Expiry': player_contract_expiry,
        'Foot': player_foot,
        'Agent': player_agent,
        'Outfitter': player_outfitter,
        'Citizenship': player_citizenship,
        'Contract Start Date': player_contract_start,
        'Birthplace': player_birthplace
    }

    # Create DataFrame
    return pd.DataFrame([player_data])

In [3]:
# Request headers sent with every page fetch (browser-style User-Agent string).
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Fetch basic data
players_basic_data = fetch_player_basic_data(headers)

# Fetch detailed data for each player
final_data = []
for player in tqdm(players_basic_data, desc="Fetching Detailed Player Data"):
    detailed_data = fetch_detailed_player_data(player['URL'], headers)
    # fetch_detailed_player_data returns a one-row DataFrame. Passing it straight
    # to dict.update() would store a whole Series under each key, so convert the
    # single row to a plain {column: value} dict before merging.
    player.update(detailed_data.to_dict(orient="records")[0])
    final_data.append(player)

# Create DataFrame
df = pd.DataFrame(final_data)
print(df.head())

Competitions: 100%|███████████████████████████████| 6/6 [00:20<00:00,  3.34s/it]
Fetching Detailed Player Data: 0it [00:00, ?it/s]

Empty DataFrame
Columns: []
Index: []





NameError: name 'df' is not defined