In [21]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Define user agent header to avoid being blocked
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# URL for the national competitions page
country_competitions_url = "https://www.transfermarkt.us/wettbewerbe/national/wettbewerbe/189/saison_id/2023"

# Fetching competition data
response = requests.get(country_competitions_url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Find all competitions links
competitions = []
competition_table = soup.find('table', class_='items')
for row in competition_table.find_all('tr'):
    header = row.find('td', class_='extrarow')
    if header and 'Tier' in header.text:
        links = row.find_next_sibling('tr').find_all('a', href=True, title=True)
        for link in links:
            if 'wettbewerb' in link['href']:
                competitions.append({
                    'name': link.get_text(strip=True),
                    'url': f"https://www.transfermarkt.us{link['href']}"
                })

# Initialize an empty list to collect all player data from all competitions
all_player_data = []

# Loop through each competition and scrape team data
with tqdm(total=len(competitions), desc="Competitions") as comp_progress:
    for competition in competitions:
        response = requests.get(competition['url'], headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract teams information
        teams_data = []
        table = soup.find('table', class_='items')
        rows = table.find_all('tr', class_=['odd', 'even'])

        for row in rows:
            team_link = row.find('a', href=True)
            team_name = team_link.text.strip()
            team_id = team_link['href'].split('/')[4]
            team_url = team_link['href'].rsplit('/', 1)[0]

            teams_data.append({
                "team_name": team_name,
                "team_id": team_id,
                "team_url": team_url,
                "transfermarkt_league_id": competition['url'].split('/')[-1]
            })

        # Fetch player data for each team
        for team in teams_data:
            team_url = f"https://www.transfermarkt.us{team['team_url']}/kader/verein/{team['team_id']}/saison_id/2023/plus/1"
            response = requests.get(team_url, headers=headers)
            soup = BeautifulSoup(response.content, "html.parser")

            # Find all player rows in the table
            players = soup.find_all('tr', class_=['odd', 'even'])

            # Extract details for each player
            for player in players:
                name_tag = player.select_one('a[href^="/"]')
                if name_tag:
                    name = name_tag.text.strip()
                    player_url = "https://www.transfermarkt.us" + name_tag['href']

                    # Correct extraction of position
                    position_tag = player.find('td', class_='posrela').find_next('tr')
                    if position_tag:
                        position = position_tag.find('td').text.strip() if position_tag.find('td') else 'Position Not Found'

                    all_player_data.append({
                        'competition': competition['name'],
                        'team': team['team_name'],
                        'name': name,
                        'position': position,
                        'url': player_url
                    })
        comp_progress.update(1)

# Convert list to DataFrame
player_df = pd.DataFrame(all_player_data)

Competitions: 100%|███████████████████████████████| 6/6 [07:28<00:00, 74.82s/it]


Unnamed: 0,Competition,Team,Name,Position,URL
0,Premier League,,Ederson,,https://www.transfermarkt.us/ederson/profil/sp...
1,Premier League,,Stefan Ortega,,https://www.transfermarkt.us/stefan-ortega/pro...
2,Premier League,,Scott Carson,,https://www.transfermarkt.us/scott-carson/prof...
3,Premier League,,Rúben Dias,,https://www.transfermarkt.us/ruben-dias/profil...
4,Premier League,,,,https://www.transfermarkt.us/rb-leipzig/starts...
...,...,...,...,...,...
3694,National League South,,,,https://www.transfermarkt.us/newport-county/st...
3695,National League South,,,,https://www.transfermarkt.us/oldham-athletic/s...
3696,National League South,,Aaron Jarvis,,https://www.transfermarkt.us/aaron-jarvis/prof...
3697,National League South,,,,https://www.transfermarkt.us/weymouth-fc/start...


In [25]:
import numpy as np

player_df['Name'].replace('', np.nan, inplace=True)

# Drop rows where 'Name' is null
player_df.dropna(subset=['Name'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  player_df['Name'].replace('', np.nan, inplace=True)


Unnamed: 0,Competition,Team,Name,Position,URL
0,Premier League,,Ederson,,https://www.transfermarkt.us/ederson/profil/sp...
1,Premier League,,Stefan Ortega,,https://www.transfermarkt.us/stefan-ortega/pro...
2,Premier League,,Scott Carson,,https://www.transfermarkt.us/scott-carson/prof...
3,Premier League,,Rúben Dias,,https://www.transfermarkt.us/ruben-dias/profil...
5,Premier League,,Manuel Akanji,,https://www.transfermarkt.us/manuel-akanji/pro...
...,...,...,...,...,...
3687,National League South,,Daniel Martin,,https://www.transfermarkt.us/daniel-martin/pro...
3689,National League South,,Brett McGavin,,https://www.transfermarkt.us/brett-mcgavin/pro...
3690,National League South,,Tom Lapslie,,https://www.transfermarkt.us/tom-lapslie/profi...
3691,National League South,,Asa Hall,,https://www.transfermarkt.us/asa-hall/profil/s...


In [34]:
def get_detailed_player_data(url, headers=headers):
    # Get player id from url
    player_id = url.split('/')[-1]

    # Make request to webpage
    response = requests.get(url, headers=headers)

    # Create soup and parse html
    soup = BeautifulSoup(response.content, "html.parser")
    
    try:
        # Use css selectors to find class and then get text, remove whitespace, remove null
        player_name = soup.select_one('h1[class="data-header__headline-wrapper"]').text.split('\n')[-1].strip()
    except AttributeError:
        player_name = None

    try:
        # Use css selectors to find class and then get text, remove #, remove null
        player_number = soup.select_one('span[class="data-header__shirt-number"]').text.replace('#', '').strip()
    except AttributeError:
        player_number = None

    try:
        player_contract_expiry = re.search(r"Contract expires: .*__content\">(.*?)</span>", str(soup)).group(1)
    except AttributeError:
        player_contract_expiry = None

    try:
        player_foot = re.search(r"Foot:</span>\s*<span class=\"info-table__content info-table__content--bold\">(.*?)</span>", str(soup)).group(1)
    except AttributeError:
        player_foot = None

    try:
        player_agent = re.search(r"Player agent:</span>\s*<span[^>]*>\s*<a[^>]*>([^<]+)</a>", str(soup)).group(1)
    except AttributeError:
        player_agent = None

    try:
        player_outfitter = re.search(r"Outfitter:</span>\s*<span class=\"info-table__content info-table__content--bold\">\s*(.*?)\s*</span>", str(soup)).group(1)
    except AttributeError:
        player_outfitter = None

    try:
        player_citizenship = re.search(r"Citizenship:</span>[\s\S]*?alt=\"([^\"]+)\"", str(soup)).group(1)
    except AttributeError:
        player_citizenship = None

    try:
        player_contract_start = re.search(r"Joined:</span>\s*<span[^>]*>\s*([^<]+)</span>", str(soup)).group(1)
    except AttributeError:
        player_contract_start = None

    try:
        # Find the span that directly contains birthplace
        birthplace_span = soup.find('span', itemprop="birthPlace")
        if birthplace_span:
            city = birthplace_span.text.strip()
            country_img = birthplace_span.find_previous('img', class_="flaggenrahmen")
            if country_img and country_img.has_attr('title'):
                country = country_img['title'].strip()
                player_birthplace = f"{city}, {country}"
            else:
                # If no country, just leave as city
                player_birthplace = city
        else:
            player_birthplace = None
    except AttributeError:
        player_birthplace = None

    # Organize data into a dictionary
    player_data = {
        'player_id': player_id
        'player_name': player_name,
        'number': player_number,
        'contract_expiry_date': player_contract_expiry,
        'foot': player_foot,
        'agent': player_agent,
        'outfitter': player_outfitter,
        'citizenship': player_citizenship,
        'contract_start_date': player_contract_start,
        'birthplace': player_birthplace
    }

    # Create DataFrame
    player_df = pd.DataFrame([player_data])
    
    market_value_df = get_market_value_data(player_id, headers)
    transfer_history_df = get_transfer_history_data(player_id, headers)
    
    player_df['market_value'] = [market_value_df]  # Enclose in list to store as an object
    player_df['transfer_history'] = [transfer_history_df]

    return player_df

def get_market_value_data(player_id, headers=headers):
    """Fetch a player's market-value development and return it as a DataFrame.

    Calls the ``ceapi/marketValueDevelopment/graph`` endpoint for ``player_id``
    and reads the entries under the ``list`` key of the JSON response (no
    entries if the key is absent). Each entry becomes one row with the columns
    ``age``, ``team_name``, ``date`` and ``market_value``.
    """
    endpoint = f'https://www.transfermarkt.us/ceapi/marketValueDevelopment/graph/{player_id}'
    payload = requests.get(endpoint, headers=headers).json()

    # Output column -> key of the same value in each API entry.
    column_sources = {
        'age': 'age',
        'team_name': 'verein',
        'date': 'datum_mw',
        'market_value': 'mw',
    }

    records = []
    for entry in payload.get('list', []):
        records.append({column: entry[source] for column, source in column_sources.items()})

    return pd.DataFrame(records)

def get_transfer_history_data(player_id, headers=headers):
    """Fetch a player's transfer history and return it as a DataFrame.

    Calls the ``ceapi/transferHistory/list`` endpoint for ``player_id`` and
    reads the entries under the ``transfers`` key of the JSON response (no
    entries if the key is absent). Each entry becomes one row with the columns
    ``date``, ``season``, ``market_value``, ``transfer_fee``,
    ``from_club_name`` and ``to_club_name``.
    """
    endpoint = f'https://www.transfermarkt.us/ceapi/transferHistory/list/{player_id}'
    payload = requests.get(endpoint, headers=headers).json()

    def as_record(entry):
        # Flatten one API entry, renaming its fields to the output columns.
        record = {}
        record['date'] = entry['dateUnformatted']
        record['season'] = entry['season']
        record['market_value'] = entry['marketValue']
        record['transfer_fee'] = entry['fee']
        record['from_club_name'] = entry['from']['clubName']
        record['to_club_name'] = entry['to']['clubName']
        return record

    records = [as_record(entry) for entry in payload.get('transfers', [])]
    return pd.DataFrame(records)

SyntaxError: invalid syntax (2707085819.py, line 73)

In [38]:
# Enrich every player row with the details scraped from that player's profile page.
# new_rows must exist before the loop appends to it (it was never initialised).
new_rows = []
for _, row in tqdm(player_df.iterrows(), total=len(player_df), desc="Players"):
    detailed_df = get_detailed_player_data(row['URL'], headers)

    # Merge the original columns with the detail columns into one record.
    # Building a plain dict avoids writing new keys into the Series handed out
    # by iterrows(); on a name clash the detail value wins, as before.
    new_rows.append({**row.to_dict(), **detailed_df.iloc[0].to_dict()})

# Convert the list of merged records back to a DataFrame, keeping player_df's index.
updated_player_df = pd.DataFrame(new_rows, index=player_df.index)

# Display the updated DataFrame
display(updated_player_df)

NameError: name 'get_detailed_player_data' is not defined