In [13]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.transfermarkt.us/ederson/profil/spieler/238223'

# Get player id from url: it is the last path segment. Trailing slashes are
# stripped first so that '.../spieler/238223/' does not yield an empty id.
player_id = url.rstrip('/').split('/')[-1]

# Send a browser-like User-Agent header in order to avoid being blocked by transfermarkt.
# Your own browser's value can be looked up at
# https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Make request to webpage. requests has no default timeout, so set one to keep
# the cell from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on a 4xx/5xx answer (e.g. when the request is blocked) instead of
# parsing an error page and silently ending up with empty fields downstream.
response.raise_for_status()

# Create soup and parse html
soup = BeautifulSoup(response.content, "html.parser")

In [14]:
def _first_group(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text`, or None when nothing matches."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


# Serialize the parsed page once; every regex lookup below searches this same string
# (previously str(soup) was rebuilt for each field).
page_html = str(soup)

try:
    # Select the headline by its class, take the last newline-separated piece of its text and strip whitespace
    player_name = soup.select_one('h1[class="data-header__headline-wrapper"]').text.split('\n')[-1].strip()
except AttributeError:
    # select_one returned None: the headline element is not on the page
    player_name = None

try:
    # Select the shirt-number span by its class, drop the '#' and strip whitespace
    player_number = soup.select_one('span[class="data-header__shirt-number"]').text.replace('#', '').strip()
except AttributeError:
    # select_one returned None: the shirt-number element is not on the page
    player_number = None

# Regex-based fields: each pattern anchors on the field's label text and captures the value
# that follows it in the markup. A field whose pattern does not match is set to None.
player_contract_expiry = _first_group(r"Contract expires: .*__content\">(.*?)</span>", page_html)
player_foot = _first_group(r"Foot:</span>\s*<span class=\"info-table__content info-table__content--bold\">(.*?)</span>", page_html)
player_agent = _first_group(r"Player agent:</span>\s*<span[^>]*>\s*<a[^>]*>([^<]+)</a>", page_html)
player_outfitter = _first_group(r"Outfitter:</span>\s*<span class=\"info-table__content info-table__content--bold\">\s*(.*?)\s*</span>", page_html)
# Citizenship is read from the alt attribute of the first tag carrying one after the label
player_citizenship = _first_group(r"Citizenship:</span>[\s\S]*?alt=\"([^\"]+)\"", page_html)
player_contract_start = _first_group(r"Joined:</span>\s*<span[^>]*>\s*([^<]+)</span>", page_html)

try:
    # Find the span that directly contains birthplace
    birthplace_span = soup.find('span', itemprop="birthPlace")
    if birthplace_span:
        city = birthplace_span.text.strip()
        # NOTE(review): find_previous walks backwards through the document, so this picks the
        # nearest flag image located *before* the birthplace span - confirm that this is the
        # birthplace country's flag for every player layout.
        country_img = birthplace_span.find_previous('img', class_="flaggenrahmen")
        if country_img and country_img.has_attr('title'):
            country = country_img['title'].strip()
            player_birthplace = f"{city}, {country}"
        else:
            # If no country, just leave as city
            player_birthplace = city
    else:
        player_birthplace = None
except AttributeError:
    player_birthplace = None

# Organize data into a dictionary
player_data = {
    'Name': player_name,
    'Number': player_number,
    'Contract Expiry': player_contract_expiry,
    'Foot': player_foot,
    'Agent': player_agent,
    'Outfitter': player_outfitter,
    'Citizenship': player_citizenship,
    'Contract Start Date': player_contract_start,
    'Birthplace': player_birthplace
}

# Create a single-row DataFrame (one column per field)
player_df = pd.DataFrame([player_data])

# Display the DataFrame to verify
display(player_df)

# TODO: add a regex for the player's current club

Unnamed: 0,Name,Number,Contract Expiry,Foot,Agent,Outfitter,Citizenship,Contract Start Date,Birthplace
0,Ederson,31,"Jun 30, 2026",left,Gestifute,Puma,Brazil,"Jul 1, 2017","Osasco (SP), Brazil"


In [15]:
# Call api endpoint to get market value development over time.
# A timeout is set because requests never times out by default.
market_value_response = requests.get(
    f'https://www.transfermarkt.us/ceapi/marketValueDevelopment/graph/{player_id}',
    headers=headers,
    timeout=30,
)
# Surface a 4xx/5xx answer as an HTTPError here rather than as a confusing
# JSON decoding failure on the error page below.
market_value_response.raise_for_status()
market_value_data = market_value_response.json()

# Extract the list of data points needed (empty list when the 'list' key is absent)
market_value_list = market_value_data.get('list', [])

# Prepare the data by renaming fields and keeping only the ones of interest.
# Each record is indexed directly, so a data point lacking one of these keys raises KeyError.
market_value_cleaned_data = [
    {
        'age': value['age'],
        'team_name': value['verein'],
        'date': value['datum_mw'],
        'market_value': value['mw']
    }
    for value in market_value_list
]

market_value_df = pd.DataFrame(market_value_cleaned_data)
print(market_value_df.head())

  age   team_name          date market_value
0  19  Rio Ave FC   Apr 3, 2013        €300k
1  19  Rio Ave FC   Jul 2, 2013        €300k
2  20  Rio Ave FC  Jul 25, 2014        €900k
3  21  Rio Ave FC  Feb 17, 2015        €600k
4  21  SL Benfica   Jul 1, 2015       €1.20m


In [16]:
# Call api endpoint to get transfer history over time.
# A timeout is set because requests never times out by default.
transfer_history_response = requests.get(
    f'https://www.transfermarkt.us/ceapi/transferHistory/list/{player_id}',
    headers=headers,
    timeout=30,
)
# Surface a 4xx/5xx answer as an HTTPError here rather than as a confusing
# JSON decoding failure on the error page below.
transfer_history_response.raise_for_status()
transfer_history_data = transfer_history_response.json()

# Extract the list of data points needed (empty list when the 'transfers' key is absent)
transfer_history_list = transfer_history_data.get('transfers', [])

# Prepare the data by renaming fields and keeping only the ones of interest.
# Each record is indexed directly, so a transfer lacking one of these keys raises KeyError.
transfer_history_cleaned_data = [
    {
        'date': transfer['dateUnformatted'],
        'season': transfer['season'],
        'market_value': transfer['marketValue'],
        'transfer_fee': transfer['fee'],
        'from_club_name': transfer['from']['clubName'],
        'to_club_name': transfer['to']['clubName']
    }
    for transfer in transfer_history_list
]

transfer_history_df = pd.DataFrame(transfer_history_cleaned_data)
print(transfer_history_df.head())

         date season market_value   transfer_fee from_club_name to_club_name
0  2017-07-01  17/18      €22.00m        €40.00m        Benfica     Man City
1  2015-07-01  15/16       €1.20m          €500k        Rio Ave      Benfica
2  2012-07-01  12/13            -  free transfer    GD Ribeirão      Rio Ave
3  2011-07-01  11/12            -  free transfer    Benfica U19  GD Ribeirão
4  2010-07-01  10/11            -              -    Benfica U17  Benfica U19
