In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Browser-like request headers: every HTTP call in this notebook passes these
# so the request carries a desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/110.0.0.0 Safari/537.36"
    ),
}

# Team name -> academy ("jugendarbeit") squad page on Transfermarkt.
# The query string carries the filter the author selected for players who
# are currently playing.
TEAM_PAGES = {
    "Barcelona": (
        "https://www.transfermarkt.us/fc-barcelona/jugendarbeit/verein/131/plus/0/galerie/0"
        "?wettbewerb_id=gesamt&option=4&art=0"
    ),
    "Arsenal": (
        "https://www.transfermarkt.us/arsenal-fc/jugendarbeit/verein/11/plus/0/galerie/0"
        "?wettbewerb_id=gesamt&option=4&art=0"
    ),
}

def get_player_ids(team_name, team_url, timeout=15):
    """Scrape player IDs and positions from a team's academy squad page.

    Args:
        team_name: Label used in the progress and error messages.
        team_url: URL of the squad page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        dict mapping player name -> ``{"id": <str>, "position": <str>}``.
        An empty dict is returned when the page cannot be fetched. Because
        the dict is keyed by display name, two players with the same name
        overwrite each other (the last row wins).
    """
    print(f"Fetching players for {team_name}...")
    try:
        response = requests.get(team_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors / timeouts are reported the same way as a bad
        # status code so one unreachable page does not abort the whole run.
        print(f"Failed to fetch squad page for {team_name}. Error:", exc)
        return {}

    if response.status_code != 200:
        print(f"Failed to fetch squad page for {team_name}. Status Code:", response.status_code)
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    player_data = {}

    for row in soup.select("tr.bg_grey"):
        player_link = row.select_one("a[href*='/profil/spieler/']")
        # The selector targets the second row of the nested inline table,
        # which is where the position text is read from.
        position_td = row.select_one("td.links table.inline-table tr:nth-of-type(2) td")
        if player_link is None or position_td is None:
            continue

        # Take the path segment right after the profile marker instead of the
        # last "/"-separated piece, so a trailing slash, query string or
        # fragment on the href cannot end up in (or replace) the ID.
        tail = player_link["href"].partition("/profil/spieler/")[2]
        player_id = tail.split("/")[0].split("?")[0].split("#")[0]
        if not player_id:
            continue

        player_name = player_link.text.strip()
        player_position = position_td.text.strip()
        player_data[player_name] = {"id": player_id, "position": player_position}

    print(f"Extracted {len(player_data)} players from {team_name}: {player_data}")
    return player_data

def get_market_values(player_name, player_id, team_name, position, timeout=15):
    """Fetch the market value history for a single player.

    Args:
        player_name: Player name copied into every output row.
        player_id: Transfermarkt player ID used to build the API URL.
        team_name: Team label copied into every output row.
        position: Position label copied into every output row.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        list of dicts with the keys ``Team``, ``Player``, ``Position``,
        ``Date``, ``Value (€)`` and ``Club`` — one per entry in the
        response's ``"list"`` field. An empty list is returned when the
        request fails or the response is not the expected JSON object.
        Fields missing from an entry are recorded as ``None`` rather than
        raising.
    """
    api_url = f"https://www.transfermarkt.us/ceapi/marketValueDevelopment/graph/{player_id}"
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: report and move on.
        print(f"Failed to fetch market values for {player_name} ({team_name}). Error:", exc)
        return []

    if response.status_code != 200:
        print(f"Failed to fetch market values for {player_name} ({team_name}). Status Code:", response.status_code)
        return []

    try:
        data = response.json()
    except ValueError:
        # A 200 response can still carry a non-JSON body (e.g. an HTML
        # page); the JSON decode error raised here is a ValueError.
        print(f"Failed to parse market values for {player_name} ({team_name}). Response was not JSON.")
        return []

    if not isinstance(data, dict):
        print(f"Failed to parse market values for {player_name} ({team_name}). Unexpected JSON structure.")
        return []

    # `or []` also covers an explicit null under "list".
    entries = data.get("list") or []
    return [
        {
            "Team": team_name,
            "Player": player_name,
            "Position": position,
            "Date": entry.get("datum_mw"),
            "Value (€)": entry.get("y"),
            "Club": entry.get("verein"),
        }
        for entry in entries
    ]

def main():
    """Scrape market value histories for every team in TEAM_PAGES and save them.

    For each team, the squad page is scraped for players, then each player's
    market value history is fetched (pausing between players). All rows are
    collected into one DataFrame and written to ``Transfermarkt_values.csv``.
    When nothing was collected, a message is printed and no file is written.
    """
    all_market_values = []

    for team_name, team_url in TEAM_PAGES.items():
        players = get_player_ids(team_name, team_url)

        for player_name, player_info in players.items():
            print(f"Fetching market value data for {player_name} ({team_name})...")
            market_data = get_market_values(player_name, player_info['id'], team_name, player_info['position'])

            if market_data:
                all_market_values.extend(market_data)

            time.sleep(2)  # Pause between requests so we are less likely to get blocked

    if not all_market_values:
        print("No data retrieved. Check for scraping issues.")
        return

    df = pd.DataFrame(all_market_values)
    print("Final Data Preview:\n", df.head())  # Debugging step

    # Player names are already Python (Unicode) strings, so no re-encoding is
    # needed before writing; an encode/decode round trip would be a no-op.

    # Save with utf-8-sig (UTF-8 with a byte-order mark) so that accents and
    # the original spelling of names are preserved when the CSV is opened.
    df.to_csv("Transfermarkt_values.csv", index=False, encoding="utf-8-sig")
    print("Data saved to Transfermarkt_values.csv!")

# Run the full scrape only when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()


Fetching players for Barcelona...
Extracted 12 players from Barcelona: {'Lamine Yamal': {'id': '937958', 'position': 'Right Winger'}, 'Gavi': {'id': '646740', 'position': 'Central Midfield'}, 'Pau Cubarsí': {'id': '962110', 'position': 'Centre-Back'}, 'Dani Olmo': {'id': '293385', 'position': 'Attacking Midfield'}, 'Alejandro Balde': {'id': '636688', 'position': 'Left-Back'}, 'Fermín López': {'id': '636703', 'position': 'Central Midfield'}, 'Marc Casadó': {'id': '636695', 'position': 'Defensive Midfield'}, 'Eric García': {'id': '466794', 'position': 'Centre-Back'}, 'Iñaki Peña': {'id': '283170', 'position': 'Goalkeeper'}, 'Ansu Fati': {'id': '466810', 'position': 'Left Winger'}, 'Héctor Fort': {'id': '937955', 'position': 'Right-Back'}, 'Marc Bernal': {'id': '1018920', 'position': 'Defensive Midfield'}}
Fetching market value data for Lamine Yamal (Barcelona)...
Fetching market value data for Gavi (Barcelona)...
Fetching market value data for Pau Cubarsí (Barcelona)...
Fetching market v

In [18]:
# Collect every distinct position label that appears across all team pages.
all_positions = set()
for team_name, team_url in TEAM_PAGES.items():
    players = get_player_ids(team_name, team_url)
    # Fold this squad's positions into the running set in one step.
    all_positions.update(details['position'] for details in players.values())

print("Unique Positions:", all_positions)

Fetching players for Barcelona...
Extracted 12 players from Barcelona: {'Lamine Yamal': {'id': '937958', 'position': 'Right Winger'}, 'Gavi': {'id': '646740', 'position': 'Central Midfield'}, 'Pau Cubarsí': {'id': '962110', 'position': 'Centre-Back'}, 'Dani Olmo': {'id': '293385', 'position': 'Attacking Midfield'}, 'Alejandro Balde': {'id': '636688', 'position': 'Left-Back'}, 'Fermín López': {'id': '636703', 'position': 'Central Midfield'}, 'Marc Casadó': {'id': '636695', 'position': 'Defensive Midfield'}, 'Eric García': {'id': '466794', 'position': 'Centre-Back'}, 'Iñaki Peña': {'id': '283170', 'position': 'Goalkeeper'}, 'Ansu Fati': {'id': '466810', 'position': 'Left Winger'}, 'Héctor Fort': {'id': '937955', 'position': 'Right-Back'}, 'Marc Bernal': {'id': '1018920', 'position': 'Defensive Midfield'}}
Fetching players for Arsenal...
Extracted 3 players from Arsenal: {'Bukayo Saka': {'id': '433177', 'position': 'Right Winger'}, 'Ethan Nwaneri': {'id': '890719', 'position': 'Right Wing