In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import time

A few problems we will have to face:

Fuzzy names - one site will way Darwin Nunez, the other will say D. Nunez

We'll need a layer that accounts for difference in quality of leagues. 

Loans are sometimes treated as transfers. But we don't want that.

Transfer fees can be a nightmare

# This is to download Transfermarkt data

In [2]:


# --- SETUP: HEADERS TO MIMIC A BROWSER ---
# Transfermarkt has strong anti-bot protection. This header is mandatory.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

# --- PART 1: SCRAPE TRANSFERMARKT (Transfers) ---
def get_transfermarkt_transfers(league_id, season_id):
    # URL for Premier League (GB1) transfers
    url = f"https://www.transfermarkt.com/premier-league/transfers/wettbewerb/{league_id}/plus/?saison_id={season_id}"
    
    print(f"Scraping Transfermarkt: {url}...")
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    
    # The data is usually in a div class 'box', but table structures change.
    # We look for the main transfer table.
    transfers = []
    
    # Find all transfer boxes (In/Out for each club)
    clubs = soup.find_all('div', class_='box')
    
    for club_box in clubs:
        # Identify the club name (often in a header)
        club_header = club_box.find('h2', class_='content-box-headline')
        if not club_header: continue
        
        buying_club = club_header.text.strip()
        
        # Find the table inside this box
        table = club_box.find('table', class_='items')
        if not table: continue
        
        rows = table.find_all('tr', class_=['odd', 'even'])
        for row in rows:
            cols = row.find_all('td')
            if len(cols) < 5: continue
            
            # Extract basic data
            # Note: This relies on specific column indices which may shift slightly
            try:
                player_name = row.find('td', class_='hauptlink').text.strip()
                age = row.find_all('td', class_='zentriert')[0].text.strip()
                market_value = row.find_all('td', class_='rechts')[0].text.strip()
                fee = row.find_all('td', class_='rechts')[1].text.strip()
                
                # Check if it's an "Arrival" (In) or "Departure" (Out)
                # Transfermarkt groups them, but usually we filter by column logic or separate tables.
                # For this simple snippet, we assume we grab everything and filter later.
                
                transfers.append({
                    'Buying_Club': buying_club,
                    'Player': player_name,
                    'Age': age,
                    'Market_Value': market_value,
                    'Fee': fee
                })
            except Exception as e:
                continue

    return pd.DataFrame(transfers)

# Run it for Premier League (GB1), Season 2023
df_tm = get_transfermarkt_transfers('GB1', 2023)

# Display the messy raw data
print("Transfermarkt Data Shape:", df_tm.shape)
display(df_tm.head())

Scraping Transfermarkt: https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2023...
Transfermarkt Data Shape: (0, 0)


# Above didn't work. Not sure why.

In [3]:
# 1. Define the URL and Headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
# URL for Premier League Transfers (2023 Season)
url = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2023"

# 2. Request the Page
print(f"Connecting to {url}...")
response = requests.get(url, headers=headers)

# 3. DEBUG: Check if we actually got the page
if response.status_code != 200:
    print(f"BLOCKED: Status Code {response.status_code}")
elif len(response.text) < 5000:
    print("BLOCKED: Page content too short (likely a CAPTCHA page).")
    print(response.text) # Print to see what they sent us
else:
    print("Connection Successful. Parsing tables...")

    # 4. Use Pandas to parse HTML directly
    try:
        # This returns a LIST of all dataframes found on the page
        tables = pd.read_html(response.text)
        
        print(f"Found {len(tables)} tables on the page.")
        
        # Transfermarkt usually splits transfers by club, so you might get 20 small tables.
        # Let's look at the first one to see what we got.
        if len(tables) > 0:
            df_example = tables[0]
            display(df_example.head())
        else:
            print("No tables found. Transfermarkt might have changed the layout.")
            
    except ValueError as e:
        print(f"Pandas parsing error: {e}")

Connecting to https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2023...
Connection Successful. Parsing tables...
Found 41 tables on the page.


  tables = pd.read_html(response.text)


Unnamed: 0,0,1,2
0,Filter by season:,29/30 28/29 27/28 26/27 25/26 24/25 23/24 22/2...,
1,Date of transfer:,doesn't matter Summer Winter,
2,Loans:,All transfers Only include loans Without pla...,
3,,Transfers within the club,


In [5]:
df_example.head()

Unnamed: 0,0,1,2
0,Filter by season:,29/30 28/29 27/28 26/27 25/26 24/25 23/24 22/2...,
1,Date of transfer:,doesn't matter Summer Winter,
2,Loans:,All transfers Only include loans Without pla...,
3,,Transfers within the club,


# FAILURE - We'll have to come back to this

# below, this is to load FBREF data

In [None]:
# --- PART 2: SCRAPE FBREF (Stats) ---
def get_fbref_stats(season_end_year):
    # URL for "Standard Stats"
    url = f"https://fbref.com/en/comps/9/{season_end_year-1}-{season_end_year}/stats/Premier-League-Stats"
    print(f"Scraping FBref: {url}...")
    
    response = requests.get(url, headers=headers)
    
    # Pandas read_html returns a list of all tables on the page
    # The main stats table is usually index 0
    tables = pd.read_html(response.text, header=1) # header=1 because row 0 is a grouper
    df = tables[0]
    
    # Cleanup: FBref repeats headers every 20 rows. Remove them.
    df = df[df['Rk'] != 'Rk']
    
    return df

# Run it for 2023-2024 Season
df_fbref = get_fbref_stats(2024)

print("FBref Data Shape:", df_fbref.shape)
display(df_fbref[['Player', 'Squad', 'Age', 'MP', 'Gls', 'Ast', 'xG']].head())

# the code below will download static datasets

In [None]:
# You need: pip install opendatasets
import opendatasets as od

# This will ask for your Kaggle username and key
od.download("https://www.kaggle.com/datasets/davidcariboo/player-scores")

# Load files
import pandas as pd
transfers_csv = pd.read_csv('player-scores/transfers.csv')
display(transfers_csv.head())