# Scraping Transfermarkt Data with BeautifulSoup

## Setup

Before running the script, make sure `beautifulsoup4` is installed (the code below also imports `requests` and `pandas`). If it is missing, install it from a notebook cell with:
```python
!pip install beautifulsoup4
```


In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

## 1. Define Headers and URL
To prevent getting blocked, we set a **User-Agent** string that mimics a real browser:
```python
# Headers sent with every request; the User-Agent mimics a real browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [23]:
# Headers sent with every request; the User-Agent mimics a real browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

# Listing endpoint that every page request is issued against.
base_url = (
    'https://www.transfermarkt.com'
    '/spieler-statistik/wertvollstespieler/marktwertetop'
)

# Query-string values sent with every request; fetch_page adds the 'page' key.
params = {
    'land_id': '0',
    'ausrichtung': 'alle',
    'spielerposition_id': 'alle',
    'altersklasse': 'alle',
    'jahrgang': '0',
    'kontinent_id': '0',
    'plus': '1',
}

## 2. Fetching Web Pages
To request each page, we define a function:
```python
def fetch_page(page):
    """Download one page of the listing and return its HTML.

    Args:
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200.
        ``None`` (after printing a message) for any other status code, or
        when the request fails at the network level or times out.
    """
    # Build a per-call copy so the shared ``params`` dict is not mutated.
    query = {**params, 'page': page}
    try:
        # Without a timeout, requests can wait forever on a stalled server.
        response = requests.get(base_url, params=query, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page}: {exc}")
        return None
    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch page {page}")
    return None


In [24]:

# Accumulates the per-player dicts returned by parse_page across all pages.
all_players = []

def fetch_page(page):
    """Download one page of the listing and return its HTML.

    Args:
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200.
        ``None`` (after printing a message) for any other status code, or
        when the request fails at the network level or times out.
    """
    # Build a per-call copy so the shared ``params`` dict is not mutated.
    query = {**params, 'page': page}
    try:
        # Without a timeout, requests can wait forever on a stalled server.
        response = requests.get(base_url, params=query, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page}: {exc}")
        return None
    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch page {page}")
    return None

def _cell_text(cols, index):
    """Return the stripped text of ``cols[index]``, or "" when the row is too short."""
    return cols[index].get_text(strip=True) if index < len(cols) else ""


def _first_img_alt_or_text(cell):
    """Return the ``alt`` of the cell's first ``<img>``, else the cell's stripped text."""
    img = cell.find("img")
    return img["alt"] if img is not None else cell.get_text(strip=True)


def parse_page(html):
    """Extract one record per player row from a listing page.

    Args:
        html: HTML text of a single results page.

    Returns:
        A list of dicts, one for each ``<tr>`` with class ``odd`` or ``even``
        inside the ``<table class="items">`` element. All values are strings.
        Returns an empty list when the page contains no such table. Rows with
        fewer than 15 ``<td>`` cells are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='items')
    if table is None:
        return []

    players = []
    for row in table.find_all('tr', class_=['odd', 'even']):
        # find_all is recursive, so cells of any nested table are counted too;
        # the indices below are positions in that flattened list.
        cols = row.find_all('td')
        if len(cols) < 15:
            continue

        # With two or more images the last one is taken as the second
        # nationality; otherwise "-" is used as the placeholder.
        flags = cols[6].find_all("img")
        secondary_nat = flags[-1]["alt"] if len(flags) > 1 else "-"

        players.append({
            'Name': _cell_text(cols, 3),
            'Position': _cell_text(cols, 4),
            'Club': _first_img_alt_or_text(cols[7]),
            'Market Value': _cell_text(cols, 8),
            'Age': _cell_text(cols, 5),
            'Primary Nationality': _first_img_alt_or_text(cols[6]),
            'Secondary Nationality': secondary_nat,
            'Matches Played': _cell_text(cols, 9),
            'Goals': _cell_text(cols, 10),
            'Assists': _cell_text(cols, 12),
            'Yellow Cards': _cell_text(cols, 13),
            'Second Yellow Cards': cols[14].text.strip(),
            'Red Cards': _cell_text(cols, 15),
            'Substituted In': _cell_text(cols, 16),
            # Previously read cols[14], which duplicated 'Second Yellow Cards'.
            # NOTE(review): index 17 is assumed to be the cell right after
            # 'Substituted In' (16) - confirm against the live markup.
            'Substituted Out': _cell_text(cols, 17),
            'Own Goals': cols[11].text.strip(),
        })

    return players

# Crawl result pages 1 through 20 and pool every parsed player record.
for page in range(1, 21):
    print(f"Processing page {page}...")
    html = fetch_page(page)
    if html:
        # A page that failed to download (None) is skipped, not retried.
        players = parse_page(html)
        all_players += players
    # Fixed one-second pause after each page.
    sleep(1)



Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...


## 3. Saving Scraped Data to CSV with Pandas

After collecting player data, we store it in a **Pandas DataFrame** for structured handling:
```python
# One row per collected player dict; the dict keys become the columns.
df = pd.DataFrame(data=all_players)


In [25]:
# One row per collected player dict; the dict keys become the columns.
df = pd.DataFrame(data=all_players)

# Write the table without the index column, encoded as UTF-8 with a BOM.
df.to_csv(path_or_buf='collected.csv', encoding='utf-8-sig', index=False)

print("Data collection complete. Saved to collected.csv")

Data collection complete. Saved to collected.csv
