In [1]:
import requests
from bs4 import BeautifulSoup as bs4
import pandas as pd

# Fetch the Olympedia landing page once to confirm the site is reachable.
base_url = "https://www.olympedia.org/"
# Without a timeout, requests waits forever on a stalled connection and the
# cell never finishes; 30 seconds is generous for a single HTML page.
response = requests.get(base_url, timeout=30)
# Bare last expression: the notebook displays the HTTP status code.
response.status_code

200

In [2]:
# Parse the body of the response fetched above using the lxml parser.
# Note: `bs4` here is the BeautifulSoup class (imported under that alias),
# not the bs4 package itself.
soup = bs4(markup=response.text, features='lxml')
# Bare last expression: the notebook renders the parsed document.
soup

<!DOCTYPE html>
<html>
<head>
<title>Olympedia – Main Page</title>
<meta content="authenticity_token" name="csrf-param"/>
<meta content="lXmaRKLmbFiicbk74owTAWnG7kWztzGQpO8E9Mx4y8LyE/zs3aYN2rVzV8UU6bywA5/f+9UyJZuqc3VwFTMqIQ==" name="csrf-token"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="EN" http-equiv="content-language"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/assets/bootstrap.min-460a43de22fd9534d595e5aea2715cb154560291c9c6401b526e31c86a5ce32d.css" media="all" rel="stylesheet"/>
<link href="/assets/bootstrap-sortable-363d232309d54b549fa85446295ef2b5d290e3f8a49f1a646247340be3705ef9.css" media="all" rel="stylesheet"/>
<link href="/assets/jquery-ui-1.11.4.min-c2c16849482519011be3d591a077786323ba15af8b2b5586ecf274e7a51c4c79.css" media="all" rel="stylesheet"/>
<link href="/assets/lightbox-e29689e123fc27505d2b9d919f43ffcb6fade539cb4670f21c35aa07848105e7.css" media="screen" rel="stylesheet"/>
<link data-tu

In [None]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

def _split_measurements(value):
    """Turn the text of a "Measurements" cell into output-column updates.

    A value of the form "<a> / <b>" (exactly one slash) yields the two
    stripped halves under "Height (cm)" and "Weight (kg)". Any other value
    is kept verbatim under "Measurements" so that nothing is discarded.
    """
    parts = [part.strip() for part in value.split('/')]
    if len(parts) == 2:
        return {"Height (cm)": parts[0], "Weight (kg)": parts[1]}
    return {"Measurements": value}


def scrape_athlete(athlete_id, timeout=30):
    """Fetch one Olympedia athlete page and extract its biodata table.

    Args:
        athlete_id: Identifier interpolated into the
            ``https://www.olympedia.org/athletes/{athlete_id}`` URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys "Athlete ID", "Roles", "Sex", "Full Name",
        "Used Name", "Other Names", "Born", "Died", "Height (cm)",
        "Weight (kg)", "Measurements", "Affiliations" and "NOC". Fields that
        are absent from the page are ``None``; all extracted values are the
        raw cell text (strings). Returns ``None`` when the response status is
        not 200 or the page has no ``div.athlete_bio`` element.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    url = f"https://www.olympedia.org/athletes/{athlete_id}"
    # A timeout prevents one stalled connection from blocking a long crawl.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None  # Page does not exist or error

    soup = BeautifulSoup(response.content, 'html.parser')

    bio_info = soup.find_all('div', class_='athlete_bio')
    if not bio_info:
        return None

    # Row label as shown on the page -> column name in the returned record.
    # "Measurements" is handled separately because it may fan out into two
    # columns.
    label_to_column = {
        "Roles": "Roles",
        "Sex": "Sex",
        "Full name": "Full Name",
        "Used name": "Used Name",
        "Other names": "Other Names",
        "Born": "Born",
        "Died": "Died",
        "Affiliations": "Affiliations",
        "NOC": "NOC",
    }

    # Pre-populate every column so the key order (and therefore the CSV
    # column order downstream) is fixed regardless of which rows are present.
    record = {
        "Athlete ID": athlete_id,
        "Roles": None,
        "Sex": None,
        "Full Name": None,
        "Used Name": None,
        "Other Names": None,
        "Born": None,
        "Died": None,
        "Height (cm)": None,
        "Weight (kg)": None,
        "Measurements": None,
        "Affiliations": None,
        "NOC": None,
    }

    for bio in bio_info:
        table = bio.find('table', class_='biodata')
        if not table:
            continue

        for row in table.find_all('tr'):
            th = row.find('th')
            td = row.find('td')
            if not th or not td:
                continue  # Not a label/value row

            key = th.get_text(strip=True)
            value = td.get_text(separator=' ', strip=True)

            if key == "Measurements":
                record.update(_split_measurements(value))
            elif key in label_to_column:
                record[label_to_column[key]] = value

    return record

# Main loop to scrape all athletes
all_data = []
start_id = 1
end_id = 50000
save_interval = 500  # Write a checkpoint whenever the athlete ID is a multiple of 500

for i in range(start_id, end_id + 1):
    try:
        print(f"Scraping athlete ID: {i}")
        data = scrape_athlete(i)
        if data:
            all_data.append(data)
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next ID.
        print(f"Error at ID {i}: {e}")

    # The checkpoint lives outside the scraping try-block so that a failed
    # request on a checkpoint ID does not also skip the save.
    if i % save_interval == 0:
        try:
            df = pd.DataFrame(all_data)
            df.to_csv(f"athletes_partial_{i}.csv", index=False)
            print(f"Saved partial data up to ID {i}")
        except Exception as e:
            # A failed checkpoint must not abort the crawl; the data is still
            # held in all_data and will be written by a later save.
            print(f"Error at ID {i}: {e}")

    # Always pause, including after an error, so that a run of failures does
    # not turn into back-to-back requests against the server.
    time.sleep(1.5)  # Be respectful to the server

# Final save
df = pd.DataFrame(all_data)
df.to_csv("all_athletes.csv", index=False)
print("✅ All data saved.")


Scraping athlete ID: 7490
Saved partial data up to ID 7490
Scraping athlete ID: 7491
Scraping athlete ID: 7492
Scraping athlete ID: 7493
Scraping athlete ID: 7494
Scraping athlete ID: 7495
Scraping athlete ID: 7496
Scraping athlete ID: 7497
Scraping athlete ID: 7498
Scraping athlete ID: 7499
Scraping athlete ID: 7500
Saved partial data up to ID 7500
Scraping athlete ID: 7501
Scraping athlete ID: 7502
Scraping athlete ID: 7503
Scraping athlete ID: 7504
Scraping athlete ID: 7505
Scraping athlete ID: 7506
Scraping athlete ID: 7507
Scraping athlete ID: 7508
Scraping athlete ID: 7509
Scraping athlete ID: 7510
Saved partial data up to ID 7510
Scraping athlete ID: 7511
Scraping athlete ID: 7512
Scraping athlete ID: 7513
Scraping athlete ID: 7514
Scraping athlete ID: 7515
Scraping athlete ID: 7516
Scraping athlete ID: 7517
Scraping athlete ID: 7518
Scraping athlete ID: 7519
Scraping athlete ID: 7520
Saved partial data up to ID 7520
Scraping athlete ID: 7521
Scraping athlete ID: 7522
Scraping a