In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import os


In [None]:
# Parameters
start_id = 1
end_id = 5000
chunk_size = 2500

# Folder to save data
folder_name = "athlete_data_1"
os.makedirs(folder_name, exist_ok=True)

# Loop through athlete IDs in chunks
for chunk_start in range(start_id, end_id + 1, chunk_size):
    chunk_end = min(chunk_start + chunk_size - 1, end_id)
    print(f"\nProcessing athlete IDs from {chunk_start} to {chunk_end}...\n")

    all_data = []

    for athlete_id in range(chunk_start, chunk_end + 1):
        url = f"https://www.olympedia.org/athletes/{athlete_id}"
        print(f"Fetching athlete ID {athlete_id}...")

        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to fetch athlete ID {athlete_id}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='table')

        if not table:
            print(f"No result table found for athlete ID {athlete_id}")
            continue

        rows = table.find_all('tr')

        current_game = ""
        current_discipline = ""
        current_noc = ""
        current_as = ""

        for row in rows[1:]:  # Skip header
            cols = row.find_all('td')

            # If the row has class 'active', it's a new group
            if 'active' in row.get('class', []):
                current_game = cols[0].get_text(strip=True)
                current_discipline = cols[1].get_text(strip=True)
                team_td = cols[2]
                noc_link = team_td.find('a')
                current_noc = noc_link.get_text(strip=True) if noc_link else ""
                current_as = cols[5].get_text(strip=True)
            else:
                event = cols[1].get_text(strip=True) if len(cols) > 1 else ""
                team = cols[2].get_text(strip=True) if len(cols) > 2 else ""
                pos = cols[3].get_text(strip=True) if len(cols) > 3 else ""
                medal = cols[4].get_text(strip=True) if len(cols) > 4 else ""
                as_name = cols[5].get_text(strip=True) if len(cols) > 5 else ""

                all_data.append({
                    "Athlete_ID": athlete_id,
                    "Games": current_game,
                    "Discipline": current_discipline,
                    "Event": event,
                    "Team": team,
                    "NOC": current_noc,
                    "Pos": pos,
                    "Medal": medal,
                    "As": as_name or current_as
                })

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

     # Save to CSV
    output_file = os.path.join(folder_name, f"athletes_{chunk_start}_{chunk_end}.csv")
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Saved chunk to {output_file}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Fetching athlete ID 1057...
Fetching athlete ID 1058...
Fetching athlete ID 1059...
Fetching athlete ID 1060...
Fetching athlete ID 1061...
Fetching athlete ID 1062...
Fetching athlete ID 1063...
Fetching athlete ID 1064...
Fetching athlete ID 1065...
Fetching athlete ID 1066...
Fetching athlete ID 1067...
Fetching athlete ID 1068...
Fetching athlete ID 1069...
Fetching athlete ID 1070...
Fetching athlete ID 1071...
Fetching athlete ID 1072...
Fetching athlete ID 1073...
Fetching athlete ID 1074...
Fetching athlete ID 1075...
Fetching athlete ID 1076...
Fetching athlete ID 1077...
Fetching athlete ID 1078...
Fetching athlete ID 1079...
Fetching athlete ID 1080...
Fetching athlete ID 1081...
Fetching athlete ID 1082...
Fetching athlete ID 1083...
Fetching athlete ID 1084...
Fetching athlete ID 1085...
Fetching athlete ID 1086...
Fetching athlete ID 1087...
Fetching athlete ID 1088...
Fetching athlete ID 1089...
Fetching at

In [4]:
os.listdir()


['.config', 'sample_data']

In [6]:
os.listdir('sample_data')

['README.md',
 'anscombe.json',
 'california_housing_test.csv',
 'mnist_test.csv',
 'mnist_train_small.csv',
 'california_housing_train.csv']

In [None]:
import shutil

# Zip the folder
shutil.make_archive('athlete_data', 'zip', 'athlete_data')




'/content/athlete_data.zip'

In [None]:
# Download the zipped folder
from google.colab import files
files.download('athlete_data.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>