In [1]:
import kagglehub

# Kaggle handle of the dataset to fetch (latest published version).
DATASET_HANDLE = "ashery/chexpert"

# `path` is the local dataset location; later cells read it as the source root.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'chexpert' dataset.
Path to dataset files: /kaggle/input/chexpert


In [25]:
import os
import pandas as pd
import shutil
from pathlib import Path

def create_dataset_subset(
    csv_path='train.csv',
    output_csv_path='train.csv',
    source_dir='.',
    dest_dir='/content/dataset_subset',
    num_patients=100,
    path_prefix='CheXpert-v1.0-small/'
):
    """Build a small copy of the dataset limited to the first N patients.

    Reads the label CSV, keeps every row whose 'Path' value belongs to one of
    the first ``num_patients`` distinct patient IDs (in order of first
    appearance), copies the referenced images under ``dest_dir`` while
    mirroring the relative paths from the CSV, and writes the filtered rows
    to ``output_csv_path`` with the same columns as the input.

    Args:
        csv_path: CSV file to read; must contain a 'Path' column.
        output_csv_path: Where the filtered CSV is written. Its parent
            directory is created when missing. Note that the default equals
            the default ``csv_path``, so calling with defaults overwrites the
            input file with the subset.
        source_dir: Directory that holds the original images.
        dest_dir: Directory that receives the copied images.
        num_patients: Number of distinct patient IDs to keep.
        path_prefix: Leading part of each CSV path that is absent from the
            layout under ``source_dir``; it is stripped when locating the
            source image but kept in the destination path and in the CSV.
            Pass '' or None to disable stripping.

    Returns:
        None. Progress and a summary are printed.
    """
    # 1. Load the original CSV dataset
    print(f"Loading {csv_path}...")
    df = pd.read_csv(csv_path)

    # 2. Extract the patient ID from the 'Path' column
    # Example Path: CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg
    # A regex is used instead of a positional split so the directory depth does
    # not matter. The IDs live in a separate Series, so the frame's schema is
    # never touched (and an existing 'patient_id' column would be preserved).
    patient_ids = df['Path'].str.extract(r'(patient\d+)', expand=False)

    # 3. Get the first 'num_patients' unique patient IDs
    first_n_patients = patient_ids.dropna().unique()[:num_patients]
    print(f"Filtering dataset for the first {len(first_n_patients)} unique patients...")

    # 4. Filter the dataframe (rows without a patient ID never match)
    df_subset = df[patient_ids.isin(first_n_patients)].copy()

    # 5. Copy the image files to the new location mirroring the file structure
    print(f"Copying images to {dest_dir}...")
    copied_count = 0  # images present at the destination after this run
    missing_files = []

    for relative_path in df_subset['Path']:
        # Only a leading prefix is removed; the same text deeper in the path
        # is left alone.
        source_relative = relative_path
        if path_prefix and source_relative.startswith(path_prefix):
            source_relative = source_relative[len(path_prefix):]

        src_path = os.path.join(source_dir, source_relative)
        dst_path = os.path.join(dest_dir, relative_path)

        if not os.path.exists(src_path):
            missing_files.append(src_path)
            continue

        if not os.path.exists(dst_path):
            # Create directories only for files that are actually copied, so
            # missing images do not leave empty folders behind.
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy2(src_path, dst_path)
        # Files already present from an earlier run are counted as well.
        copied_count += 1

    print(f"Copied {copied_count} files.")
    if missing_files:
        print(f"Warning: {len(missing_files)} files were listed in the CSV but not found in the source directory.")

    # 6. Save the new CSV file with exactly the same schema and relative paths
    # The target directory may not exist yet (nothing copied, or a location
    # outside dest_dir), so make sure it is there before writing.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_subset.to_csv(output_csv_path, index=False)
    print(f"Saved subset CSV to {output_csv_path} with {len(df_subset)} records.")


In [26]:
def zip_and_download(folder_path, zip_filename="dataset_subset"):
    """Zips the specified folder and attempts to download it.

    Args:
        folder_path: Directory whose contents are archived.
        zip_filename: Name (or path) of the archive. A trailing '.zip' is
            optional: 'data' and 'data.zip' both produce 'data.zip'.

    Returns:
        None. In Google Colab a browser download is started; elsewhere the
        absolute path of the archive is printed.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Folder to zip is not an existing directory: {folder_path}")

    # make_archive appends the extension itself, so drop one the caller may
    # already have supplied; otherwise the result would be 'name.zip.zip'.
    archive_base = str(zip_filename)
    if len(archive_base) > len('.zip') and archive_base.lower().endswith('.zip'):
        archive_base = archive_base[:-len('.zip')]

    print(f"Zipping '{folder_path}' into '{archive_base}.zip'...")

    # Create the zip archive
    archive_path = shutil.make_archive(archive_base, 'zip', folder_path)
    print("Zip file created successfully.")

    # Attempt to download if running in Google Colab. Only the import is
    # guarded, so an ImportError raised during the download itself is not
    # mistaken for "not running in Colab".
    try:
        from google.colab import files
    except ImportError:
        # Provide the absolute path for local environments
        abs_path = os.path.abspath(archive_path)
        print(f"Local environment detected. Your dataset is ready at:\n{abs_path}")
    else:
        print("Google Colab environment detected. Initiating download...")
        files.download(archive_path)

In [27]:
# The label CSV is read from the top level of the downloaded dataset folder.
path_csv = Path(path) / 'train.csv'

# Images and the filtered CSV both land under /content/dataset_subset.
create_dataset_subset(
    csv_path=str(path_csv),
    output_csv_path='/content/dataset_subset/train.csv',
    source_dir=path,                     # original dataset root folder
    dest_dir='/content/dataset_subset',  # new folder receiving the copied structure
    num_patients=100,
)

Loading /kaggle/input/chexpert/train.csv...
Filtering dataset for the first 100 unique patients...
Copying images to /content/dataset_subset...
Copied 362 files.
Saved subset CSV to /content/dataset_subset/train.csv with 362 records.


In [24]:
# Package the subset folder into a zip archive and hand it over for download.
zip_and_download(
    folder_path='/content/dataset_subset',
    zip_filename='chexpert',
)

Zipping '/content/dataset_subset' into 'chexpert.zip.zip'...
Zip file created successfully.
Google Colab environment detected. Initiating download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>