the example script here runs the command
```
./sagmaker_submit_dir/run_scraper.py --from-id 8627380 --to-id 8627391 --local-dir danbooru_downloads --upload-dir s3://dataset-ingested/danbooru
```
on a sagemaker ml.m5.xlarge instance. The script downloads images from danbooru and uploads them to an s3 bucket

In [1]:
from sagemaker.pytorch import PyTorch

def launch_scraper_job(start_id: int, end_id: int, local_dir: str, upload_dir: str, 
                       instance_type: str = "ml.m5.xlarge", max_run: int = 7200):
    """
    Launch a single SageMaker job to run the scraper script.

    Args:
        start_id (int): Starting post ID for the scraper.
        end_id (int): Ending post ID for the scraper.
        local_dir (str): Local directory to store scraped data.
        upload_dir (str): S3 bucket URI to upload the scraped data.
        instance_type (str): SageMaker instance type to use.
        max_run (int): Maximum run time in seconds (default: 2 hours).
    """
    # Define hyperparameters
    hyperparameters = {
        'from-id': start_id,
        'to-id': end_id,
        'local-dir': local_dir,
        'upload-dir': upload_dir,
    }

    # Estimator configuration
    estimator = PyTorch(
        entry_point='run_scraper.py',
        source_dir='/home/ubuntu/danbooru-scraper/notebooks/sagmaker_submit_dir',
        role='sagemaker_training_execution_role',  # Replace with your SageMaker role ARN
        instance_count=1,
        instance_type=instance_type,
        image_uri='763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.1-cpu-py310',
        hyperparameters=hyperparameters,
        max_run=max_run,
        volume_size=50,  # Adjust based on your storage needs
    )

    # Job name for tracking
    job_name = f"scraper-job-{start_id}-{end_id}"

    # Launch the job
    print(f"Launching job: {job_name}")
    estimator.fit(wait=False, job_name=job_name)
    print(f"Job {job_name} completed.")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [7]:
launch_scraper_job(
    start_id=8627380,
    end_id=8627391,
    local_dir='danbooru_downloads',
    upload_dir='s3://dataset-ingested/danbooru'
)

Launching job: scraper-job-8627380-8627391


2024-12-28 07:04:57 Starting - Starting the training job...
2024-12-28 07:05:12 Starting - Preparing the instances for training...
2024-12-28 07:05:48 Downloading - Downloading the training image......Job scraper-job-8627380-8627391 completed.


In [None]:
import math
def launch_danbooru_scrape_jobs(from_id: int, to_id: int, n_jobs: int, upload_dir: str):
    """
    Launch multiple SageMaker jobs to distribute the scraping workload.

    Args:
        from_id (int): Starting post ID for the scraper.
        to_id (int): Ending post ID for the scraper.
        n_jobs (int): Number of jobs to divide the workload.
        upload_dir (str): S3 bucket URI to upload the scraped data.
    """
    # Calculate the range of IDs each job will handle
    total_posts = to_id - from_id + 1
    posts_per_job = math.ceil(total_posts / n_jobs)

    for i in range(n_jobs):
        start_id = from_id + i * posts_per_job
        end_id = min(start_id + posts_per_job - 1, to_id)

        local_dir = f'danbooru_downloads_job_{i + 1}'

        print(f"Launching job for IDs {start_id} to {end_id}")
        launch_scraper_job(start_id, end_id, local_dir, upload_dir)

FROM_ID = 0
TO_ID = 8627626
N_JOBS = 80
UPLOAD_DIR = 's3://unidataset-danbooru/metadata/20241228_rescrape_full/'
launch_danbooru_scrape_jobs(FROM_ID, TO_ID, N_JOBS, UPLOAD_DIR)

In [3]:
import boto3
def terminate_scraper_jobs(name_contains: str = "scraper-job"):
    """
    Terminates all SageMaker training jobs with names starting with 'scraper-job'.
    """
    sagemaker_client = boto3.client('sagemaker')

    # List all training jobs
    response = sagemaker_client.list_training_jobs(
        NameContains=name_contains,
        StatusEquals="InProgress",
    )

    training_jobs = response.get("TrainingJobSummaries", [])

    for job in training_jobs:
        job_name = job["TrainingJobName"]
        print(f"Stopping job: {job_name}")
        sagemaker_client.stop_training_job(TrainingJobName=job_name)

terminate_scraper_jobs()

Stopping job: scraper-job-8519834-8627626
Stopping job: scraper-job-8411988-8519833
Stopping job: scraper-job-8304142-8411987
Stopping job: scraper-job-8196296-8304141
Stopping job: scraper-job-8088450-8196295
Stopping job: scraper-job-7980604-8088449
Stopping job: scraper-job-7872758-7980603
Stopping job: scraper-job-7764912-7872757
Stopping job: scraper-job-7657066-7764911
Stopping job: scraper-job-7549220-7657065
