# Download C Images from S3

This notebook dynamically searches and downloads C images (ending with _C.png) from S3 buckets based on tenant and SID ranges defined in img_folders.json.

The script will:
- Read tenant and SID ranges from img_folders.json
- Search for images ending with _C.png in the specified S3 paths
- Download found images with proper error handling
- Rename images to a flat structure: tenant_sid_filename_C.png

In [37]:
import boto3
import os
import json
from pathlib import Path
from botocore.exceptions import ClientError, NoCredentialsError
import time
from typing import List, Dict, Tuple

In [38]:
# Configuration
AWS_PROFILE = "s3user"  # Set your AWS profile here
S3_BUCKET = "rcs-production-resources-profile-images"
IMG_FOLDERS_JSON = "img_folders.json"
LOCAL_DOWNLOAD_PATH = "./downloads/center_images"
DOWNLOAD_LIMIT = 250000  # Maximum number of images to download
TARGET_SUFFIX = "_C.png"  # Only download images ending with this suffix
SAMPLING_RATIO = 3  # Download every 3rd image (calculated for ~200k-250k total images)
TARGET_TOTAL_IMAGES = 225000  # Target number of images (middle of 200k-250k range)

In [39]:
# Create boto3 session with specified profile
try:
    session = boto3.Session(profile_name=AWS_PROFILE)
    s3_client = session.client('s3')
    print(f"Connected to S3 with profile: {AWS_PROFILE}")
except NoCredentialsError:
    print(f"Error: AWS credentials not found for profile '{AWS_PROFILE}'")
    raise
except Exception as e:
    print(f"Error connecting to S3: {e}")
    raise

Connected to S3 with profile: s3user


In [40]:
# Create local download directory
Path(LOCAL_DOWNLOAD_PATH).mkdir(parents=True, exist_ok=True)
print(f"Download directory: {LOCAL_DOWNLOAD_PATH}")

Download directory: ./downloads/center_images


In [41]:
# Load tenant and SID ranges from img_folders.json
def load_img_folders_config(json_file_path: str) -> Dict[str, List[List[int]]]:
    """
    Load tenant and SID range configuration from JSON file.
    Handles multiple ranges per tenant by collecting all ranges.
    
    Args:
        json_file_path: Path to the img_folders.json file
    
    Returns:
        Dictionary with tenant names as keys and list of [start_sid, end_sid] ranges as values
    """
    try:
        with open(json_file_path, 'r') as f:
            content = f.read()
        
        # Parse JSON manually to handle duplicate keys
        tenant_ranges = {}
        
        # Simple JSON parsing for this specific format
        import re
        pattern = r'"([^"]+)"\s*:\s*\[(\d+),\s*(\d+)\]'
        matches = re.findall(pattern, content)
        
        for tenant, start_sid, end_sid in matches:
            start_sid, end_sid = int(start_sid), int(end_sid)
            if tenant not in tenant_ranges:
                tenant_ranges[tenant] = []
            tenant_ranges[tenant].append([start_sid, end_sid])
        
        # Count total ranges and unique tenants
        total_ranges = sum(len(ranges) for ranges in tenant_ranges.values())
        print(f"Loaded {total_ranges} ranges for {len(tenant_ranges)} unique tenants")
        
        # Show tenants with multiple ranges
        multi_range_tenants = {t: r for t, r in tenant_ranges.items() if len(r) > 1}
        if multi_range_tenants:
            print("Tenants with multiple ranges:")
            for tenant, ranges in multi_range_tenants.items():
                print(f"  {tenant}: {ranges}")
        
        return tenant_ranges
    except FileNotFoundError:
        print(f"Error: Configuration file {json_file_path} not found")
        raise
    except Exception as e:
        print(f"Error loading configuration: {e}")
        raise

# Load the configuration
img_folders_config = load_img_folders_config(IMG_FOLDERS_JSON)

Loaded 12 ranges for 10 unique tenants
Tenants with multiple ranges:
  vbz: [[3279, 3740], [2693, 3043]]
  cts: [[56, 123], [1, 47]]


In [42]:
def list_s3_objects_in_prefix(s3_client, bucket: str, prefix: str) -> List[str]:
    """
    List all objects in S3 bucket with the given prefix.
    
    Args:
        s3_client: Boto3 S3 client
        bucket: S3 bucket name
        prefix: S3 prefix to search in
    
    Returns:
        List of S3 object keys
    """
    try:
        objects = []
        paginator = s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)
        
        for page in page_iterator:
            if 'Contents' in page:
                for obj in page['Contents']:
                    objects.append(obj['Key'])
        
        return objects
    except ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchBucket':
            print(f"  ✗ Bucket {bucket} does not exist")
        else:
            print(f"  ✗ Error listing objects in {prefix}: {e}")
        return []
    except Exception as e:
        print(f"  ✗ Unexpected error listing objects in {prefix}: {e}")
        return []

In [43]:
def find_c_images_for_tenant_sid(s3_client, bucket: str, tenant: str, sid: int) -> List[str]:
    """
    Find all images ending with _C.png for a specific tenant and SID.
    
    Args:
        s3_client: Boto3 S3 client
        bucket: S3 bucket name
        tenant: Tenant name
        sid: SID number
    
    Returns:
        List of S3 keys for C images
    """
    prefix = f"{tenant}/{sid}/"
    all_objects = list_s3_objects_in_prefix(s3_client, bucket, prefix)
    
    # Filter for images ending with _C.png
    c_images = [obj for obj in all_objects if obj.endswith(TARGET_SUFFIX)]
    
    return c_images

In [44]:
def download_file_from_s3(s3_client, bucket: str, s3_key: str, local_path: str) -> bool:
    """
    Download a file from S3 to local storage.
    
    Args:
        s3_client: Boto3 S3 client
        bucket: S3 bucket name
        s3_key: S3 object key
        local_path: Local file path to save to
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        
        # Check if file already exists
        if os.path.exists(local_path):
            return True
        
        s3_client.download_file(bucket, s3_key, local_path)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            print(f"  ✗ Failed: {s3_key} not found in bucket {bucket}")
        elif e.response['Error']['Code'] == '403':
            print(f"  ✗ Access denied: {s3_key}")
        else:
            print(f"  ✗ Error downloading {s3_key}: {e}")
        return False
    except Exception as e:
        print(f"  ✗ Unexpected error downloading {s3_key}: {e}")
        return False

In [45]:
def construct_local_filename(s3_key: str) -> str:
    """
    Construct local filename from S3 key.
    Example: ava/126/c1_0000000160_C.png -> ava_126_c1_0000000160_C.png
    
    Args:
        s3_key: S3 object key
    
    Returns:
        Local filename
    """
    parts = s3_key.split('/')
    if len(parts) >= 3:
        tenant = parts[0]
        sid = parts[1]
        filename = parts[2]
        return f"{tenant}_{sid}_{filename}"
    else:
        # Fallback: use the original filename
        return os.path.basename(s3_key)

In [46]:
# Main download process
downloaded_files = []
failed_downloads = []
empty_folders = []
processed_folders = 0
total_folders = 0

print(f"Starting download of C images...")
print(f"Target suffix: {TARGET_SUFFIX}")
print(f"Local download path: {LOCAL_DOWNLOAD_PATH}")
print("-" * 80)

# Calculate total number of folders to process
for tenant, sid_ranges in img_folders_config.items():
    for sid_range in sid_ranges:
        start_sid, end_sid = sid_range
        total_folders += (end_sid - start_sid + 1)

print(f"Total folders to process: {total_folders}")
print("-" * 80)

start_time = time.time()

try:
    for tenant, sid_ranges in img_folders_config.items():
        tenant_downloaded = 0
        tenant_failed = 0
        tenant_empty = 0
        
        for sid_range in sid_ranges:
            start_sid, end_sid = sid_range
            
            for sid in range(start_sid, end_sid + 1):
                processed_folders += 1
                
                # Find C images for this tenant/SID combination
                c_images = find_c_images_for_tenant_sid(s3_client, S3_BUCKET, tenant, sid)
                
                if not c_images:
                    empty_folders.append(f"{tenant}/{sid}")
                    tenant_empty += 1
                    continue
                
                # Apply sampling ratio
                sampled_c_images = c_images[::SAMPLING_RATIO]
                
                # Download each sampled C image
                for s3_key in sampled_c_images:
                    local_filename = construct_local_filename(s3_key)
                    local_file_path = os.path.join(LOCAL_DOWNLOAD_PATH, local_filename)
                    
                    if download_file_from_s3(s3_client, S3_BUCKET, s3_key, local_file_path):
                        downloaded_files.append({
                            's3_key': s3_key,
                            'local_path': local_file_path,
                            'tenant': tenant,
                            'sid': sid
                        })
                        tenant_downloaded += 1
                    else:
                        failed_downloads.append({
                            's3_key': s3_key,
                            'tenant': tenant,
                            'sid': sid
                        })
                        tenant_failed += 1
                    
                    # Apply download limit if set
                    if DOWNLOAD_LIMIT and len(downloaded_files) >= DOWNLOAD_LIMIT:
                        break
                
                # Progress update every 100 folders
                if processed_folders % 100 == 0:
                    print(f"Progress: {processed_folders}/{total_folders} folders processed, {len(downloaded_files)} images downloaded")
                
                if DOWNLOAD_LIMIT and len(downloaded_files) >= DOWNLOAD_LIMIT:
                    break
            
            if DOWNLOAD_LIMIT and len(downloaded_files) >= DOWNLOAD_LIMIT:
                break
        
        # Tenant summary
        print(f"Tenant {tenant}: {tenant_downloaded} downloaded, {tenant_failed} failed, {tenant_empty} empty")
        
        if DOWNLOAD_LIMIT and len(downloaded_files) >= DOWNLOAD_LIMIT:
            print(f"Download limit reached ({DOWNLOAD_LIMIT})")
            break

except KeyboardInterrupt:
    print("\nDownload interrupted by user")
except Exception as e:
    print(f"\nError during download: {e}")

Starting download of C images...
Target suffix: _C.png
Local download path: ./downloads/center_images
--------------------------------------------------------------------------------
Total folders to process: 1803
--------------------------------------------------------------------------------
Progress: 100/1803 folders processed, 15617 images downloaded
Progress: 100/1803 folders processed, 15617 images downloaded
Tenant bvb: 27024 downloaded, 0 failed, 47 empty
Tenant bvb: 27024 downloaded, 0 failed, 47 empty
Progress: 200/1803 folders processed, 45004 images downloaded
Progress: 200/1803 folders processed, 45004 images downloaded
Progress: 400/1803 folders processed, 64514 images downloaded
Progress: 400/1803 folders processed, 64514 images downloaded
Tenant vbz: 46258 downloaded, 0 failed, 603 empty
Tenant vbz: 46258 downloaded, 0 failed, 603 empty
Tenant cts: 30913 downloaded, 0 failed, 69 empty
Tenant cts: 30913 downloaded, 0 failed, 69 empty
Progress: 1100/1803 folders processed

In [47]:
# Download summary
end_time = time.time()
duration = end_time - start_time

print("\n" + "=" * 80)
print("DOWNLOAD SUMMARY")
print("=" * 80)
print(f"Total execution time: {duration:.2f} seconds")
print(f"Total folders processed: {processed_folders}/{total_folders}")
print(f"Total images downloaded: {len(downloaded_files)}")
print(f"Failed downloads: {len(failed_downloads)}")
print(f"Empty folders: {len(empty_folders)}")

if downloaded_files:
    # Show breakdown by tenant
    tenant_counts = {}
    for file_info in downloaded_files:
        tenant = file_info['tenant']
        tenant_counts[tenant] = tenant_counts.get(tenant, 0) + 1
    
    print("\nDownloads by tenant:")
    for tenant, count in sorted(tenant_counts.items()):
        print(f"  {tenant}: {count} images")

if duration > 0:
    print(f"\nDownload rate: {len(downloaded_files)/duration:.2f} images/second")

print(f"\nFiles saved to: {LOCAL_DOWNLOAD_PATH}")


DOWNLOAD SUMMARY
Total execution time: 36893.18 seconds
Total folders processed: 1435/1803
Total images downloaded: 250000
Failed downloads: 0
Empty folders: 820

Downloads by tenant:
  ava: 21980 images
  bernmobil: 16421 images
  bvb: 27024 images
  cts: 30913 images
  gent: 23456 images
  gvb: 18651 images
  retm: 65297 images
  vbz: 46258 images

Download rate: 6.78 images/second

Files saved to: ./downloads/center_images


In [48]:
# Save download log
log_file = os.path.join(LOCAL_DOWNLOAD_PATH, 'download_log.json')
download_log = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'total_downloaded': len(downloaded_files),
    'total_failed': len(failed_downloads),
    'total_empty_folders': len(empty_folders),
    'execution_time_seconds': duration,
    'download_limit': DOWNLOAD_LIMIT,
    'target_suffix': TARGET_SUFFIX,
    'tenant_summary': {tenant: count for tenant, count in sorted(tenant_counts.items())} if downloaded_files else {},
    'empty_folders': empty_folders[:50] if len(empty_folders) > 50 else empty_folders  # Limit to first 50
}

try:
    with open(log_file, 'w') as f:
        json.dump(download_log, f, indent=2)
    print(f"\nDownload log saved to: {log_file}")
except Exception as e:
    print(f"\nWarning: Could not save download log: {e}")


Download log saved to: ./downloads/center_images/download_log.json
