In [1]:
import logging
import re
import sys
import time
from pathlib import Path

import numpy as np
import rasterio
import rasterio.mask
from tqdm import tqdm

sys.path.insert(0, '/path/to/civicpulse-ai_org')
from src.region_manager import ConfigurableBoundaryManager

# Create the log directory first: the progress log configured below lives in it.
log_dir = Path("logs")
log_dir.mkdir(parents=True, exist_ok=True)

# Route INFO-and-above records to a progress log. The file path is derived
# from log_dir so the directory name is defined in exactly one place.
# NOTE(review): logging.basicConfig does nothing if the root logger already
# has handlers (e.g. configured earlier in the same kernel) -- confirm the
# log file is actually being written in your environment.
logging.basicConfig(
    filename=str(log_dir / 'clip_india_progress.log'),
    level=logging.INFO,
    format='%(asctime)s - %(message)s'
)

# Banner shown before the run starts.
# NOTE(review): the recorded output of the clipping cell shows ~5 s total and
# ~19 MB per year for the 1 km aggregated rasters, so the "8-12 hour" and
# "~2.5GB per year" estimates below look stale -- confirm against the dataset
# actually in use before relying on them.
print("="*70)
print("CLIPPING FULL INDIA - 8-12 HOUR OPERATION")
print("="*70)
print("⏰ Best run: Overnight")
print("📊 Expected output: ~2.5GB per year\n")

CLIPPING FULL INDIA - 8-12 HOUR OPERATION
⏰ Best run: Overnight
📊 Expected output: ~2.5GB per year



In [2]:
# Resolve the India region from the project's boundary manager.
mgr = ConfigurableBoundaryManager()
india = mgr.get_region('India')

# NOTE(review): the recorded output of this cell reports 9,493,372 km² (and an
# identical grid-cell count). India's area is roughly 3.29 million km², so the
# region geometry or its area computation should be verified.
print(f"Processing: {india.area_km2:,.0f} km²")
print(f"Expected grid cells: {india.grid_cell_count():,}")

# Discover the raw WorldPop rasters, sorted by file name.
worldpop_dir = Path('data/raw/worldpop')
files = sorted(worldpop_dir.glob('ind_ppp_*.tif'))

# Fail fast: with no inputs, every later cell would silently do nothing.
if not files:
    raise FileNotFoundError(
        f"No WorldPop rasters matching 'ind_ppp_*.tif' found in {worldpop_dir.resolve()}"
    )

# Take the first 20xx token in each file stem as that file's year; files
# without such a token are ignored. A set collapses duplicates so a year is
# listed once even when several files carry it.
year_matches = (re.search(r'(20\d{2})', f.stem) for f in files)
available_years = sorted({int(m.group(1)) for m in year_matches if m})

print(f"Years to process: {available_years}")

Processing: 9,493,372 km²
Expected grid cells: 9,493,372
Years to process: [2000, 2005, 2010, 2015, 2020]


In [3]:
def _clip_raster(src_path, geometry, dst_path):
    """Clip one raster to a geometry and write the result to ``dst_path``.

    Args:
        src_path: Path of the raster to read.
        geometry: A single shape passed to ``rasterio.mask.mask``.
            NOTE(review): assumed to be in the same CRS as the raster --
            nothing here reprojects it; confirm against the region manager.
        dst_path: ``pathlib.Path`` of the raster to create (overwritten if it
            already exists).

    Returns:
        Tuple ``(shape, size_mb)``: the clipped array's shape and the size of
        the written file in megabytes (bytes / 1e6).
    """
    with rasterio.open(src_path) as src:
        clipped, transform = rasterio.mask.mask(src, [geometry], crop=True)

        # Reuse the source profile, overriding only what cropping changed.
        profile = src.profile
        profile.update(
            transform=transform,
            width=clipped.shape[2],
            height=clipped.shape[1],
        )

    with rasterio.open(dst_path, 'w', **profile) as dst:
        dst.write(clipped)

    return clipped.shape, dst_path.stat().st_size / 1e6


print("\n" + "="*70)
print("CLIPPING WORLDPOP DATA")
print("="*70)

# Map each year to its source raster. When several files carry the same year
# the last one (in sorted file order) wins, and the collision is reported
# instead of being silently overwritten.
year_file_map = {}
for f in files:
    match = re.search(r'(20\d{2})', f.stem)
    if not match:
        continue
    year = int(match.group(1))
    if year in year_file_map:
        duplicate_msg = (
            f"{year}: multiple source files, using {f.name} "
            f"instead of {year_file_map[year].name}"
        )
        print(f"  Warning: {duplicate_msg}")
        logging.warning(duplicate_msg)
    year_file_map[year] = f

print("Detected files:", year_file_map)

# The output directory is the same for every year, so create it once.
output_dir = Path('data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

failed_years = []
# perf_counter is monotonic, so the elapsed time is immune to clock changes.
start_time = time.perf_counter()

for year in tqdm(sorted(year_file_map), desc="Clipping years"):
    print(f"\n📥 {year}...")
    output_path = output_dir / f'india_pop_clipped_{year}.tif'

    try:
        shape, file_size_mb = _clip_raster(
            year_file_map[year], india.geometry, output_path
        )
    except Exception as e:
        # Best effort: one bad year must not abort the remaining years, but
        # the failure is remembered and logged with its traceback.
        failed_years.append(year)
        print(f"  ❌ Error: {e}")
        logging.exception(f"{year}: {e}")
    else:
        print(f"  ✅ {shape} → {file_size_mb:.1f} MB")
        logging.info(f"{year}: shape {shape}, size {file_size_mb:.1f} MB")

elapsed = time.perf_counter() - start_time
# Seconds are shown as well: short runs would otherwise read "0.00 hours".
print(f"\n⏱️ Total time: {elapsed/3600:.2f} hours ({elapsed:.1f} s)")
logging.info(f"Complete in {elapsed/3600:.2f} hours ({elapsed:.1f} s)")

if failed_years:
    print(f"❌ Failed years: {failed_years} (tracebacks are in the progress log)")
    logging.error(f"Failed years: {failed_years}")


CLIPPING WORLDPOP DATA
Detected files: {2000: WindowsPath('data/raw/worldpop/ind_ppp_2000_1km_Aggregated.tif'), 2005: WindowsPath('data/raw/worldpop/ind_ppp_2005_1km_Aggregated.tif'), 2010: WindowsPath('data/raw/worldpop/ind_ppp_2010_1km_Aggregated.tif'), 2015: WindowsPath('data/raw/worldpop/ind_ppp_2015_1km_Aggregated.tif'), 2020: WindowsPath('data/raw/worldpop/ind_ppp_2020_1km_Aggregated.tif')}


Clipping years:   0%|                                                                                         | 0/5 [00:00<?, ?it/s]


üì• 2000...


Clipping years:  20%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                                                | 1/5 [00:00<00:03,  1.06it/s]

  ✅ (1, 3193, 3446) → 18.9 MB

üì• 2005...


Clipping years:  40%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                                | 2/5 [00:02<00:03,  1.07s/it]

  ✅ (1, 3193, 3446) → 18.9 MB

üì• 2010...


Clipping years:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                | 3/5 [00:03<00:02,  1.04s/it]

  ✅ (1, 3193, 3446) → 19.0 MB

üì• 2015...


Clipping years:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                | 4/5 [00:04<00:01,  1.04s/it]

  ✅ (1, 3193, 3446) → 19.0 MB

üì• 2020...


Clipping years: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:05<00:00,  1.01s/it]

  ✅ (1, 3193, 3446) → 19.0 MB

⏱️ Total time: 0.00 hours





In [4]:
print("\n" + "="*70)
print("VERIFICATION")
print("="*70)

# Every clipped raster currently on disk. This includes files left over from
# earlier runs, so presence alone does not prove this run produced them.
output_files = sorted(Path('data/processed').glob('india_pop_clipped_*.tif'))
print(f"\nFiles created: {len(output_files)}")

total_size = 0
clipped_years = set()
for f in output_files:
    size_mb = f.stat().st_size / 1e6
    total_size += size_mb
    try:
        # The year is the last underscore-separated token of the file stem.
        year = int(f.stem.split('_')[-1])
    except ValueError:
        # A stray file matching the glob must not crash verification.
        print(f"  {f.name}: {size_mb:.1f} MB (no year in file name)")
        continue
    clipped_years.add(year)
    print(f"  {year}: {size_mb:.1f} MB")

# Sizes are decimal megabytes (bytes / 1e6), so gigabytes are MB / 1000.
print(f"\nTotal size: {total_size:.1f} MB (~{total_size/1000:.1f} GB)")

# Report success only when every year detected in the raw inputs has an
# output file; `available_years` comes from the file-discovery cell.
expected_years = set(available_years)
missing_years = sorted(expected_years - clipped_years)
if not expected_years:
    print("❌ No input years were detected, so nothing was clipped")
elif missing_years:
    print(f"❌ Missing clipped output for years: {missing_years}")
else:
    print(f"✅ All {len(expected_years)} years clipped successfully")


VERIFICATION

Files created: 5
  2000: 18.9 MB
  2005: 18.9 MB
  2010: 19.0 MB
  2015: 19.0 MB
  2020: 19.0 MB

Total size: 94.8 MB (~0.1 GB)
✅ All 5 years clipped successfully
