In [1]:
import sys
import os
from pathlib import Path
from datetime import datetime
import numpy as np
import rasterio
import rasterio.mask
import geopandas as gpd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from src.region_manager import ConfigurableBoundaryManager
from src.preprocessing import RegionAwarePreprocessor

print("‚úÖ All imports successful")

‚úÖ All imports successful


In [3]:
mgr = ConfigurableBoundaryManager()

telangana = mgr.get_region('Telangana')
maharashtra = mgr.get_region('Maharashtra')

print(f"üìç Telangana: {telangana.area_km2:,.0f} km¬≤ ({telangana.grid_cell_count():,} cells)")
print(f"üìç Maharashtra: {maharashtra.area_km2:,.0f} km¬≤ ({maharashtra.grid_cell_count():,} cells)")

# Check for WorldPop files
worldpop_dir = Path('data/raw/worldpop')
worldpop_files = sorted(worldpop_dir.glob('ind_ppp_*.tif'))

worldpop_files = sorted(worldpop_dir.glob('*.tif'))

print(f"WorldPop files found: {len(worldpop_files)}")
for f in worldpop_files:
    size_mb = f.stat().st_size / 1e6
    print(f"  {f.name}: {size_mb:.1f} MB")

üìç Telangana: 207,816 km¬≤ (207,816 cells)
üìç Maharashtra: 699,413 km¬≤ (699,412 cells)
WorldPop files found: 5
  ind_ppp_2000_1km_Aggregated.tif: 19.0 MB
  ind_ppp_2005_1km_Aggregated.tif: 19.0 MB
  ind_ppp_2010_1km_Aggregated.tif: 19.0 MB
  ind_ppp_2015_1km_Aggregated.tif: 19.1 MB
  ind_ppp_2020_1km_Aggregated.tif: 19.1 MB


In [13]:
print("\n" + "="*70)
print("CLIPPING TELANGANA DATA")
print("="*70)

tel_data = {}

nodata_value = -99999  # WorldPop nodata

for file_path in worldpop_files:
    fname = file_path.stem  # e.g. ind_ppp_2000_1km_Aggregated
    print(f"\nüì• Processing {fname}...")

    with rasterio.open(file_path) as src:
        clipped, transform = rasterio.mask.mask(
            src,
            [telangana.geometry],
            crop=True
        )

        profile = src.profile
        profile.update(
            transform=transform,
            width=clipped.shape[2],
            height=clipped.shape[1]
        )

        output_tif = Path("data/processed") / f"telangana_{fname}.tif"
        with rasterio.open(output_tif, "w", **profile) as dst:
            dst.write(clipped)

        # Convert to float32 and mask nodata
        clipped_array = clipped[0].astype(np.float32)
        valid_mask = clipped_array != nodata_value
        valid_data = clipped_array[valid_mask]

        # Optionally set nodata cells to 0 for downstream use
        clipped_array[~valid_mask] = 0.0

        output_npy = Path("data/processed") / f"telangana_{fname}.npy"
        np.save(output_npy, clipped_array)

        tel_data[fname] = clipped_array

        print(f"  ‚úÖ Clipped shape: {clipped_array.shape}")
        if valid_data.size > 0:
            print(f"  ‚úÖ Range (valid): {valid_data.min():.0f} - {valid_data.max():.0f} people")
            print(f"  ‚úÖ Total population (valid): {valid_data.sum():,.0f}")
        else:
            print("  ‚ö†Ô∏è No valid data cells after masking")
        print(f"  ‚úÖ Saved to {output_tif.name}")


CLIPPING TELANGANA DATA

üì• Processing ind_ppp_2000_1km_Aggregated...
  ‚úÖ Clipped shape: (517, 469)
  ‚úÖ Range (valid): 1 - 34027 people
  ‚úÖ Total population (valid): 45,907,576
  ‚úÖ Saved to telangana_ind_ppp_2000_1km_Aggregated.tif

üì• Processing ind_ppp_2005_1km_Aggregated...
  ‚úÖ Clipped shape: (517, 469)
  ‚úÖ Range (valid): 0 - 34356 people
  ‚úÖ Total population (valid): 48,334,464
  ‚úÖ Saved to telangana_ind_ppp_2005_1km_Aggregated.tif

üì• Processing ind_ppp_2010_1km_Aggregated...
  ‚úÖ Clipped shape: (517, 469)
  ‚úÖ Range (valid): 0 - 35532 people
  ‚úÖ Total population (valid): 50,955,064
  ‚úÖ Saved to telangana_ind_ppp_2010_1km_Aggregated.tif

üì• Processing ind_ppp_2015_1km_Aggregated...
  ‚úÖ Clipped shape: (517, 469)
  ‚úÖ Range (valid): 0 - 36651 people
  ‚úÖ Total population (valid): 53,875,368
  ‚úÖ Saved to telangana_ind_ppp_2015_1km_Aggregated.tif

üì• Processing ind_ppp_2020_1km_Aggregated...
  ‚úÖ Clipped shape: (517, 469)
  ‚úÖ Range (valid): 0 

In [14]:
print("\n" + "="*70)
print("QUALITY ASSESSMENT - TELANGANA")
print("="*70)

for filename, data in tel_data.items():
    quality = preprocessor.calculate_quality_score(data, region_type='mixed')
    low_quality = preprocessor.identify_low_quality_regions(data)
    
    low_pct = (low_quality.sum() / low_quality.size) * 100
    
    print(f"\n{filename}:")
    print(f"  Quality (mean): {quality.mean():.3f}")
    print(f"  Low-quality cells: {low_quality.sum():,} ({low_pct:.1f}%)")



QUALITY ASSESSMENT - TELANGANA

ind_ppp_2000_1km_Aggregated:
  Quality (mean): 0.811
  Low-quality cells: 14,111 (5.8%)

ind_ppp_2005_1km_Aggregated:
  Quality (mean): 0.803
  Low-quality cells: 14,108 (5.8%)

ind_ppp_2010_1km_Aggregated:
  Quality (mean): 0.787
  Low-quality cells: 14,098 (5.8%)

ind_ppp_2015_1km_Aggregated:
  Quality (mean): 0.761
  Low-quality cells: 14,119 (5.8%)

ind_ppp_2020_1km_Aggregated:
  Quality (mean): 0.758
  Low-quality cells: 14,133 (5.8%)


In [15]:
print("\n" + "="*70)
print("INTERPOLATING MISSING DATA")
print("="*70)

tel_interp = {}

for fname, data in tel_data.items():
    print(f"\nInterpolating {fname}...")
    
    low_quality = preprocessor.identify_low_quality_regions(data)
    data_filled = preprocessor.adaptive_interpolation(data, low_quality)
    
    tel_interp[fname] = data_filled
    
    output_npy = Path('data/processed') / f'telangana_interp_{fname}.npy'
    np.save(output_npy, data_filled)
    
    print(f"  ‚úÖ Saved interpolated data")


INTERPOLATING MISSING DATA

Interpolating ind_ppp_2000_1km_Aggregated...
  ‚úÖ Saved interpolated data

Interpolating ind_ppp_2005_1km_Aggregated...
  ‚úÖ Saved interpolated data

Interpolating ind_ppp_2010_1km_Aggregated...
  ‚úÖ Saved interpolated data

Interpolating ind_ppp_2015_1km_Aggregated...
  ‚úÖ Saved interpolated data

Interpolating ind_ppp_2020_1km_Aggregated...
  ‚úÖ Saved interpolated data


In [10]:
import rasterio
import numpy as np
from pathlib import Path
import re

worldpop_dir = Path('data/raw/worldpop')
worldpop_files = sorted(worldpop_dir.glob('ind_ppp_*.tif'))

tel_interp = {}

for f in worldpop_files:
    match = re.search(r'(20\d{2})', f.stem)
    if not match:
        continue

    year = int(match.group(1))
    print(f"Processing {year}...")

    with rasterio.open(f) as src:
        data = src.read(1)

        # Replace nodata with 0
        if src.nodata is not None:
            data = np.where(data == src.nodata, 0, data)

        # For now, we directly store full raster
        # (Later you can crop to Telangana boundary)
        tel_interp[year] = data.astype(np.float32)

print("tel_interp years:", list(tel_interp.keys()))

Processing 2000...
Processing 2005...
Processing 2010...
Processing 2015...
Processing 2020...
tel_interp years: [2000, 2005, 2010, 2015, 2020]


In [16]:
print("\n" + "="*70)
print("CREATING TEMPORAL SEQUENCE")
print("="*70)

available_years = sorted(tel_interp.keys())

if len(available_years) == 0:
    raise ValueError("‚ùå No valid years available for stacking.")

sequence = np.stack(
    [tel_interp[year] for year in available_years],
    axis=0
)

print(f"\nSequence shape: {sequence.shape}")
print(f"  Time steps: {sequence.shape[0]}")
print(f"  Height: {sequence.shape[1]} pixels")
print(f"  Width: {sequence.shape[2]} pixels")

# Temporal consistency
print("\nTemporal Consistency (correlation between consecutive years):")

for t in range(sequence.shape[0] - 1):
    corr = np.corrcoef(
        sequence[t].flatten(),
        sequence[t+1].flatten()
    )[0, 1]
    
    print(f"  {available_years[t]} ‚Üí {available_years[t+1]}: {corr:.3f}")

# Save
seq_path = Path('data/processed') / 'telangana_population_sequence.npy'
seq_path.parent.mkdir(parents=True, exist_ok=True)

np.save(seq_path, sequence.astype(np.float32))

print(f"\n‚úÖ Saved to {seq_path}")


CREATING TEMPORAL SEQUENCE

Sequence shape: (5, 517, 469)
  Time steps: 5
  Height: 517 pixels
  Width: 469 pixels

Temporal Consistency (correlation between consecutive years):
  ind_ppp_2000_1km_Aggregated ‚Üí ind_ppp_2005_1km_Aggregated: 0.992
  ind_ppp_2005_1km_Aggregated ‚Üí ind_ppp_2010_1km_Aggregated: 0.993
  ind_ppp_2010_1km_Aggregated ‚Üí ind_ppp_2015_1km_Aggregated: 0.987
  ind_ppp_2015_1km_Aggregated ‚Üí ind_ppp_2020_1km_Aggregated: 0.992

‚úÖ Saved to data\processed\telangana_population_sequence.npy


In [17]:
print("\n" + "="*70)
print("REPEATING FOR MAHARASHTRA")
print("="*70)

import re
import numpy as np
from pathlib import Path
import rasterio
import rasterio.mask

# Create year ‚Üí filepath mapping
year_file_map = {}

for f in worldpop_files:
    match = re.search(r'(20\d{2})', f.stem)
    if match:
        year = int(match.group(1))
        year_file_map[year] = f

maha_interp = {}

for year in sorted(year_file_map.keys()):

    file_path = year_file_map[year]

    with rasterio.open(file_path) as src:

        clipped, transform = rasterio.mask.mask(
            src,
            [maharashtra.geometry],
            crop=True
        )

        clipped_array = clipped[0].astype(np.float32)

        # Replace nodata with 0
        if src.nodata is not None:
            clipped_array = np.where(
                clipped_array == src.nodata,
                0,
                clipped_array
            )

        clipped_array = np.nan_to_num(clipped_array, nan=0)

        maha_interp[year] = clipped_array

        print(f"‚úÖ {year}: shape {clipped_array.shape}, pop {clipped_array.sum():,.0f}")

# Stack safely
maha_sequence = np.stack(
    [maha_interp[year] for year in sorted(maha_interp.keys())],
    axis=0
)

maha_path = Path('data/processed') / 'maharashtra_population_sequence.npy'
maha_path.parent.mkdir(parents=True, exist_ok=True)

np.save(maha_path, maha_sequence.astype(np.float32))

print(f"\n‚úÖ Maharashtra saved to {maha_path}")


REPEATING FOR MAHARASHTRA
‚úÖ 2000: shape (817, 997), pop 183,937,248
‚úÖ 2005: shape (817, 997), pop 198,845,504
‚úÖ 2010: shape (817, 997), pop 215,371,328
‚úÖ 2015: shape (817, 997), pop 233,695,280
‚úÖ 2020: shape (817, 997), pop 254,069,344

‚úÖ Maharashtra saved to data\processed\maharashtra_population_sequence.npy


In [18]:
print("\n" + "="*70)
print("PREPROCESSING COMPLETE ‚úÖ")
print("="*70)

files = list(Path('data/processed').glob('*.npy'))
print(f"\nFiles created: {len(files)}")
for f in sorted(files):
    size_mb = f.stat().st_size / 1e6
    print(f"  - {f.name}: {size_mb:.1f} MB")

print("\nüìä Ready for: Notebook 02 - Create HDF5 Dataset")


PREPROCESSING COMPLETE ‚úÖ

Files created: 12
  - maharashtra_population_sequence.npy: 16.3 MB
  - telangana_ind_ppp_2000_1km_Aggregated.npy: 1.0 MB
  - telangana_ind_ppp_2005_1km_Aggregated.npy: 1.0 MB
  - telangana_ind_ppp_2010_1km_Aggregated.npy: 1.0 MB
  - telangana_ind_ppp_2015_1km_Aggregated.npy: 1.0 MB
  - telangana_ind_ppp_2020_1km_Aggregated.npy: 1.0 MB
  - telangana_interp_ind_ppp_2000_1km_Aggregated.npy: 1.0 MB
  - telangana_interp_ind_ppp_2005_1km_Aggregated.npy: 1.0 MB
  - telangana_interp_ind_ppp_2010_1km_Aggregated.npy: 1.0 MB
  - telangana_interp_ind_ppp_2015_1km_Aggregated.npy: 1.0 MB
  - telangana_interp_ind_ppp_2020_1km_Aggregated.npy: 1.0 MB
  - telangana_population_sequence.npy: 4.8 MB

üìä Ready for: Notebook 02 - Create HDF5 Dataset
