In [6]:
import sys
import os
from pathlib import Path
from datetime import datetime
import numpy as np
import rasterio
import rasterio.mask
import geopandas as gpd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from src.region_manager import ConfigurableBoundaryManager
from src.preprocessing import RegionAwarePreprocessor

print("‚úÖ All imports successful")

‚úÖ All imports successful


In [9]:
mgr = ConfigurableBoundaryManager()

telangana = mgr.get_region('Telangana')
maharashtra = mgr.get_region('Maharashtra')

print(f"üìç Telangana: {telangana.area_km2:,.0f} km¬≤ ({telangana.grid_cell_count():,} cells)")
print(f"üìç Maharashtra: {maharashtra.area_km2:,.0f} km¬≤ ({maharashtra.grid_cell_count():,} cells)")

# Check for WorldPop files
worldpop_dir = Path('data/raw/worldpop')
worldpop_files = sorted(worldpop_dir.glob('ind_ppp_*.tif'))

print(f"\nüì¶ WorldPop files found: {len(worldpop_files)}")

valid_years = []

for f in worldpop_files:
    size_mb = f.stat().st_size / 1e6
    last_part = f.stem.split('_')[-1]
    
    if last_part.isdigit():   # check if it's a number
        year = int(last_part)
        valid_years.append(year)
        print(f"  {year}: {size_mb:.1f} MB")
    else:
        print(f"  Skipping file (not a year): {f.name}")

available_years = sorted(valid_years)
print(f"\nAvailable years: {available_years}")

üìç Telangana: 207,816 km¬≤ (207,816 cells)
üìç Maharashtra: 699,413 km¬≤ (699,412 cells)

üì¶ WorldPop files found: 5
  Skipping file (not a year): ind_ppp_2000_1km_Aggregated.tif
  Skipping file (not a year): ind_ppp_2005_1km_Aggregated.tif
  Skipping file (not a year): ind_ppp_2010_1km_Aggregated.tif
  Skipping file (not a year): ind_ppp_2015_1km_Aggregated.tif
  Skipping file (not a year): ind_ppp_2020_1km_Aggregated.tif

Available years: []


In [10]:
print("\n" + "="*70)
print("QUALITY ASSESSMENT - TELANGANA")
print("="*70)

for year in available_years:
    if year not in tel_data:
        continue
    
    data = tel_data[year]
    quality = preprocessor.calculate_quality_score(data, region_type='mixed')
    low_quality = preprocessor.identify_low_quality_regions(data)
    
    low_pct = (low_quality.sum() / low_quality.size) * 100
    
    print(f"\n{year}:")
    print(f"  Quality (mean): {quality.mean():.3f}")
    print(f"  Low-quality cells: {low_quality.sum():,} ({low_pct:.1f}%)")



QUALITY ASSESSMENT - TELANGANA


In [None]:
print("\n" + "="*70)
print("INTERPOLATING MISSING DATA")
print("="*70)

tel_interp = {}
for year in available_years:
    if year not in tel_data:
        continue
    
    print(f"\nInterpolating {year}...")
    
    data = tel_data[year]
    low_quality = preprocessor.identify_low_quality_regions(data)
    data_filled = preprocessor.adaptive_interpolation(data, low_quality)
    
    tel_interp[year] = data_filled
    
    # Save
    output_npy = Path('data/processed') / f'telangana_pop_interp_{year}.npy'
    np.save(output_npy, data_filled)
    
    print(f"  ‚úÖ Saved interpolated data")

In [14]:
import rasterio
import numpy as np
from pathlib import Path
import re

worldpop_dir = Path('data/raw/worldpop')
worldpop_files = sorted(worldpop_dir.glob('ind_ppp_*.tif'))

tel_interp = {}

for f in worldpop_files:
    match = re.search(r'(20\d{2})', f.stem)
    if not match:
        continue

    year = int(match.group(1))
    print(f"Processing {year}...")

    with rasterio.open(f) as src:
        data = src.read(1)

        # Replace nodata with 0
        if src.nodata is not None:
            data = np.where(data == src.nodata, 0, data)

        # For now, we directly store full raster
        # (Later you can crop to Telangana boundary)
        tel_interp[year] = data.astype(np.float32)

print("tel_interp years:", list(tel_interp.keys()))

Processing 2000...
Processing 2005...
Processing 2010...
Processing 2015...
Processing 2020...
tel_interp years: [2000, 2005, 2010, 2015, 2020]


In [15]:
print("\n" + "="*70)
print("CREATING TEMPORAL SEQUENCE")
print("="*70)

available_years = sorted(tel_interp.keys())

if len(available_years) == 0:
    raise ValueError("‚ùå No valid years available for stacking.")

sequence = np.stack(
    [tel_interp[year] for year in available_years],
    axis=0
)

print(f"\nSequence shape: {sequence.shape}")
print(f"  Time steps: {sequence.shape[0]}")
print(f"  Height: {sequence.shape[1]} pixels")
print(f"  Width: {sequence.shape[2]} pixels")

# Temporal consistency
print("\nTemporal Consistency (correlation between consecutive years):")

for t in range(sequence.shape[0] - 1):
    corr = np.corrcoef(
        sequence[t].flatten(),
        sequence[t+1].flatten()
    )[0, 1]
    
    print(f"  {available_years[t]} ‚Üí {available_years[t+1]}: {corr:.3f}")

# Save
seq_path = Path('data/processed') / 'telangana_population_sequence.npy'
seq_path.parent.mkdir(parents=True, exist_ok=True)

np.save(seq_path, sequence.astype(np.float32))

print(f"\n‚úÖ Saved to {seq_path}")


CREATING TEMPORAL SEQUENCE

Sequence shape: (5, 3451, 3508)
  Time steps: 5
  Height: 3451 pixels
  Width: 3508 pixels

Temporal Consistency (correlation between consecutive years):
  2000 ‚Üí 2005: 0.988
  2005 ‚Üí 2010: 0.991
  2010 ‚Üí 2015: 0.986
  2015 ‚Üí 2020: 0.987

‚úÖ Saved to data\processed\telangana_population_sequence.npy


In [19]:
print("\n" + "="*70)
print("REPEATING FOR MAHARASHTRA")
print("="*70)

import re
import numpy as np
from pathlib import Path
import rasterio
import rasterio.mask

# Create year ‚Üí filepath mapping
year_file_map = {}

for f in worldpop_files:
    match = re.search(r'(20\d{2})', f.stem)
    if match:
        year = int(match.group(1))
        year_file_map[year] = f

maha_interp = {}

for year in sorted(year_file_map.keys()):

    file_path = year_file_map[year]

    with rasterio.open(file_path) as src:

        clipped, transform = rasterio.mask.mask(
            src,
            [maharashtra.geometry],
            crop=True
        )

        clipped_array = clipped[0].astype(np.float32)

        # Replace nodata with 0
        if src.nodata is not None:
            clipped_array = np.where(
                clipped_array == src.nodata,
                0,
                clipped_array
            )

        clipped_array = np.nan_to_num(clipped_array, nan=0)

        maha_interp[year] = clipped_array

        print(f"‚úÖ {year}: shape {clipped_array.shape}, pop {clipped_array.sum():,.0f}")

# Stack safely
maha_sequence = np.stack(
    [maha_interp[year] for year in sorted(maha_interp.keys())],
    axis=0
)

maha_path = Path('data/processed') / 'maharashtra_population_sequence.npy'
maha_path.parent.mkdir(parents=True, exist_ok=True)

np.save(maha_path, maha_sequence.astype(np.float32))

print(f"\n‚úÖ Maharashtra saved to {maha_path}")


REPEATING FOR MAHARASHTRA
‚úÖ 2000: shape (817, 997), pop 183,937,248
‚úÖ 2005: shape (817, 997), pop 198,845,504
‚úÖ 2010: shape (817, 997), pop 215,371,328
‚úÖ 2015: shape (817, 997), pop 233,695,280
‚úÖ 2020: shape (817, 997), pop 254,069,344

‚úÖ Maharashtra saved to data\processed\maharashtra_population_sequence.npy


In [20]:
print("\n" + "="*70)
print("PREPROCESSING COMPLETE ‚úÖ")
print("="*70)

files = list(Path('data/processed').glob('*.npy'))
print(f"\nFiles created: {len(files)}")
for f in sorted(files):
    size_mb = f.stat().st_size / 1e6
    print(f"  - {f.name}: {size_mb:.1f} MB")

print("\nüìä Ready for: Notebook 02 - Create HDF5 Dataset")


PREPROCESSING COMPLETE ‚úÖ

Files created: 2
  - maharashtra_population_sequence.npy: 16.3 MB
  - telangana_population_sequence.npy: 242.1 MB

üìä Ready for: Notebook 02 - Create HDF5 Dataset
