In [1]:
import h5py
import numpy as np
from pathlib import Path

print("Creating HDF5 dataset for efficient memory usage...")
print("Benefit: Load 40GB dataset with only 2GB active memory\n")


Creating HDF5 dataset for efficient memory usage...
Benefit: Load 40GB dataset with only 2GB active memory



In [2]:
print("Loading interpolated state data...")

tel_seq = np.load('data/processed/telangana_population_sequence.npy')
maha_seq = np.load('data/processed/maharashtra_population_sequence.npy')

print(f"Telangana shape: {tel_seq.shape}")
print(f"Maharashtra shape: {maha_seq.shape}")

Loading interpolated state data...
Telangana shape: (5, 517, 469)
Maharashtra shape: (5, 817, 997)


In [3]:
print("\nAligning dimensions...")

tel_h, tel_w = tel_seq.shape[1:]
maha_h, maha_w = maha_seq.shape[1:]

max_h = max(tel_h, maha_h)
max_w = max(tel_w, maha_w)

# Pad if needed
tel_padded = np.pad(tel_seq, 
                   ((0, 0), (0, max_h - tel_h), (0, max_w - tel_w)),
                   mode='constant', constant_values=0)

maha_padded = np.pad(maha_seq,
                    ((0, 0), (0, max_h - maha_h), (0, max_w - maha_w)),
                    mode='constant', constant_values=0)

print(f"After padding:")
print(f"  Telangana: {tel_padded.shape}")
print(f"  Maharashtra: {maha_padded.shape}")


Aligning dimensions...
After padding:
  Telangana: (5, 817, 997)
  Maharashtra: (5, 817, 997)


In [4]:
print("\nCreating HDF5 file...")

h5_path = 'data/processed/india_sample.h5'

with h5py.File(h5_path, 'w') as h5:
    # Create dataset with chunking (1 timestep, 256x256 spatial)
    dataset = h5.create_dataset(
        'population_data',
        shape=(
            tel_padded.shape[0],                           # 5 years
            tel_padded.shape[1] + maha_padded.shape[1],    # Stacked states
            tel_padded.shape[2]                             # Width
        ),
        dtype=np.float32,
        chunks=(1, 256, 256),           # Chunk for lazy loading
        compression='gzip',              # Compression
        compression_opts=4               # Balance speed vs ratio
    )
    
    # Write data
    print("Writing Telangana...")
    h5['population_data'][:, :tel_padded.shape[1], :] = tel_padded
    
    print("Writing Maharashtra...")
    h5['population_data'][:, tel_padded.shape[1]:, :] = maha_padded
    
    # Add metadata
    h5.attrs['description'] = 'India sample state population data'
    h5.attrs['years'] = '2000, 2005, 2010, 2015, 2020'
    h5.attrs['states'] = 'Telangana (top), Maharashtra (bottom)'
    h5.attrs['resolution_km'] = 1.0

file_size_mb = Path(h5_path).stat().st_size / 1e6
orig_size_mb = (tel_padded.nbytes + maha_padded.nbytes) / 1e6

print(f"\n✅ HDF5 created: {h5_path}")
print(f"✅ File size: {file_size_mb:.1f} MB (compressed from {orig_size_mb:.1f} MB)")
print(f"✅ Compression ratio: {orig_size_mb/file_size_mb:.1f}x")


Creating HDF5 file...
Writing Telangana...
Writing Maharashtra...

✅ HDF5 created: data/processed/india_sample.h5
✅ File size: 18.4 MB (compressed from 32.6 MB)
✅ Compression ratio: 1.8x


In [5]:
print("\nVerifying HDF5...")

with h5py.File(h5_path, 'r') as h5:
    print(f"Dataset shape: {h5['population_data'].shape}")
    print(f"Chunk shape: {h5['population_data'].chunks}")
    
    print(f"\nMetadata:")
    for key in h5.attrs:
        print(f"  {key}: {h5.attrs[key]}")
    
    # Test lazy loading
    print(f"\nTesting lazy loading...")
    import time
    
    start = time.time()
    data_2000 = h5['population_data'][0, :, :]
    elapsed = time.time() - start
    
    print(f"✅ Loaded single year in {elapsed:.3f}s")
    print(f"✅ Data range: {data_2000.min():.0f} - {data_2000.max():.0f}")
    print(f"✅ Memory: only ~{tel_padded.nbytes/1e6:.0f}MB loaded per timestep")



Verifying HDF5...
Dataset shape: (5, 1634, 997)
Chunk shape: (1, 256, 256)

Metadata:
  description: India sample state population data
  resolution_km: 1.0
  states: Telangana (top), Maharashtra (bottom)
  years: 2000, 2005, 2010, 2015, 2020

Testing lazy loading...
✅ Loaded single year in 0.057s
✅ Data range: 0 - 63067
✅ Memory: only ~16MB loaded per timestep


In [10]:
print("\n" + "="*70)
print("HDF5 CREATION COMPLETE ✅")
print("="*70)
print(f"\nDataset ready: data/processed/india_sample.h5")
print(f"Next: Notebook 03 - Clip Full India (8-12 hour operation)")


HDF5 CREATION COMPLETE ✅

Dataset ready: data/processed/india_sample.h5
Next: Notebook 03 - Clip Full India (8-12 hour operation)
