In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the kernel that runs this notebook.
%pip install rioxarray rasterio pystac_client planetary_computer odc.stac sentinelsat hvplot zarr

Collecting rioxarray
  Downloading rioxarray-0.19.0-py3-none-any.whl.metadata (5.5 kB)
Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting pystac_client
  Downloading pystac_client-0.9.0-py3-none-any.whl.metadata (3.1 kB)
Collecting planetary_computer
  Downloading planetary_computer-1.0.0-py3-none-any.whl.metadata (7.4 kB)
Collecting odc.stac
  Downloading odc_stac-0.4.0-py3-none-any.whl.metadata (5.9 kB)
Collecting sentinelsat
  Downloading sentinelsat-1.2.1-py3-none-any.whl.metadata (10 kB)
Collecting hvplot
  Downloading hvplot-0.12.0-py3-none-any.whl.metadata (19 kB)
Collecting zarr
  Downloading zarr-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting pystac>=1.10.0 (from pystac[validation]>=1.10.0->pystac_client)
  Downloading pystac-1.13.0-py3-none-any.whl.metadata (4.7 kB)
Collect

In [2]:
import os
from contextlib import ExitStack

import numpy as np
import rasterio
from rasterio.merge import merge

def merge_tif_files(tif_paths, output_path):
    """
    Stack the bands of several GeoTIFF files into a single multi-band GeoTIFF.

    Bands are concatenated in the order the files are given (all bands of the
    first file, then all bands of the second, and so on). The output reuses
    the rasterio profile of the first input, with ``count`` and ``dtype``
    updated. Band descriptions present in the inputs are copied to the
    corresponding output bands.

    Note: all input rasters are read fully into memory before writing.

    Parameters:
    - tif_paths (list): List of file paths to the input GeoTIFF files.
    - output_path (str): Path to the output merged GeoTIFF file.

    Raises:
    - ValueError: If ``tif_paths`` is empty, or if the inputs do not all
      share the same width, height and CRS.
    """
    # Materialise once so generators work and emptiness can be checked up front
    # (otherwise the failure would be an opaque IndexError further down).
    tif_paths = list(tif_paths)
    if not tif_paths:
        raise ValueError("At least one GeoTIFF path is required.")

    # ExitStack guarantees every opened dataset is closed, including when
    # validation raises or a read/write fails part-way through.
    with ExitStack() as stack:
        src_files = []
        for path in tif_paths:
            src = stack.enter_context(rasterio.open(path))
            src_files.append(src)
            print(f"Opened {path} with {src.count} bands.")

        # Check if all files have the same dimensions and CRS.
        # Only width/height/CRS are compared; the geotransform is not checked.
        reference_profile = src_files[0].profile
        for src in src_files[1:]:
            profile = src.profile
            if (profile['width'] != reference_profile['width']
                    or profile['height'] != reference_profile['height']
                    or profile['crs'] != reference_profile['crs']):
                raise ValueError("All GeoTIFF files must have the same dimensions and CRS.")

        # Read all bands from each file, remembering their descriptions so
        # they can be carried over to the merged output.
        band_data = []
        band_descriptions = []
        for src in src_files:
            bands = src.read()  # Shape: (num_bands, height, width)
            band_data.append(bands)
            band_descriptions.extend(src.descriptions)
            print(f"Read {bands.shape[0]} bands from {src.name}.")

        # Concatenate bands along the first axis (band dimension)
        merged_bands = np.concatenate(band_data, axis=0)
        del band_data  # drop the per-file references; the arrays were copied above
        print(f"Total bands after merging: {merged_bands.shape[0]}")

        # Output is uint16 only when the stacked data is uint16; anything else
        # is stored as float32. Cast explicitly so the array always matches
        # the dtype declared in the profile.
        out_dtype = rasterio.uint16 if merged_bands.dtype == np.uint16 else rasterio.float32
        merged_bands = merged_bands.astype(out_dtype, copy=False)

        # Update the profile to reflect the new number of bands
        merged_profile = reference_profile.copy()
        merged_profile.update({
            'count': merged_bands.shape[0],
            'dtype': out_dtype
        })

        # Write the merged bands to the output file
        with rasterio.open(output_path, 'w', **merged_profile) as dst:
            dst.write(merged_bands)
            for band_index, description in enumerate(band_descriptions, start=1):
                if description:
                    dst.set_band_description(band_index, description)
            print(f"Merged GeoTIFF written to {output_path} with {merged_bands.shape[0]} bands.")

# Example usage: build one merged multi-band GeoTIFF per region.
if __name__ == "__main__":
    # Destination paths of the merged GeoTIFFs
    output_merged_tif_fergana = '/kaggle/working/weighted_mosaic_fergana_mean.tif'
    output_merged_tif_orenburg = '/kaggle/working/weighted_mosaic_orenburg_mean.tif'

    # Per-phase input GeoTIFFs, listed in phase order; the merged file keeps
    # the bands in exactly this order.
    tif_file_list_fergana = [
        '/kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase1_B01_B02_B03.tif',
        '/kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase2_B04_B05_B06.tif',
        '/kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase3_B07_B08_B8A.tif',
        '/kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase4_B11_B12_WVP.tif',
        '/kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase5_AOT.tif'
    ]
    tif_file_list_orenburg = [
        '/kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase1_B01_B02_B03.tif',
        '/kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase2_B04_B05_B06.tif',
        '/kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase3_B07_B08_B8A.tif',
        '/kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase4_B11_B12_WVP.tif',
        '/kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase5_AOT.tif'
    ]

    # Merge each region in turn (Fergana first, then Orenburg)
    merge_jobs = (
        (tif_file_list_fergana, output_merged_tif_fergana),
        (tif_file_list_orenburg, output_merged_tif_orenburg),
    )
    for region_inputs, region_output in merge_jobs:
        merge_tif_files(region_inputs, region_output)

Opened /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase1_B01_B02_B03.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase2_B04_B05_B06.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase3_B07_B08_B8A.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase4_B11_B12_WVP.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase5_AOT.tif with 1 bands.


  bands = src.read()  # Shape: (num_bands, height, width)
  bands = src.read()  # Shape: (num_bands, height, width)


Read 3 bands from /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase1_B01_B02_B03.tif.
Read 3 bands from /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase2_B04_B05_B06.tif.
Read 3 bands from /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase3_B07_B08_B8A.tif.
Read 3 bands from /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase4_B11_B12_WVP.tif.
Read 1 bands from /kaggle/input/sentinel-image-parts/fergana_weighted_mean_30m_phase5_AOT.tif.
Total bands after merging: 13
Merged GeoTIFF written to /kaggle/working/weighted_mosaic_fergana_mean.tif with 13 bands.
Opened /kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase1_B01_B02_B03.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase2_B04_B05_B06.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/orenburg_weighted_mean_30m_phase3_B07_B08_B8A.tif with 3 bands.
Opened /kaggle/input/sentinel-image-parts/orenburg_weig

In [3]:
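# # ----------------------------------------------------------
# # Split-band weighted mosaic processing for large areas
# # Handles HTTP timeouts by splitting into FIVE processing phases
# # Each phase processes up to 3 bands + SCL to stay within connection limits
# # NOTE: this whole cell is commented out (disabled). Its output naming
# # ({region}_weighted_mean_30m_phase{N}_{bands}.tif) matches the per-phase
# # GeoTIFFs read by the merge cell above, so it appears to be the step
# # that generated those inputs; re-enable it only to regenerate them.
# # ----------------------------------------------------------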
# import warnings 
# warnings.filterwarnings('ignore')

# import numpy as np
# import pandas as pd
# import xarray as xr
# import matplotlib.pyplot as plt
# import rioxarray as rio
# import rasterio
# from rasterio.windows import Window
# from rasterio.enums import Resampling
# from tqdm import tqdm
# import pystac_client 
# import planetary_computer
# from odc.stac import stac_load
# import gc
# import psutil
# import os

# plt.rcParams['figure.figsize'] = (10,8)

# def mem():
#     """RAM in GB"""
#     return psutil.Process().memory_info().rss / 1024**3

# def _valid_mask(band, scl=None):
#     """True = usable pixel (not NaN, not black, not cloud)"""
#     # Handle both 0 (black/nodata) and NaN values
#     ok = (~np.isnan(band)) & (band != 0)
#     if scl is not None:
#         # Cloud mask: exclude clouds (8,9), cirrus (10), and snow/ice (11)
#         ok &= ~((scl == 8) | (scl == 9) | (scl == 10) | (scl == 11))
#     return ok.astype("float32")

# def build_chunked_weighted_tif(
#     data: xr.Dataset,
#     out_tif: str,
#     bands_to_process: list,
#     chunk_size=512,
#     target_res=30
# ):
#     """
#     Process specific bands from dataset into weighted mosaic
#     """
#     print("=" * 60)
#     print(f"[START] {out_tif}  RAM={mem():.1f} GB")
#     print(f"Processing bands: {bands_to_process}")

#     # Filter bands (exclude SCL from output bands)
#     output_bands = [b for b in bands_to_process if b != "SCL"]
#     if not output_bands:
#         raise ValueError("No output bands specified")

#     # Create 30m grid
#     minx, miny, maxx, maxy = data.rio.bounds()
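#     res = 0.000269494585  # ~30 m expressed in degrees of latitude (about 30 / 111320); the same step in longitude spans fewer metres away from the equator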
#     width  = int(np.ceil((maxx - minx) / res))
#     height = int(np.ceil((maxy - miny) / res))
    
#     transform = rasterio.transform.from_bounds(minx, miny, maxx, maxy, width, height)

#     # GeoTIFF profile
#     profile = dict(
#         driver="GTiff",
#         height=height,
#         width=width,
#         count=len(output_bands),
#         dtype="float32",
#         crs=data.rio.crs,
#         transform=transform,
#         nodata=np.nan,
#         tiled=True,
#         blockxsize=min(chunk_size, width),
#         blockysize=min(chunk_size, height),
#         compress="lzw",
#     )

#     with rasterio.open(out_tif, "w", **profile) as dst:
#         # Process each band
#         for idx, band in enumerate(output_bands, 1):
#             print(f"[{band}] processing band {idx}/{len(output_bands)}...")
            
#             # Initialize accumulation arrays
#             w_sum = np.zeros((height, width), dtype="float32")
#             w_cnt = np.zeros((height, width), dtype="float32")

#             # Process each time slice
#             for t_idx, t in enumerate(tqdm(data.time.values, desc=f"[{band}] dates", leave=False)):
#                 try:
#                     # Extract time slice
#                     da_t = data[band].sel(time=t)
#                     scl_t = data.get("SCL")
#                     if scl_t is not None:
#                         scl_t = scl_t.sel(time=t)

#                     # Reproject to target grid
#                     da_30 = da_t.rio.reproject(
#                         dst.crs, 
#                         shape=(height, width), 
#                         resampling=Resampling.bilinear
#                     )
                    
#                     if scl_t is not None:
#                         scl_30 = scl_t.rio.reproject(
#                             dst.crs, 
#                             shape=(height, width), 
#                             resampling=Resampling.nearest
#                         )
#                     else:
#                         scl_30 = None

#                     # Compute valid pixel mask
#                     da_values = da_30.values
#                     scl_values = scl_30.values if scl_30 is not None else None
#                     valid = _valid_mask(da_values, scl_values)
                    
#                     # Accumulate weighted sum
#                     w_sum += da_values * valid
#                     w_cnt += valid

#                     # Clean up memory
#                     del da_t, da_30, valid
#                     if scl_t is not None:
#                         del scl_t, scl_30
#                     gc.collect()

#                     # Memory check every 3 iterations (more frequent for smaller phases)
#                     if (t_idx + 1) % 3 == 0:
#                         current_mem = mem()
#                         if current_mem > 10:  # Lower threshold for smaller phases
#                             gc.collect()
#                             print(f"  Memory cleanup at date {t_idx+1}, RAM: {mem():.1f} GB")

#                 except Exception as e:
#                     print(f"  Warning: Skipping date {t} due to error: {str(e)[:100]}...")
#                     continue

#             # Compute weighted mean (avoid division by zero)
#             with np.errstate(divide='ignore', invalid='ignore'):
#                 mean_values = np.divide(w_sum, w_cnt, out=np.full_like(w_sum, np.nan), where=w_cnt>0)
            
#             # Write to file
#             dst.write(mean_values, idx)
            
#             # Set band description
#             dst.set_band_description(idx, f'{band}_weighted_mean')
            
#             print(f"[{band}] completed. Valid pixels: {np.sum(~np.isnan(mean_values)):,}")

#             # Clean up large arrays immediately
#             del w_sum, w_cnt, mean_values
#             gc.collect()

#     print(f"[DONE] {out_tif}  RAM={mem():.1f} GB")
#     print("=" * 60)

# def process_region_split_bands_5phases(bounds, time_window, region_name):
#     """Process a region with bands split into FIVE phases to avoid connection timeouts"""
    
#     # Define 5 band phases (up to 3 bands + SCL each; the final phase holds only AOT + SCL)
#     phase_bands = {
#         1: ['B01', 'B02', 'B03', 'SCL'],           # First 3 bands + SCL
#         2: ['B04', 'B05', 'B06', 'SCL'],           # Next 3 bands + SCL  
#         3: ['B07', 'B08', 'B8A', 'SCL'],           # Next 3 bands + SCL
#         4: ['B11', 'B12', 'WVP', 'SCL'],           # Next 3 bands + SCL
#         5: ['AOT', 'SCL']                          # Remaining band + SCL
#     }
    
#     resolution = 30/111320.0
    
#     print(f"\n{'='*80}")
#     print(f"PROCESSING REGION: {region_name} (5-Phase Processing)")
#     print(f"{'='*80}")
    
#     successful_phases = []
    
#     # Process each phase
#     for phase_num, bands in phase_bands.items():
#         print(f"\n--- PHASE {phase_num}/5: Processing bands {[b for b in bands if b != 'SCL']} ---")
        
#         try:
#             # Create fresh STAC client for each phase to avoid connection issues
#             stac = pystac_client.Client.open('https://planetarycomputer.microsoft.com/api/stac/v1')
#             search = stac.search(
#                 bbox=bounds,
#                 datetime=time_window,
#                 collections=['sentinel-2-l2a'],
#                 query={'eo:cloud_cover': {'lt': 5}},
#             )
            
#             items = list(search.get_items())
#             print(f'Found {len(items)} Sentinel-2 scenes')
#             signed_items = [planetary_computer.sign(item) for item in items]
            
#             # Load data for current phase
#             data_phase = stac_load(
#                 signed_items,
#                 bands=bands,
#                 crs='EPSG:4326',
#                 resolution=resolution,
#                 chunks={'x': 1024, 'y': 1024},  # Smaller chunks for memory efficiency
#                 dtype='uint16',
#                 patch_url=planetary_computer.sign,
#                 bbox=bounds,
#             )
            
#             # Mosaic by date
#             print(f"Creating daily mosaics (Phase {phase_num})...")
#             unique_dates = data_phase.time.dt.date.drop_duplicates("time")
#             print(f"Unique dates: {unique_dates.size}")
            
#             mosaics = []
#             for date, ds in data_phase.groupby("time.date"):
#                 # Use max to handle overlapping tiles (valid pixels override nodata)
#                 mosaic = ds.max("time", skipna=True)
#                 mosaic = mosaic.assign_coords(time=np.datetime64(date))
#                 mosaics.append(mosaic)
            
#             data_phase = xr.concat(mosaics, dim="time")
            
#             # Process phase bands
#             output_bands_str = "_".join([b for b in bands if b != 'SCL'])
#             output_file = f"{region_name}_weighted_mean_30m_phase{phase_num}_{output_bands_str}.tif"
            
#             build_chunked_weighted_tif(
#                 data_phase, 
#                 output_file, 
#                 bands,
#                 chunk_size=512
#             )
            
#             successful_phases.append(phase_num)
            
#             # Clean up
#             del data_phase, mosaics
#             gc.collect()
            
#             print(f"✅ Phase {phase_num} completed successfully!")
#             print(f"   Output: {output_file}")
            
#         except Exception as e:
#             print(f"❌ ERROR in Phase {phase_num}: {str(e)}")
#             print(f"   Continuing to next phase...")
#             continue
        
#         # Add delay between phases to let connections reset
#         import time
#         if phase_num < 5:  # Don't delay after last phase
#             print(f"   Waiting 30 seconds before next phase...")
#             time.sleep(30)
    
#     # Summary
#     total_phases = len(phase_bands)
#     successful_count = len(successful_phases)
    
#     print(f"\n{'='*60}")
#     print(f"REGION {region_name.upper()} SUMMARY:")
#     print(f"  Successful phases: {successful_count}/{total_phases}")
#     print(f"  Completed phases: {successful_phases}")
    
#     if successful_count == total_phases:
#         print(f"  ✅ ALL PHASES COMPLETED SUCCESSFULLY!")
#         return True
#     elif successful_count > 0:
#         print(f"  ⚠️  PARTIAL SUCCESS ({successful_count}/{total_phases} phases)")
#         return True
#     else:
#         print(f"  ❌ ALL PHASES FAILED")
#         return False

# # ============================================================================
# # MAIN EXECUTION
# # ============================================================================

# # Define regions and time windows
# regions = {
#     'fergana': {
#         'bounds': (70.98653092, 39.99127677, 73.00948379, 42.01743359),
#         'time_window': "2020-06-01/2020-08-31"
#     },
#     'orenburg': {
#         'bounds': (53.97714297, 50.9672661, 56.01383563, 53.0001858),
#         'time_window': "2021-07-01/2021-07-31"
#     }
# }

# print("Starting 5-phase split-band processing...")
# print(f"Initial RAM usage: {mem():.1f} GB")
# print("\nProcessing strategy:")
# print("  Phase 1: B01, B02, B03 + SCL")
# print("  Phase 2: B04, B05, B06 + SCL") 
# print("  Phase 3: B07, B08, B8A + SCL")
# print("  Phase 4: B11, B12, WVP + SCL")
# print("  Phase 5: AOT + SCL")

# # Process each region
# overall_results = {}
# for region_name, config in regions.items():
#     print(f"\n🚀 Starting processing for {region_name.upper()}...")
    
#     success = process_region_split_bands_5phases(
#         config['bounds'], 
#         config['time_window'], 
#         region_name
#     )
    
#     overall_results[region_name] = success
    
#     # Force garbage collection between regions
#     gc.collect()
#     print(f"Memory after {region_name}: {mem():.1f} GB\n")

# # Final summary
# print("\n" + "="*80)
# print("🎉 ALL REGIONS PROCESSING COMPLETED!")
# print("="*80)

# for region_name, success in overall_results.items():
#     status = "✅ SUCCESS" if success else "❌ FAILED"
#     print(f"  {region_name.upper()}: {status}")

# # List all output files
# print("\n📁 Output files created:")
# tif_files = [f for f in os.listdir('.') if f.endswith('.tif')]
# if tif_files:
#     for f in sorted(tif_files):
#         try:
#             size_mb = os.path.getsize(f) / 1024**2
#             print(f"  📄 {f} ({size_mb:.1f} MB)")
#         except:
#             print(f"  📄 {f}")
# else:
#     print("  No .tif files found in current directory")

# print(f"\nFinal RAM usage: {mem():.1f} GB")
# print("🏁 Processing complete!")