In [1]:
import sys
sys.path.append("../..")

#Defining libraries
import os
import pandas as pd
import numpy as np
from shapely.geometry import box

import rasterio
from rasterio.mask import mask

In [2]:
'''
1-11-> Residential urban areas 
2-121,13->Industrial and abandoned urban areas
3-122,123,124 Transportation infrastructure (streets, highways, airports, and ports)
4-14->Urban green areas
5-2->Agricultural areas
6-3->Forest
7-4/5->Hydro and humid bodies
'''
#Convert from copernicus code 2018 to an internal code
URBAN = 1
INDUSTRIAL = 2
TRANSPORTATION = 3
URBAN_VEGETATION = 4
RURAL = 5
FOREST = 6
WATER = 7
LC_NO_DATA = 9999
NO_DATA = -9999

    
# Function to check if the file is a tiff and must be read.
def check_wrong_files(f):
    if f == 'clip': return True #avoid entering the "clip" folder
    if 'ipynb' in f: return True #avoid entering the "ipynb_checkpoint" file
    if 'tar' in f: return True #avoid entering "tar" files
    if 'aux' in f: return True #avoid entering "aux" files
    return False


In [129]:
# City parameters and global variables
city_info = {
    "resolution": 5,
    "epsg": 32632,
    "capitalized": "Milan"
}

city = 'MILANO'
current_city_info = city_info
city_epsg = current_city_info['epsg']
data_folder = "data"
landcover_base_path = f"{data_folder}/landcover"
total_samples_per_raster = 10000


landsat_raster_folder = "/home/user/ODC_harmonia/Landsat/Milan/data"

landsat_raster_file_list = os.listdir(f"{landsat_raster_folder}")

# create the "clip" if it does not exist
os.makedirs(f"{landsat_raster_folder}/clip", exist_ok=True)

lst_folder = f"{landsat_raster_folder}/clip"
lst_file_list = os.listdir(f"{lst_folder}")

In [130]:
def match_landsat_to_landcover(landsat):
    year = int(landsat[17:21])
    if year in [2015,2016]:
        return str(2015)
    elif year in [2017,2018,2019]:
        return str(2018)
    elif year in [2020,2021,2022]:
        return str(2021)

In [132]:
import numpy as np
import rasterio
import random

def extract_random_patches(raster_paths, patch_size=33, num_samples=1000):
    """
    Extracts random patches from multiple rasters while handling different nodata values.
    
    Args:
        raster_paths (list): List of file paths to raster images.
        patch_size (int): Size of the square patches.
        num_samples (int): Number of patches to extract.

    Returns:
        tuple: (X, y) where:
            - X is an array of shape (num_samples, num_bands-1, patch_size, patch_size)
            - y is an array of shape (num_samples, 1), containing center pixel values
    """
    rasters = []
    nodata_masks = []
    
    # Read all rasters
    for path in raster_paths:
        with rasterio.open(path) as src:
            img = src.read(1).astype(np.float32)  # Convert to float32
            nodata_value = src.nodata if src.nodata is not None else np.nan  # Handle missing nodata
            img[img == nodata_value] = np.nan  # Mask nodata values
            rasters.append(img)
            nodata_masks.append(np.isnan(img))  # Store nodata mask
    
    # Stack rasters into a multi-band array (bands, height, width)
    raster_stack = np.stack(rasters, axis=0)
    
    # UHI is the first band
    uhi_band = raster_stack[0]  
    feature_bands = raster_stack[1:]  # All other bands
    
    height, width = raster_stack.shape[1], raster_stack.shape[2]
    X_patches, y_centers = [], []
    
    for _ in range(num_samples):
        while True:
            # Randomly select top-left corner of patch
            i = random.randint(0, height - patch_size)
            j = random.randint(0, width - patch_size)
            
            # Extract patches
            X_patch = feature_bands[:, i:i+patch_size, j:j+patch_size]
            
            # Extract center pixel value from the UHI band
            center_value = uhi_band[i + patch_size // 2, j + patch_size // 2]
            
            # Ensure patch has valid data (not fully masked)
            if not np.isnan(X_patch).all() and not np.isnan(center_value):
                X_patches.append(np.nan_to_num(X_patch))  # Replace NaNs with 0
                y_centers.append(center_value)  # Center value as the label
                break  # Accept this patch and continue
    
    return np.array(X_patches), np.array(y_centers).reshape(-1, 1)  # y_centers as a single value per patch


In [133]:
bands = [
    '_uhi.tif',
    '_LST.TIF',
    '_NDVI.TIF',
    '_SR_B2.TIF',
    '_SR_B3.TIF',
    '_SR_B4.TIF',
    '_SR_B5.TIF',
]

In [134]:
X, y = [], []
for f in landsat_raster_file_list:
    if check_wrong_files(f): continue 
    raster_batch = []
    for band in bands:
        raster = f"{landsat_raster_folder}/clip/{f}/{f}{band}"
        raster_batch.append(raster)
    year = match_landsat_to_landcover(f)
    landcover_path = f'{landcover_base_path}/DUSAF_MCM_mapped_{year}.tif'
    raster_batch.append(landcover_path)
    
    batch_X, batch_y = extract_random_patches(raster_batch,17)
    X.append(batch_X)
    y.append(batch_y)

In [135]:
X_array = np.concatenate(X, axis=0)
y_array = np.concatenate(y, axis=0)

In [139]:
print(f"Patch shape: {X_array.shape}")  

Patch shape: (17000, 7, 17, 17)


In [140]:
np.save("train_patches_17.npy", X_array)
np.save("target_patches_17.npy", y_array)

In [161]:
def get_prediction_array(years,patch_size):
    raster_batches = []
    for f in landsat_raster_file_list:
        #print(f)
        if check_wrong_files(f): continue 
        raster_batch = []
        year = int(match_landsat_to_landcover(f))
        if year in years:
            #print(year)
            landcover_path = f'{landcover_base_path}/DUSAF_MCM_mapped_{year}.tif'
            for band in bands:
                raster = f"{landsat_raster_folder}/clip/{f}/{f}{band}"
                raster_batch.append(raster)
            raster_batch.append(landcover_path)
            raster_batches.append(raster_batch)
    print(raster_batches)
    return create_prediction_array(raster_batches,patch_size)

In [164]:
patch_size = 17
batch_X, batch_y = get_prediction_array([2021,2022],patch_size)

[['/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_uhi.tif', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_LST.TIF', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_NDVI.TIF', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_SR_B2.TIF', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_SR_B3.TIF', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_SR_B4.TIF', '/home/user/ODC_harmonia/Landsat/Milan/data/clip/LC08_L2SP_194028_20220725_20220802_02_T1/LC08_L2SP_194028_20220725_20220802_02_T1_SR_B5.TIF', 'd

In [166]:
batch_X

array([[       nan, 0.26803535,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.27159673,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.29865497,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.31776816,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.2703447 ,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.31772885,        nan,        nan,        nan,
               nan,        nan],
       [       nan, 0.32968596,        nan,        nan,        nan,
               nan,        nan]], dtype=float32)

In [163]:
def create_prediction_array(raster_batches, patch_size=33):
    """
    Extracts average pixel values across multiple rasters for prediction.

    Args:
        raster_batches (list): List of lists of file paths to raster images.
        patch_size (int): Size of the square patches (not needed now, but could be kept for compatibility).

    Returns:
        tuple: (X, y) where:
            - X is an array of shape (num_samples, num_bands-1) for the averaged raster patches
            - y is an array of shape (num_samples, 1), containing average center pixel values
    """
    # Initialize variables to accumulate the sum and count for each raster batch
    total_X_patches = []
    total_y_centers = []

    for raster_paths in raster_batches:
        rasters = []
        nodata_masks = []

        # Read all rasters
        for path in raster_paths:
            with rasterio.open(path) as src:
                img = src.read(1).astype(np.float32)  # Convert to float32
                nodata_value = src.nodata if src.nodata is not None else np.nan  # Handle missing nodata
                img[img == nodata_value] = np.nan  # Mask nodata values
                rasters.append(img)
                nodata_masks.append(np.isnan(img))  # Store nodata mask

        # Stack rasters into a multi-band array (bands, height, width)
        raster_stack = np.stack(rasters, axis=0)

        # UHI is the first band
        uhi_band = raster_stack[0]  
        feature_bands = raster_stack[1:]  # All other bands

        height, width = raster_stack.shape[1], raster_stack.shape[2]

        # Calculate the average for each raster
        X_patches = np.mean(feature_bands, axis=(1, 2))  # Average over the height and width dimensions
        center_value = np.nanmean(uhi_band)  # Average the center values (handle NaNs)

        # Append the averaged result for this batch
        total_X_patches.append(X_patches)
        total_y_centers.append(center_value)

    # Combine all batches into final arrays
    avg_X_patches = np.array(total_X_patches)
    avg_y_centers = np.array(total_y_centers).reshape(-1, 1)

    return avg_X_patches, avg_y_centers  # Return averaged values across all raster batches
