In [1]:
import sys
sys.path.append("../..")

#Defining libraries
import os
import math
from datetime import date, timedelta
import pandas as pd
import xarray as xr
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from shapely.geometry import box
from scipy.interpolate import griddata, interpn
import datacube
from copy import deepcopy
import statsmodels.api as sm

import rasterio
from rasterio.plot import show
from rasterio.mask import mask
from rasterio.windows import Window
from rasterio.warp import reproject, Resampling
#from rasterio.enums import Resampling
from rasterio.transform import from_origin
from rasterio.windows import Window

import matplotlib.pyplot as plt

from modules import processing_module as processing

2025-02-16 14:37:29.418378: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-16 14:37:29.419786: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-16 14:37:29.442375: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-16 14:37:29.443120: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
'''
1-11-> Residential urban areas 
2-121,13->Industrial and abbandoned urban areas
3-122,123,124 Transportation infrastructure (streets, highways, airports, and ports)
4-14->Urban green areas
5-2->Agricultural areas
6-3->Forest
7-4/5->Hydro and humid bodies
'''
#Convert from copernicus code 2018 to an internal code
URBAN = 1
INDUSTRIAL = 2
TRANSPORTATION = 3
URBAN_VEGETATION = 4
RURAL = 5
FOREST = 6
WATER = 7
LC_NO_DATA = 9999
NO_DATA= -9999
    
# Function to check if the file is a tiff and must be read.
def check_wrong_files(f):
    if f == 'clip': return True #avoid entering the "clip" folder
    if 'csv'in f: return True
    if f in ['LC08_L2SP_194028_20170524_20200903_02_T1']: return True #Not consider the 2017 image as it biases the model
    if 'ipynb' in f: return True #avoid entering the "ipynb_checkpoint" file
    if 'tar' in f: return True #avoid entering "tar" files
    if 'aux' in f: return True #avoid entering "aux" files
    return False

def match_landsat_to_landcover(landsat):
    year = int(landsat[17:21])
    if year in [2015,2016]:
        return str(2015)
    elif year in [2017,2018,2019]:
        return str(2018)
    elif year in [2020,2021,2022]:
        return str(2021)

In [3]:
# City parameters and global variables
city_info = {
    "resolution": 5,
    "epsg": 32632,
    "capitalized": "Milan"
}

city = 'MILANO'
current_city_info = city_info
city_epsg = current_city_info['epsg']
data_folder = "data"
#landcover_path = f'{landcover_base_path}/DUSAF_MCM_mapped_{year}.tif'

landsat_raster_folder = "/home/user/ODC_harmonia/Landsat/Milan/data"
sat_images_path = f"{landsat_raster_folder}/clip"
file_list = os.listdir(f"{sat_images_path}")
landcover_base_path = f'{data_folder}/landcover'
#landsat_raster_file_list = os.listdir(f"{landsat_raster_folder}")

total_samples_per_raster = 50000


In [4]:
predict_lst = None
predict_ndvi = None
predict_ndbi = None
predict_albedo = None

In [5]:
all_samples = []

In [11]:
#Commented out for legacy
'''
samples = pd.DataFrame()
predict_n = 0
predict_lst = None
predict_ndvi = None
predict_ndbi = None
predict_albedo = None

sample_n = [
    int(total_samples_per_raster / 4), # urban, uhi 1
    int(total_samples_per_raster / 4), # urban, uhi 0
    int(total_samples_per_raster / 4), # rural/vegetation/bareland, uhi 1
    int(total_samples_per_raster / 4), # rural/vegetation/bareland, uhi 0
]

for f in file_list:
    if check_wrong_files(f): continue

    print(f'Processing {f}')
    file_date_string = f.split('_')[3] #example: LC08_L2SP_194028_20160825_20200906_02_T1_LST
    year = match_landsat_to_landcover(f)
    landcover_path = f'{landcover_base_path}/DUSAF_{year}_MCM_mapped.tif'
    
    with rasterio.open(landcover_path, driver="GTiff") as landcover_raster:
        landcover_array = landcover_raster.read(1)
        #print(landcover_raster.profile)
        print('Read land cover')
        rows, cols = landcover_array.shape
        x_positions = np.arange(0, cols)
        y_positions = np.arange(0, rows)
        x, y = np.meshgrid(x_positions, y_positions)
        x_flat = x.flatten()
        y_flat = y.flatten()
        values_flat = landcover_array.flatten()

        # Create a DataFrame for the Landcover 
        landcover_df = pd.DataFrame({'x': x_flat, 'y': y_flat, 'landcover': values_flat})
        landcover_df['landcover'] = landcover_df['landcover']
    if not isinstance(predict_lst,np.ndarray) and not isinstance(predict_ndvi,np.ndarray) and not isinstance(predict_ndbi,np.ndarray) and not isinstance(predict_albedo,np.ndarray):
        predict_lst = np.zeros_like(landcover_array)
        predict_ndvi = np.zeros_like(landcover_array)
        predict_ndbi = np.zeros_like(landcover_array)
        predict_albedo = np.zeros_like(landcover_array)

    #columns in the end: x,y,landcover,ndvi,raster
    train_df = landcover_df.copy()

    #add the uhi column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_uhi.tif", driver="GTiff") as uhi_raster:
        print('read UHI')
        uhi_array = uhi_raster.read(1) #UHI band
        uhi_flat = uhi_array.flatten()
        train_df['uhi'] = pd.Series(uhi_flat).astype('int16')

    #add the uhi intensity column
    #Uncomment to switch to UHI Intensity instead of UHI binary
    
    with rasterio.open(f"{sat_images_path}/{f}/{f}_uhi_int.tif", driver="GTiff") as uhii_raster:
        print('read UHII')
        uhii_array = uhii_raster.read(1) #UHI band
        uhii_flat = uhii_array.flatten()
        train_df['uhii'] = pd.Series(uhii_flat).astype('float32')
    
    #add the ndvi column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_NDVI.TIF", driver="GTiff") as ndvi_raster:
        print('read NDVI')
        ndvi_array = ndvi_raster.read(1) #UHI band
        ndvi_flat = ndvi_array.flatten()
        train_df['ndvi'] = pd.Series(ndvi_flat).astype('float32')
    
    #add the ndbi column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_NDBI.TIF", driver="GTiff") as ndbi_raster:
        print('read NDBI')
        ndbi_array = ndbi_raster.read(1) #UHI band
        ndbi_flat = ndbi_array.flatten()
        train_df['ndbi'] = pd.Series(ndbi_flat).astype('float32')
    
    #add the albedo column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_albedo.TIF", driver="GTiff") as albedo_raster:
        print('read albedo')
        albedo_array = albedo_raster.read(1) #UHI band
        albedo_flat = albedo_array.flatten()
        train_df['albedo'] = pd.Series(albedo_flat).astype('float32')

    #add the LST column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_LST.TIF", driver="GTiff") as lst_raster:
        print('read LST')
        lst_array = lst_raster.read(1) #UHI band
        lst_flat = lst_array.flatten()
        train_df['lst'] = pd.Series(lst_flat).astype('float32')

    if int(year) >= 2020:
        predict_n += 1
        predict_lst = np.where(landcover_array != LC_NO_DATA, (predict_lst + lst_array), -9999)
        predict_ndvi = np.where(landcover_array != LC_NO_DATA, (predict_ndvi + ndvi_array), -9999)
        predict_ndbi = np.where(landcover_array != LC_NO_DATA, (predict_ndbi + ndbi_array), -9999)
        predict_albedo = np.where(landcover_array != LC_NO_DATA, (predict_albedo + albedo_array), -9999)


    train_df['raster'] = int(file_date_string)

    #remove nodata (-9999) from the dataframe
    train_df = train_df.loc[
        (train_df['landcover'] != LC_NO_DATA)
    ]
   #urban, uhi = 0
    condition = (
        ((train_df['landcover'] == URBAN) | 
         (train_df['landcover'] == INDUSTRIAL) | 
         (train_df['landcover'] == TRANSPORTATION)) & 
        (train_df['uhi'] == 0)
    )
    sampling = train_df.loc[
        condition
    ].sample(n=sample_n[0])
    samples = pd.concat([samples, sampling])

    #urban, uhi = 1
    condition = (
        ((train_df['landcover'] == URBAN) | 
         (train_df['landcover'] == INDUSTRIAL) | 
         (train_df['landcover'] == TRANSPORTATION)) & 
        (train_df['uhi'] == 1)
    )
    sampling = train_df.loc[
        condition
    ].sample(n=sample_n[1])
    samples = pd.concat([samples, sampling])

    #rural/forest/bareland, uhi = 0
    condition = (
        ((train_df['landcover'] == URBAN_VEGETATION) | 
         (train_df['landcover'] == RURAL) | 
         (train_df['landcover'] == FOREST)) & 
         (train_df['uhi'] == 0)
    ) 
    sampling = train_df.loc[
        condition
    ].sample(n=sample_n[2])
    samples = pd.concat([samples, sampling])

    #rural/forest/bareland, uhi = 1
    condition = (
        ((train_df['landcover'] == URBAN_VEGETATION) | 
         (train_df['landcover'] == RURAL) | 
         (train_df['landcover'] == FOREST))& 
         (train_df['uhi'] == 1)
    ) 
    sampling = train_df.loc[
        condition
    ].sample(n=sample_n[3])
    samples = pd.concat([samples, sampling])
    
    
    all_samples.append(samples)
'''


Processing LC08_L2SP_194028_20180815_20200831_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20190717_20200827_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20190818_20200827_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20150722_20200908_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20180730_20200831_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20220725_20220802_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20220709_20220721_02_T1
Read land cover
read UHI
read UHII
read NDVI
read NDBI
read albedo
read LST
Processing LC08_L2SP_194028_20210706_20210713_02_T1
Read land cover
read UHI
read UHII
read NDVI
read ND

In [10]:
samples = pd.DataFrame()
predict_n= None
sample_n = [
    int(total_samples_per_raster / 4),  # urban, uhi 1
    int(total_samples_per_raster / 4),  # urban, uhi 0
    int(total_samples_per_raster / 4),  # rural/vegetation/bareland, uhi 1
    int(total_samples_per_raster / 4),  # rural/vegetation/bareland, uhi 0
]

for f in file_list:
    if check_wrong_files(f):
        continue

    print(f'Processing {f}')
    file_date_string = f.split('_')[3]  # Extract date from filename
    year = match_landsat_to_landcover(f)
    landcover_path = f'{landcover_base_path}/DUSAF_{year}_MCM_mapped.tif'
    print(landcover_path)
    
    with rasterio.open(landcover_path, driver="GTiff") as landcover_raster:
        landcover_array = landcover_raster.read(1)
        print('Read land cover')

    # Initialize prediction arrays on first iteration
    if not isinstance(predict_n,np.ndarray):
        predict_n = np.zeros_like(landcover_array, dtype=int)
        predict_lst = np.zeros_like(landcover_array, dtype=float)
        predict_ndvi = np.zeros_like(landcover_array, dtype=float)
        predict_ndbi = np.zeros_like(landcover_array, dtype=float)
        predict_albedo = np.zeros_like(landcover_array, dtype=float)

    # Load Landsat-derived rasters
    #add the uhi column
    with rasterio.open(f"{sat_images_path}/{f}/{f}_uhi.tif", driver="GTiff") as uhi_raster:
        print('read UHI')
        uhi_array = uhi_raster.read(1) #UHI band

    #add the uhi intensity column
    #Uncomment to switch to UHI Intensity instead of UHI binary
    
    with rasterio.open(f"{sat_images_path}/{f}/{f}_uhi_int.tif", driver="GTiff") as uhii_raster:
        print('read UHII')
        uhii_array = uhii_raster.read(1) #UHI band
        
    with rasterio.open(f"{sat_images_path}/{f}/{f}_NDVI.TIF", driver="GTiff") as ndvi_raster:
        ndvi_array = ndvi_raster.read(1)

    with rasterio.open(f"{sat_images_path}/{f}/{f}_NDBI.TIF", driver="GTiff") as ndbi_raster:
        ndbi_array = ndbi_raster.read(1)

    with rasterio.open(f"{sat_images_path}/{f}/{f}_albedo.TIF", driver="GTiff") as albedo_raster:
        albedo_array = albedo_raster.read(1)

    with rasterio.open(f"{sat_images_path}/{f}/{f}_LST.TIF", driver="GTiff") as lst_raster:
        lst_array = lst_raster.read(1)

    # Exclude invalid pixels from predictions
    valid_pixels = (ndvi_array != NO_DATA) & (ndbi_array != NO_DATA) & (albedo_array != NO_DATA) & (lst_array != NO_DATA)

    if int(year) >= 2020:
        predict_n += valid_pixels  # Track valid pixel count
        predict_lst[valid_pixels] += lst_array[valid_pixels]
        predict_ndvi[valid_pixels] += ndvi_array[valid_pixels]
        predict_ndbi[valid_pixels] += ndbi_array[valid_pixels]
        predict_albedo[valid_pixels] += albedo_array[valid_pixels]

    # Remove nodata pixels before sampling
    train_df = pd.DataFrame({
        'x': np.tile(np.arange(landcover_array.shape[1]), landcover_array.shape[0]),
        'y': np.repeat(np.arange(landcover_array.shape[0]), landcover_array.shape[1]),
        'landcover': landcover_array.flatten(),
        'uhi': uhi_array.flatten(),
        'uhii': uhii_array.flatten()
        'ndvi': ndvi_array.flatten(),
        'ndbi': ndbi_array.flatten(),
        'albedo': albedo_array.flatten(),
        'lst': lst_array.flatten(),
        'raster': file_date_string
    })

    train_df = train_df[
        (train_df['landcover'] != LC_NO_DATA) & 
        (train_df['ndvi'] != NO_DATA) & 
        (train_df['ndbi'] != NO_DATA) & 
        (train_df['albedo'] != NO_DATA) & 
        (train_df['lst'] != NO_DATA)
    ]

    # Sample the valid data
    for idx, (condition, n_samples) in enumerate([
        (((train_df['landcover'] == URBAN) | (train_df['landcover'] == INDUSTRIAL) | (train_df['landcover'] == TRANSPORTATION)) & (train_df['uhi'] == 0), sample_n[0]),
        (((train_df['landcover'] == URBAN) | (train_df['landcover'] == INDUSTRIAL) | (train_df['landcover'] == TRANSPORTATION)) & (train_df['uhi'] == 1), sample_n[1]),
        (((train_df['landcover'] == URBAN_VEGETATION) | (train_df['landcover'] == RURAL) | (train_df['landcover'] == FOREST)) & (train_df['uhi'] == 0), sample_n[2]),
        (((train_df['landcover'] == URBAN_VEGETATION) | (train_df['landcover'] == RURAL) | (train_df['landcover'] == FOREST)) & (train_df['uhi'] == 1), sample_n[3])
    ]):
        sampled_data = train_df.loc[condition].sample(n=n_samples, random_state=42)
        samples = pd.concat([samples, sampled_data])
    all_samples.append(samples)

Processing LC08_L2SP_194028_20180815_20200831_02_T1
data/landcover/DUSAF_2018_MCM_mapped.tif
Read land cover


KeyError: 'uhi'

In [12]:
samples_base_path = f'training_samples'    
# create the "training_samples" folder if it does not exist
os.makedirs(f"{samples_base_path}", exist_ok=True)
sufix = '_UHII_50mil'

samples_path = f'{samples_base_path}/{city}_samples{sufix}.csv'
print(f'Saving samples in {samples_path}')

samples_to_save = all_samples[len(all_samples) - 1].copy()
samples_to_save = samples_to_save.reset_index(drop=True)
samples_to_save.to_csv(samples_path)
samples_to_save

Saving samples in training_samples/MILANO_samples_UHII_50mil.csv


Unnamed: 0,x,y,landcover,uhi,uhii,ndvi,ndbi,albedo,lst,raster
0,2524,5424,2,0,-1.638275,0.603841,-0.102905,0.143713,304.742096,20180815
1,1075,1404,1,0,-0.359955,0.346324,-0.082911,0.006920,306.020416,20180815
2,10338,3614,1,0,-0.048920,0.669954,-0.226146,0.144723,306.331451,20180815
3,10598,5304,2,0,-0.127533,0.539448,-0.183797,0.124097,306.252838,20180815
4,8954,4967,2,0,-0.137787,0.836907,-0.349640,0.157759,306.242584,20180815
...,...,...,...,...,...,...,...,...,...,...
79995,6530,6946,5,1,6.176117,0.444094,0.037715,0.157283,311.591797,20200719
79996,5720,3147,6,1,2.823029,0.898784,-0.469910,0.155451,308.238708,20200719
79997,4557,5524,5,1,6.996429,0.306539,0.031769,0.203847,312.412109,20200719
79998,5497,2072,5,1,4.015930,0.809082,-0.388333,0.161463,309.431610,20200719


In [13]:
predict_n

7

In [None]:
# Compute final prediction values, avoiding division by zero
valid_mask = predict_n > 0
predict_lst[valid_mask] /= predict_n[valid_mask]
predict_ndvi[valid_mask] /= predict_n[valid_mask]
predict_ndbi[valid_mask] /= predict_n[valid_mask]
predict_albedo[valid_mask] /= predict_n[valid_mask]

# Convert to DataFrame
predict_df = pd.DataFrame({
    'x': np.tile(np.arange(landcover_array.shape[1]), landcover_array.shape[0]),
    'y': np.repeat(np.arange(landcover_array.shape[0]), landcover_array.shape[1]),
    'landcover': landcover_array.flatten().astype('int32'),
    'lst': predict_lst.flatten().astype('float32'),
    'ndvi': predict_ndvi.flatten().astype('float32'),
    'ndbi': predict_ndbi.flatten().astype('float32'),
    'albedo': predict_albedo.flatten().astype('float32')
})

In [14]:
# save the lst and ndvi predict
if predict_n == 0: predict_n = 0.0000001
predict_lst = np.where(landcover_array != LC_NO_DATA, (predict_lst / predict_n), -9999)
predict_ndvi = np.where(landcover_array != LC_NO_DATA, (predict_ndvi / predict_n), -9999)
predict_ndbi = np.where(landcover_array != LC_NO_DATA, (predict_ndbi / predict_n), -9999)
predict_albedo = np.where(landcover_array != LC_NO_DATA, (predict_albedo / predict_n), -9999)

predict_df = pd.DataFrame({'x': x_flat, 'y': y_flat})
predict_df['landcover'] = pd.Series(landcover_array.flatten()).astype('int32')
predict_df['x'] = predict_df['x'].astype('uint32')
predict_df['y'] = predict_df['y'].astype('uint32')
predict_df['lst'] = pd.Series(predict_lst.flatten()).astype('float32')
predict_df['ndvi'] = pd.Series(predict_ndvi.flatten()).astype('float32')
predict_df['ndbi'] = pd.Series(predict_ndbi.flatten()).astype('float32')
predict_df['albedo'] = pd.Series(predict_albedo.flatten()).astype('float32')




In [15]:
predict_df = predict_df.loc[
    (predict_df['landcover'] != LC_NO_DATA) & (predict_df['lst'] != 0) & (predict_df['ndvi'] != 0)
]
predict_df

Unnamed: 0,x,y,landcover,lst,ndvi,ndbi,albedo
3649,3649,0,6,-9999.000000,-9999.000000,-9999.000000,-9999.000000
16850,3648,1,6,-9999.000000,-9999.000000,-9999.000000,-9999.000000
16851,3649,1,6,-9999.000000,-9999.000000,-9999.000000,-9999.000000
16852,3650,1,6,-9999.000000,-9999.000000,-9999.000000,-9999.000000
30051,3647,2,6,-9999.000000,-9999.000000,-9999.000000,-9999.000000
...,...,...,...,...,...,...,...
141141555,12175,10690,5,311.829102,0.529260,-0.115972,0.162138
141141556,12176,10690,5,311.829102,0.529260,-0.115972,0.162138
141141557,12177,10690,5,311.308105,0.568186,-0.153507,0.158356
141141558,12178,10690,5,311.308105,0.568186,-0.153507,0.158356


In [16]:
predict_path = f'{samples_base_path}/{city}_predict{sufix}.csv'
print(f'Saving predict in {predict_path}')
predict_df = predict_df.reset_index(drop=True)

predict_df.to_csv(predict_path)

Saving predict in training_samples/MILANO_predict_UHII_50mil.csv
