In [1]:
import sys
sys.path.append("../..")

#Defining libraries
import os
import pandas as pd
import xarray as xr
import numpy as np
import datacube

import rasterio
import pickle

from sklearn.preprocessing import OneHotEncoder

from modules import processing_module as processing
from modules import ai_module as ai
from modules.ai_module_extended import AIProcessor

# Helper from the project's processing module; used further down to load the
# base prediction samples through importer.import_df(...).
importer = processing.HarmoniaProcessor()

2025-02-18 12:23:42.960004: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-18 12:23:42.961338: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 12:23:42.985760: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 12:23:42.986541: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
'''
1-11-> Residential urban areas 
2-121,13->Industrial and abandoned urban areas
3-122,123,124 Transportation infrastructure (streets, highways, airports, and ports)
4-14->Urban green areas
5-2->Agricultural areas
6-3->Forest
7-4/5->Hydro and humid bodies
'''
# Internal land-cover codes (1..7), in the order listed in the mapping above,
# that the Copernicus 2018 codes are converted to.
(
    URBAN,
    INDUSTRIAL,
    TRANSPORTATION,
    URBAN_VEGETATION,
    RURAL,
    FOREST,
    WATER,
) = range(1, 8)

# Sentinel values: one for the land-cover layer, one for every other layer.
LC_NO_DATA = 9999
NO_DATA = -9999
def check_wrong_files(f):
    """Tell whether a directory entry must be skipped instead of being read.

    Parameters
    ----------
    f : str
        Name of the file or folder being inspected.

    Returns
    -------
    bool
        True when ``f`` is exactly the ``clip`` folder, or when its name
        contains ``ipynb`` (notebook checkpoints), ``tar`` (archives) or
        ``aux`` (auxiliary files); False otherwise.
    """
    if f == 'clip':
        return True
    skipped_markers = ('ipynb', 'tar', 'aux')
    return any(marker in f for marker in skipped_markers)


In [3]:
# --- Study area -------------------------------------------------------------
city = "MILANO"
city_epsg = 32632

# --- Input data -------------------------------------------------------------
data_folder = "data"
#landcover_path = f"{data_folder}/urban_atlas_landcover_comune_milano.tif"
landcover_path = data_folder + "/landcover/DUSAF_2021_milano_mapped.tif"
simulated_vegetation_path = data_folder + "/green_corridors_raster_5m_buffer_bridges.tif"
base_path = 'training_samples'
predict_base_path = base_path + '/' + city + '_predict_simulation_base_summer_mosaic.csv'

# --- Pre-processing switches ------------------------------------------------
encode = True
normalize = True  # when True the loaded scaler is applied before predicting

# --- Model ------------------------------------------------------------------
train_model = False
# NOTE: `model` is rebound to the loaded AIProcessor instance in the
# prediction section, so this string is only valid until that cell runs.
model = 'ANN'
model_file = 'model/model_ANN_regression_test_UHI_regression_summer_avg_10000.pkl'
scaler_file = 'scaler_ANN_regression_test_UHI_regression_summer_avg_10000.pkl'

In [4]:
# SIMULATED VEGETATION Raster

# Read band 1 of the simulated-vegetation raster fully into memory; the file
# handle is not needed once the array has been read.
with rasterio.open(simulated_vegetation_path, driver="GTiff") as simulated_vegetation_raster:
    simulated_vegetation_array = simulated_vegetation_raster.read(1)

rows, cols = simulated_vegetation_array.shape

# Row (y) and column (x) index of every cell, flattened in the same row-major
# order as the raster values, so each table row describes one cell.
y, x = np.indices((rows, cols))
veg_df = pd.DataFrame({
    'x': x.ravel(),
    'y': y.ravel(),
    'new_veg': simulated_vegetation_array.ravel(),
})

In [5]:
# Load the base-scenario samples CSV through the project importer.
# date_format=None is forwarded as-is to the helper.
prediction_base_df = importer.import_df(predict_base_path, date_format=None)

In [6]:
# Merge with Vegetation Data (ensure correct matching)
# Inner join on the (x, y) keys: samples without a matching raster cell are dropped.
# Assumes the x/y columns of the samples are column/row indices on the same grid
# as the vegetation raster -- TODO confirm.
# Run once per fresh load: a second run would merge a second `new_veg` column.
prediction_base_df = prediction_base_df.merge(veg_df, on=['x', 'y'], how='inner')

In [7]:
'''
reference_lst = prediction_base_df.loc[
    (prediction_base_df['landcover'] == URBAN_VEGETATION)|(prediction_base_df['landcover'] == RURAL)|(prediction_base_df['landcover'] == FOREST)
]['lst'].mean()
print(reference_lst)
reference_ndvi = prediction_base_df.loc[
    (prediction_base_df['landcover'] == URBAN_VEGETATION)|(prediction_base_df['landcover'] == RURAL)|(prediction_base_df['landcover'] == FOREST)
]['ndvi'].mean()
print(reference_ndvi)
'''

"\nreference_lst = prediction_base_df.loc[\n    (prediction_base_df['landcover'] == URBAN_VEGETATION)|(prediction_base_df['landcover'] == RURAL)|(prediction_base_df['landcover'] == FOREST)\n]['lst'].mean()\nprint(reference_lst)\nreference_ndvi = prediction_base_df.loc[\n    (prediction_base_df['landcover'] == URBAN_VEGETATION)|(prediction_base_df['landcover'] == RURAL)|(prediction_base_df['landcover'] == FOREST)\n]['ndvi'].mean()\nprint(reference_ndvi)\n"

In [8]:
def fetch_from_odc(odc_datasets, samples, x=None, y=None):
    """Append one column per Open Data Cube product to ``samples``.

    Relies on the notebook-level ``dc`` (``datacube.Datacube``) connection,
    which must exist before this function is called.

    Parameters
    ----------
    odc_datasets : iterable of str
        ODC product names to sample. May be empty, in which case only the
        NaN filtering is applied to ``samples``.
    samples : pandas.DataFrame
        Frame the sampled columns are attached to. Columns are aligned on
        the index, so it must share the index of the sampled data
        (positions 0..n-1 when ``x``/``y`` are given).
    x, y : xarray.DataArray, optional
        Coordinates to sample with nearest-neighbour lookup. When either is
        missing the whole (squeezed) product is used.

    Returns
    -------
    pandas.DataFrame
        ``samples`` plus one column per product (named after the product),
        with every row containing a NaN removed.

    Raises
    ------
    ValueError
        If a requested product has no dataset in the datacube index.
    """
    # Collect the pieces and concatenate once at the end: concatenating
    # inside the loop re-copies the whole (multi-million row) frame each time.
    frames = [samples]

    for odc_product in odc_datasets:
        print(f"Sampling {odc_product}")

        datasets = dc.find_datasets(product=odc_product)
        if not datasets:
            raise ValueError(f"No datasets found in the datacube for product '{odc_product}'")
        cf_data = dc.load(datasets=datasets)

        # The first data variable is taken as the product's measurement.
        cf_var_name = list(cf_data.data_vars.keys())[0]

        cf_sel = cf_data.squeeze()
        if x is not None and y is not None:
            cf_sel = cf_sel.sel(y=y, x=x, method='nearest')
        cf_df = cf_sel.to_dataframe()
        del cf_sel, cf_data  # release the loaded rasters as soon as possible

        # Keep only the measurement; bookkeeping columns may or may not be
        # present depending on the product, hence errors='ignore'.
        cf_df = cf_df.rename(columns={cf_var_name: odc_product}).drop(
            columns=['time', 'spatial_ref', 'x', 'y'], errors='ignore'
        )

        if len(frames) == 1:
            # First product: keep the frame as produced and report the sizes
            # so a length mismatch with the samples is visible immediately.
            print(len(samples), len(cf_df))
            frames.append(cf_df)
        else:
            frames.append(cf_df[odc_product].astype('float32'))

    odc_df = pd.concat(frames, axis=1).dropna()

    print('odc_df Ready!')
    return odc_df


In [9]:
#Example of datacube config file:
#datacube_config_path = "/home/user/datacube.conf"

datacube_config_path = "/home/user/datacube.conf"

# Notebook-wide datacube connection; fetch_from_odc() reads it as a global.
dc = datacube.Datacube(app="my_app", config=datacube_config_path)

# List the name of every product in the index, one per line.
products = dc.list_products()
for product_name in products['name'].values:
    print(product_name)

accelerazione_suolo
aspect
building_height
corine_urban_atlas_milan
densita_popolazione
dtm_milan
dusaf
dusaf15
dusaf99
fattori_amplificazione
flood_extent
flood_extent_year
geologia
hillshade
ixelles_dem
ixelles_distance_to_roads
ixelles_distance_to_tracks
ixelles_distance_to_water
ixelles_imperviousness
ixelles_landcover
ixelles_population
ixelles_slope
litologia_superficiale
main_road_distance
metropolitana
ndvi_2000
ndvi_2002
ndvi_2014
ndvi_2019
piezometrie_profondo
piezometrie_superficiale
piraeus_building_height
piraeus_dem
piraeus_distance_to_roads
piraeus_distance_to_tracks
piraeus_imperviousness
piraeus_landcover
piraeus_landcover_for_uhi
piraeus_population
piraeus_slope
plan_curvature
profile_curvature
reticolo_idrografico
river_distance
sabbie_falda
slope
sofia_building_height
sofia_dem
sofia_distance_to_road
sofia_distance_to_train_tracks
sofia_distance_to_water
sofia_imperviousness
sofia_landcover
sofia_population
sofia_slope
soggiacenza_falda
spi
strade_ferrovie
temperatu

  from pkg_resources import iter_entry_points


In [10]:
# ODC products sampled for every pixel; the order matters because it also
# fixes the order of these features in the model input.
odc_datasets = [
    'building_height',
    'densita_popolazione',
    'main_road_distance',
    'river_distance',
    'water_distance',
]

# PREDICTION

In [11]:
# Build the project's AIProcessor wrapper and load the stored model and scaler.
# NOTE: this rebinds `model`, which held the string 'ANN' from the configuration cell.
# NOTE(review): both files are .pkl -- presumably unpickled by the helper, so
# only load files from a trusted source. The meaning of the third positional
# argument (False) is not visible here -- TODO confirm.
model = AIProcessor('ANN','regression',False)
model.load_model(model_file)
model.load_scaler(scaler_file)

In [12]:
# get raster parameters from landcover (affine transform: origin and pixel size)
with rasterio.open(landcover_path, driver="GTiff") as base_raster:
    transform = base_raster.transform
    init_x = transform[2]
    init_y = transform[5]
    step_x = transform[0]
    step_y = transform[4]

# Fix the dtypes of the base samples. Casting is idempotent, so it is safe to
# do it directly on prediction_base_df.
column_dtypes = {
    'lst': 'float32',
    'ndvi': 'float32',
    'ndbi': 'float32',
    'albedo': 'float32',
    'landcover': 'int32',
    'x': 'float64',
    'y': 'float64',
}
for column, dtype in column_dtypes.items():
    prediction_base_df[column] = prediction_base_df[column].astype(dtype)

#predict['landcover'] = predict['landcover'].apply(map_urban_atlas_class).astype('int32')

# Convert pixel indices to map coordinates in a NEW frame (vectorised).
# prediction_base_df keeps its pixel indices, so re-running this cell does not
# apply the affine transform a second time.
# NOTE(review): origin + index * step addresses the pixel corner, not its
# centre -- confirm this matches how the training samples were generated.
predict = prediction_base_df.assign(
    x=init_x + prediction_base_df['x'] * step_x,
    y=init_y + prediction_base_df['y'] * step_y,
)

predict_x_positions = predict.x.values
predict_y_positions = predict.y.values

full_data_df = fetch_from_odc(
    odc_datasets,
    predict,
    x=xr.DataArray(predict_x_positions, dims=['index']),
    y=xr.DataArray(predict_y_positions, dims=['index'])
)

#remove water pixels
full_data_df = full_data_df.loc[
    full_data_df['landcover'] != WATER
].reset_index(drop=True)
print("ready")

#remove nodata from odc datasets: keep rows where no sampled layer is NO_DATA
has_all_layers = (full_data_df[odc_datasets] != NO_DATA).all(axis=1)
full_data_df = full_data_df.loc[has_all_layers].reset_index(drop=True)

full_data_df

Sampling building_height
7201808 7201808
Sampling densita_popolazione
Sampling main_road_distance
Sampling river_distance
Sampling water_distance
odc_df Ready!
ready


Unnamed: 0,x,y,landcover,lst,ndvi,ndbi,albedo,uhii,new_veg,building_height,densita_popolazione,main_road_distance,river_distance,water_distance
0,513016.1148,5042500.992,2,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,82.462112,123.693169,123.693169
1,513021.1148,5042500.992,2,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,80.777473,122.576508,122.576508
2,513026.1148,5042500.992,2,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,79.056938,121.655258,121.655258
3,513031.1148,5042500.992,2,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,77.620872,120.933868,120.933868
4,513036.1148,5042500.992,2,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,76.485291,120.208145,120.208145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7201800,515101.1148,5025940.992,5,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,475.946411,29.154758,29.154758
7201801,515106.1148,5025940.992,5,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,476.759918,33.541019,33.541019
7201802,515081.1148,5025935.992,5,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,477.624329,11.180340,11.180340
7201803,515086.1148,5025935.992,5,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,478.539429,14.142136,14.142136


In [13]:
# One-hot encode the categorical land-cover column.
# The categories are fixed to the six non-water classes so that the
# landcover_landcover_1..6 columns required by the model always exist, in the
# same order, even when a class is absent from the prediction data. Any other
# value is encoded as all zeros (handle_unknown='ignore').
# NOTE: scikit-learn >= 1.2 renames `sparse` to `sparse_output` (the old name
# was removed in 1.4); switch the keyword when upgrading.
landcover_classes = [URBAN, INDUSTRIAL, TRANSPORTATION, URBAN_VEGETATION, RURAL, FOREST]
encoders = {
        "landcover": OneHotEncoder(
            categories=[landcover_classes],
            sparse=False, dtype='uint16', handle_unknown='ignore'
        )
    }
if encode:
    #encode categorical columns
    encoding_columns = ['landcover']
    all_encoded_columns = []
    for enc in encoding_columns:
        enc_list = full_data_df[enc].values.reshape(-1, 1)
        encoded_data = encoders[enc].fit_transform(enc_list)

        # Feature names already carry the column name, so the final names are
        # doubled ("landcover_landcover_<code>"); kept as-is for the model.
        encoded_columns = [f"{enc}_{category}" for category in encoders[enc].get_feature_names_out([enc])]
        print(encoded_columns)

        # Reuse the frame's own index so rows line up whatever the index is.
        encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=full_data_df.index)
        full_data_df = pd.concat([full_data_df.drop(columns=[enc]), encoded_df], axis=1)
        all_encoded_columns += encoded_columns

    full_data_df = full_data_df.dropna()
    for enc_col in all_encoded_columns:
        full_data_df[enc_col] = full_data_df[enc_col].astype('uint16')

    #drop resulting _nan columns
    _nan_columns = [column for column in full_data_df.columns if '_nan' in column]
    full_data_df = full_data_df.drop(columns=_nan_columns)

full_data_df



['landcover_landcover_1', 'landcover_landcover_2', 'landcover_landcover_3', 'landcover_landcover_4', 'landcover_landcover_5', 'landcover_landcover_6']


Unnamed: 0,x,y,lst,ndvi,ndbi,albedo,uhii,new_veg,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_4,landcover_landcover_5,landcover_landcover_6
0,513016.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,82.462112,123.693169,123.693169,0,1,0,0,0,0
1,513021.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,80.777473,122.576508,122.576508,0,1,0,0,0,0
2,513026.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,79.056938,121.655258,121.655258,0,1,0,0,0,0
3,513031.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,77.620872,120.933868,120.933868,0,1,0,0,0,0
4,513036.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,76.485291,120.208145,120.208145,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7201800,515101.1148,5025940.992,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,475.946411,29.154758,29.154758,0,0,0,0,1,0
7201801,515106.1148,5025940.992,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,476.759918,33.541019,33.541019,0,0,0,0,1,0
7201802,515081.1148,5025935.992,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,477.624329,11.180340,11.180340,0,0,0,0,1,0
7201803,515086.1148,5025935.992,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,478.539429,14.142136,14.142136,0,0,0,0,1,0


In [14]:
#full_data_df['landcover_landcover_5'] = 0

In [15]:
# Store every ODC-sampled column as float32 to halve its footprint.
float32_columns = dict.fromkeys(odc_datasets, 'float32')
full_data_df = full_data_df.astype(float32_columns)

full_data_df

Unnamed: 0,x,y,lst,ndvi,ndbi,albedo,uhii,new_veg,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_4,landcover_landcover_5,landcover_landcover_6
0,513016.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,82.462112,123.693169,123.693169,0,1,0,0,0,0
1,513021.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,80.777473,122.576508,122.576508,0,1,0,0,0,0
2,513026.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,79.056938,121.655258,121.655258,0,1,0,0,0,0
3,513031.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,77.620872,120.933868,120.933868,0,1,0,0,0,0
4,513036.1148,5042500.992,319.141266,0.080655,0.082474,0.213476,8.545837,0,0.0,0.0,76.485291,120.208145,120.208145,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7201800,515101.1148,5025940.992,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,475.946411,29.154758,29.154758,0,0,0,0,1,0
7201801,515106.1148,5025940.992,306.759674,0.743142,-0.341891,0.127842,-3.835744,0,0.0,0.0,476.759918,33.541019,33.541019,0,0,0,0,1,0
7201802,515081.1148,5025935.992,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,477.624329,11.180340,11.180340,0,0,0,0,1,0
7201803,515086.1148,5025935.992,306.472748,0.773112,-0.405859,0.149351,-4.122671,0,0.0,0.0,478.539429,14.142136,14.142136,0,0,0,0,1,0


Change NDVI, NDBI, albedo, building height, and land cover of the new vegetation areas

In [19]:
# Reference values for the scenario: the mean of each variable over the
# pixels currently classified as urban vegetation in the base data.
urban_vegetation_pixels = prediction_base_df.loc[
    prediction_base_df['landcover'] == URBAN_VEGETATION
]

reference_lst = urban_vegetation_pixels['lst'].mean()
print(f'reference LST: {reference_lst} K')

reference_ndvi = urban_vegetation_pixels['ndvi'].mean()
print(f'reference NDVI: {reference_ndvi}')

reference_ndbi = urban_vegetation_pixels['ndbi'].mean()
print(f'reference NDBI: {reference_ndbi}')

reference_albedo = urban_vegetation_pixels['albedo'].mean()
print(f'reference albedo: {reference_albedo}')

reference LST: 314.9889221191406 K
reference NDVI: 0.5751705169677734
reference NDBI: -0.171836256980896
reference albedo: 0.1505322903394699


In [20]:
# Rewrite the pixels flagged as new vegetation so they look like urban vegetation.
new_veg_mask = full_data_df['new_veg'] == 1

full_data_df.loc[new_veg_mask, 'lst'] = reference_lst# - 6
full_data_df.loc[new_veg_mask, 'ndvi'] = reference_ndvi
full_data_df.loc[new_veg_mask, 'ndbi'] = reference_ndbi
full_data_df.loc[new_veg_mask, 'albedo'] = reference_albedo
full_data_df.loc[new_veg_mask, 'building_height'] = 0

# The land cover is one-hot encoded at this point: there is no 'landcover'
# column any more, so assigning to it would only create a stray column that the
# model never reads. Switch the class in the encoded columns instead.
landcover_columns = [
    column for column in full_data_df.columns
    if str(column).startswith('landcover_landcover_')
]
urban_vegetation_column = f'landcover_landcover_{URBAN_VEGETATION}'
if urban_vegetation_column not in landcover_columns:
    raise KeyError(
        f"'{urban_vegetation_column}' not found: the land cover must be one-hot "
        "encoded before applying the vegetation scenario"
    )
full_data_df.loc[new_veg_mask, landcover_columns] = 0
full_data_df.loc[new_veg_mask, urban_vegetation_column] = 1

In [16]:
# Inspect the rows flagged as new vegetation (new_veg == 1).
# NOTE(review): the saved output carries execution count 16, i.e. it was produced
# before the scenario cells (19, 20) ran -- re-run in order to refresh it.
full_data_df.loc[full_data_df['new_veg'] == 1]

Unnamed: 0,x,y,lst,ndvi,ndbi,albedo,uhii,new_veg,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_4,landcover_landcover_5,landcover_landcover_6
31,513711.1148,5042500.992,315.331299,0.347768,-0.064722,0.136593,4.735891,1,0.0,0.002,35.0,0.000000,0.000000,0,1,0,0,0,0
70,513706.1148,5042495.992,315.331299,0.347768,-0.064722,0.136593,4.735891,1,0.0,0.002,40.0,5.000000,5.000000,0,1,0,0,0,0
71,513711.1148,5042495.992,315.331299,0.347768,-0.064722,0.136593,4.735891,1,0.0,0.002,40.0,0.000000,0.000000,0,1,0,0,0,0
118,513706.1148,5042490.992,315.331299,0.347768,-0.064722,0.136593,4.735891,1,0.0,0.002,45.0,5.000000,5.000000,0,1,0,0,0,0
174,513706.1148,5042485.992,314.628510,0.474276,-0.128930,0.134148,4.033091,1,0.0,0.002,50.0,5.000000,5.000000,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195076,514526.1148,5026295.992,313.826050,0.479106,-0.121877,0.135838,3.230621,1,0.0,0.000,0.0,31.622776,31.622776,0,0,1,0,0,0
7195077,514531.1148,5026295.992,313.826050,0.479106,-0.121877,0.135838,3.230621,1,0.0,0.000,0.0,30.413815,30.413815,0,0,1,0,0,0
7195231,514516.1148,5026290.992,313.826050,0.479106,-0.121877,0.135838,3.230621,1,0.0,0.000,0.0,31.622776,31.622776,0,0,1,0,0,0
7195232,514521.1148,5026290.992,313.826050,0.479106,-0.121877,0.135838,3.230621,1,0.0,0.000,0.0,29.154758,29.154758,0,0,1,0,0,0


In [21]:
# Keep only the coordinates and the features consumed by the model.
coordinate_columns = ['x', 'y']
spectral_columns = ['ndvi', 'ndbi', 'albedo']
context_columns = [
    'building_height', 'densita_popolazione', 'main_road_distance',
    'river_distance', 'water_distance',
]
landcover_columns = [f'landcover_landcover_{code}' for code in range(1, 7)]

full_data_df = full_data_df[
    coordinate_columns + spectral_columns + context_columns + landcover_columns
]

In [22]:
predict_df = full_data_df.copy()

# Set the coordinates aside (they are not model features) to re-attach later.
data_coord = predict_df[['y', 'x']].copy()
predict_df = predict_df.drop(columns=['x', 'y'])

#Fix order column for model
landcover_features = [f"landcover_landcover_{code}" for code in range(1, 7)]
column_order = ['ndvi', 'ndbi', 'albedo', *odc_datasets, *landcover_features] #["ndvi","lst"]
predict_df = predict_df[column_order]

# ignore the warnings for feature names. The important thing is that the dataset has the same order of the training one
# to remove the warnings create a dataframe with the normalized dataset and the column list
batch_size = 1000000

full_data_predict = model.scaler.transform(predict_df) if normalize else predict_df

# predict in batches of `batch_size` rows, reporting each batch's start offset
probs = []
for batch_start in range(0, len(full_data_predict), batch_size):
    batch_stop = batch_start + batch_size
    probs.append(model.predict(full_data_predict[batch_start:batch_stop]))#[:,1]
    print(f'Done {batch_start}')

full_data_prob = np.concatenate(probs, axis=0)
full_data_prob


Done 0
Done 1000000
Done 2000000
Done 3000000
Done 4000000
Done 5000000
Done 6000000
Done 7000000


array([[ 5.228222 ],
       [ 5.2253213],
       [ 5.2229695],
       ...,
       [-2.3796601],
       [-2.3780608],
       [-2.3749943]], dtype=float32)

In [23]:
# Put the predictions in a frame and re-attach the x, y coordinates.
# Every index is reset so the rows are matched purely by position.
full_data_df = (
    pd.DataFrame(full_data_prob)
    .reset_index(drop=True)
    .assign(
        x=data_coord['x'].reset_index(drop=True),
        y=data_coord['y'].reset_index(drop=True),
    )
)

In [24]:
# Name the prediction column, round it to 4 decimals and store it as float32.
full_data_df = (
    full_data_df
    .rename(columns={0: 'probability'})
    .round({'probability': 4})
    .astype({'probability': 'float32'})
)


In [25]:
# Write the predictions (with their x, y coordinates) to CSV.
base_path = 'predictions'
predictions_path = f'{base_path}/{city}_UHII_green_corridor_5m.csv'

# to_csv does not create missing folders; make sure the target exists so the
# long prediction run is not lost on a fresh checkout.
os.makedirs(base_path, exist_ok=True)

print(f'Saving to {predictions_path}')
full_data_df.to_csv(predictions_path)


Saving to predictions/MILANO_UHII_green_corridor_5m.csv
