In [1]:
import sys
sys.path.append("../..")

#Defining libraries
import os
import math
from datetime import date, timedelta
import pandas as pd
import xarray as xr
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from shapely.geometry import box
from scipy.interpolate import griddata, interpn
import datacube
from copy import deepcopy
import statsmodels.api as sm

import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import rioxarray as rxr
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier

from modules import processing_module as processing
from modules import ai_module as ai

2025-02-10 08:07:35.553732: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-10 08:07:35.576707: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-10 08:07:35.687482: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-10 08:07:35.688280: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Mapping the Urban atlas classes (Corine 2018 classes) 
#  to 5 classes (Urban, Rural, Vegetation, Water, and Bareland)
"""
11100 Continuous Urban Fabric (S.L. > 80%)
11210 Discontinuous Dense Urban Fabric (S.L. : 50% - 80%)
11220 Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)
11230 Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)
11240 Discontinuous Very Low Density Urban Fabric (S.L. < 10%)
11300 Isolated Structures
12100 Industrial, commercial, public, military and private units
12210 Fast transit roads and associated land
12220 Other roads and associated land
12230 Railways and associated land
12300 Port areas
12400 Airports
13100 Mineral extraction and dump sites
13300 Construction sites
13400 Land without current use
14100 Green urban areas
14200 Sports and leisure facilities

21000 Arable land (annual crops)
22000 Permanent crops (vineyards, fruit trees, olive groves)
23000 Pastures
24000 Complex and mixed cultivation patterns
25000 Orchards at the fringe of urban classes

31000 Forests
32000 Herbaceous vegetation associations (natural grassland, moors...)
33000 Open spaces with little or no vegetation (beaches, dunes, bare rocks, glaciers)

40000 Wetland

50000 Water bodies    
"""

#Convert from copernicus code 2018 to an internal code
URBAN = 1
RURAL = 2
VEGETATION = 3
WATER = 4
BARELAND = 5
def map_urban_atlas_class(code_18):
    if code_18 == -9999: return -9999
    if code_18 == 11100: return URBAN
    if code_18 == 11210: return URBAN
    if code_18 == 11220: return URBAN
    if code_18 == 11230: return URBAN
    if code_18 == 11240: return URBAN
    if code_18 == 11300: return URBAN
    if code_18 == 12100: return URBAN
    if code_18 == 12210: return URBAN
    if code_18 == 12220: return URBAN
    if code_18 == 12230: return URBAN
    if code_18 == 12300: return URBAN
    if code_18 == 12400: return URBAN
    if code_18 == 13100: return URBAN
    if code_18 == 13300: return URBAN
    if code_18 == 13400: return URBAN
    if code_18 == 14100: return URBAN
    if code_18 == 14200: return URBAN
    
    if code_18 == 21000: return RURAL
    if code_18 == 22000: return RURAL
    if code_18 == 23000: return RURAL
    if code_18 == 24000: return RURAL
    if code_18 == 25000: return RURAL
    
    if code_18 == 31000: return VEGETATION
    if code_18 == 32000: return VEGETATION
    if code_18 == 33000: return BARELAND
    
    if code_18 == 40000: return WATER
    if code_18 == 50000: return WATER
    
    return -9999
    
# Function to check if the file is a tiff and must be read.
def check_wrong_files(file_name):
    if f == 'clip': return True #avoid entering the "clip" folder
    if 'ipynb' in f: return True #avoid entering the "ipynb_checkpoint" file
    if 'tar' in f: return True #avoid entering "tar" files
    if 'aux' in f: return True #avoid entering "aux" files
    return False


In [3]:
def fetch_from_odc(odc_datasets, samples, x=None, y=None):
    odc_df = None
    for df_name in odc_datasets:
        print(f"Sampling {df_name}")
        #odc datasets to be merged
        odc_product = df_name

        datasets = dc.find_datasets(product=odc_product)
        cf_data = dc.load(datasets=datasets)
        if x is not None and y is not None:
            cf_sel = cf_data.squeeze().sel(
                y=y, 
                x=x, 
                method='nearest'
            )
        else:
            cf_sel = cf_data.squeeze()

        cf_var_name = list(cf_data.data_vars.keys())[0]
        cf_df = cf_sel.to_dataframe()
        del cf_sel
        
        cf_df.rename(columns={cf_var_name:odc_product},inplace=True)
        cf_df.drop(['time','spatial_ref'],axis=1,inplace=True)
        if 'x' in list(cf_df.columns): cf_df.drop(['x'],axis=1,inplace=True)
        if 'y' in list(cf_df.columns): cf_df.drop(['y'],axis=1,inplace=True)

        del cf_data
        
        if odc_df is None:
            odc_df = cf_df.copy()
            print(len(samples), len(odc_df))
            odc_df = pd.concat([samples, odc_df], axis=1)
        else:
            odc_df = pd.concat([odc_df, cf_df[odc_product].astype('float32')], axis=1)

    odc_df = odc_df.dropna()

    print('odc_df Ready!')
    return odc_df


In [4]:
city = "MILANO"
city_epsg = 32632
data_folder = "data"
landcover_path = f"{data_folder}/urban_atlas_landcover_comune_milano.tif"
encode = True
normalize = True
train_model = False
model = 'ANN'

In [5]:
#Example of datacube config file:
#datacube_config_path = "/home/user/datacube.conf"

datacube_config_path = "/home/user/datacube.conf"
dc = datacube.Datacube(app = "my_app", config = datacube_config_path)
products = dc.list_products()
for p in products.name.values:
    print(p)

accelerazione_suolo
aspect
building_height
corine_urban_atlas_milan
densita_popolazione
dtm_milan
dusaf
dusaf15
dusaf99
fattori_amplificazione
flood_extent
flood_extent_year
geologia
hillshade
ixelles_dem
ixelles_distance_to_roads
ixelles_distance_to_tracks
ixelles_distance_to_water
ixelles_imperviousness
ixelles_landcover
ixelles_population
ixelles_slope
litologia_superficiale
main_road_distance
metropolitana
ndvi_2000
ndvi_2002
ndvi_2014
ndvi_2019
piezometrie_profondo
piezometrie_superficiale
piraeus_building_height
piraeus_dem
piraeus_distance_to_roads
piraeus_distance_to_tracks
piraeus_imperviousness
piraeus_landcover
piraeus_landcover_for_uhi
piraeus_population
piraeus_slope
plan_curvature
profile_curvature
reticolo_idrografico
river_distance
sabbie_falda
slope
sofia_building_height
sofia_dem
sofia_distance_to_road
sofia_distance_to_train_tracks
sofia_distance_to_water
sofia_imperviousness
sofia_landcover
sofia_population
sofia_slope
soggiacenza_falda
spi
strade_ferrovie
temperatu

In [6]:
#The datasets from the ODC from which data is sampled
odc_datasets = [
    'building_height', 'densita_popolazione',
    'main_road_distance', 'river_distance', 
    'water_distance'
]

# PREDICTION

In [42]:
if not train_model:
    model_score = '86'
    # Load the trained model using pickle
    model_file = f'model/model_{model}_86.pkl'
    with open(model_file, 'rb') as file:
        ai_model = pickle.load(file)

In [43]:
# predict the whole image
importer = processing.HarmoniaProcessor()

# get raster parameters from landcover
with rasterio.open(landcover_path, driver="GTiff") as base_raster:
    transform = base_raster.transform
    init_x = transform[2]
    init_y = transform[5]
    step_x = transform[0]
    step_y = transform[4]

#import samples
base_path = f'training_samples'
predict_path = f'{base_path}/{city}_predict_simulated_vegetation.csv'
predict = importer.import_df(predict_path, date_format=None)

predict['lst'] = predict['lst'].astype('float32')
predict['ndvi'] = predict['ndvi'].astype('float32')
predict['landcover'] = predict['landcover'].astype('int32')
predict['x'] = predict['x'].astype('float64')
predict['y'] = predict['y'].astype('float64')

#predict['landcover'] = predict['landcover'].apply(map_urban_atlas_class).astype('int32')

predict['x'] = predict['x'].apply(
    lambda x: init_x + (x * step_x)
)
predict['y'] = predict['y'].apply(
    lambda y: init_y + (y * step_y)
)

predict_x_positions = predict.x.values
predict_y_positions = predict.y.values

full_data_df = fetch_from_odc(
    odc_datasets, 
    predict,
    x=xr.DataArray(predict_x_positions, dims=['index']),
    y=xr.DataArray(predict_y_positions, dims=['index'])
)

#remove water pixels
full_data_df = full_data_df.loc[
    full_data_df['landcover'] != WATER
].reset_index(drop=True)
print("ready")

#remove nodata from odc datasets
for col in odc_datasets:
    full_data_df = full_data_df.loc[
        full_data_df[col] != -9999
    ].reset_index(drop=True)

full_data_df

Sampling building_height
7270553 7270553
Sampling densita_popolazione
Sampling main_road_distance
Sampling river_distance
Sampling water_distance
odc_df Ready!
ready


Unnamed: 0,x,y,lst,landcover,new_veg,ndvi,building_height,densita_popolazione,main_road_distance,river_distance,water_distance
0,513045.0,5042505.0,316.944427,1,0.0,0.018478,0.0,0.000,79.056938,112.805138,112.805138
1,513610.0,5042505.0,315.047424,2,0.0,0.222084,0.0,0.004,30.000000,10.000000,10.000000
2,513615.0,5042505.0,315.047424,2,0.0,0.222084,0.0,0.004,30.000000,11.180340,11.180340
3,513620.0,5042505.0,315.047424,2,0.0,0.222084,0.0,0.004,30.000000,14.142136,14.142136
4,513625.0,5042505.0,315.047424,2,0.0,0.222084,0.0,0.004,30.000000,18.027756,18.027756
...,...,...,...,...,...,...,...,...,...,...,...
7213011,515085.0,5025940.0,306.662994,2,0.0,0.360845,0.0,0.000,473.629608,18.027756,18.027756
7213012,515090.0,5025940.0,306.662994,2,0.0,0.360845,0.0,0.000,474.473389,21.213203,21.213203
7213013,515095.0,5025940.0,306.662994,2,0.0,0.360845,0.0,0.000,475.184174,25.000000,25.000000
7213014,515100.0,5025940.0,306.662994,2,0.0,0.360845,0.0,0.000,475.946411,29.154758,29.154758


In [44]:
encoders = {
        "landcover": OneHotEncoder(sparse=False, dtype='uint16', handle_unknown='ignore')
    }
if True:
    #encode categorical columns
    encoding_columns = ['landcover']
    all_encoded_columns = []
    for enc in encoding_columns:
        enc_list = full_data_df[enc].values.reshape(-1, 1)
        encoded_data = encoders[enc].fit_transform(enc_list)

        encoded_columns = [f"{enc}_{category}" for category in encoders[enc].get_feature_names_out([enc])]
        print(encoded_columns)
        full_data_df = pd.concat([full_data_df, pd.DataFrame(encoded_data, columns=encoded_columns)], axis=1)
        all_encoded_columns += encoded_columns.copy()
        full_data_df = full_data_df.drop(columns=[enc])

    full_data_df = full_data_df.dropna()
    for enc_col in all_encoded_columns:
        full_data_df[enc_col] = full_data_df[enc_col].astype('uint16')

    #drop resulting _nan columns
    _nan_columns = list(filter(lambda x: '_nan' in x, list(full_data_df.columns)))
    full_data_df = full_data_df.drop(columns=_nan_columns)

full_data_df

['landcover_landcover_1', 'landcover_landcover_2', 'landcover_landcover_3']


Unnamed: 0,x,y,lst,new_veg,ndvi,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3
0,513045.0,5042505.0,316.944427,0.0,0.018478,0.0,0.000,79.056938,112.805138,112.805138,1,0,0
1,513610.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,10.000000,10.000000,0,1,0
2,513615.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,11.180340,11.180340,0,1,0
3,513620.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,14.142136,14.142136,0,1,0
4,513625.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,18.027756,18.027756,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213011,515085.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,473.629608,18.027756,18.027756,0,1,0
7213012,515090.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,474.473389,21.213203,21.213203,0,1,0
7213013,515095.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,475.184174,25.000000,25.000000,0,1,0
7213014,515100.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,475.946411,29.154758,29.154758,0,1,0


In [45]:
full_data_df['landcover_landcover_5'] = 0

In [46]:
#Convert all odc dataset columns to float32 to save disk
for col in odc_datasets:
    full_data_df[col] = full_data_df[col].astype('float32')

full_data_df

Unnamed: 0,x,y,lst,new_veg,ndvi,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_5
0,513045.0,5042505.0,316.944427,0.0,0.018478,0.0,0.000,79.056938,112.805138,112.805138,1,0,0,0
1,513610.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,10.000000,10.000000,0,1,0,0
2,513615.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,11.180340,11.180340,0,1,0,0
3,513620.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,14.142136,14.142136,0,1,0,0
4,513625.0,5042505.0,315.047424,0.0,0.222084,0.0,0.004,30.000000,18.027756,18.027756,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213011,515085.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,473.629608,18.027756,18.027756,0,1,0,0
7213012,515090.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,474.473389,21.213203,21.213203,0,1,0,0
7213013,515095.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,475.184174,25.000000,25.000000,0,1,0,0
7213014,515100.0,5025940.0,306.662994,0.0,0.360845,0.0,0.000,475.946411,29.154758,29.154758,0,1,0,0


In [47]:
with open('scaler.pkl','rb') as f:
    scaler = pickle.load(f)

In [48]:
predict_df = full_data_df.copy()

data_coord = pd.concat([predict_df[col] for col in ['y', 'x']], axis=1)
predict_df = predict_df.drop(columns=['x','y'])

#Fix order column for model
column_order = ["ndvi","lst"]
column_order += odc_datasets
column_order += ["landcover_landcover_1","landcover_landcover_2","landcover_landcover_3","landcover_landcover_5"]

predict_df = predict_df[column_order]

# ignore the warnings for feature names. The important thing is that the dataset has the same order of the training one
# to remove the warnings create a dataframe with the normalized dataset and the column list
batch_size = 1000000

if normalize:
    full_data_predict = scaler.transform(predict_df)
else:
    full_data_predict = predict_df

# predict probabilities in batches
probs = []
for i in range(0, len(full_data_predict), batch_size):
    batch = full_data_predict[i:i+batch_size]
    batch_probs = ai_model.predict_proba(batch)[:,1]
    probs.append(batch_probs)
    print(f'Done {i}')

full_data_prob = np.concatenate(probs, axis=0)
full_data_prob


Done 0
Done 1000000
Done 2000000
Done 3000000
Done 4000000
Done 5000000
Done 6000000
Done 7000000


array([0.77897182, 0.99003225, 0.99002642, ..., 0.24064105, 0.2399068 ,
       0.2391399 ])

In [49]:
#append the x,y coordinates to the probabilities
full_data_df = pd.DataFrame(full_data_prob).reset_index(drop=True)
full_data_df['x'] = data_coord['x'].reset_index(drop=True)
full_data_df['y'] = data_coord['y'].reset_index(drop=True)

In [50]:
full_data_df.rename(columns={0:'probability'}, inplace=True)
full_data_df = full_data_df.round({'probability': 4})
full_data_df['probability'] = full_data_df['probability'].astype('float32')


In [33]:
# Leftover inspection cell: displays the current value of base_path. Its execution
# count (In [33]) is out of sequence with the neighbouring cells.
base_path

'training_samples'

In [51]:
base_path = 'predictions'
predictions_path = f'{base_path}/{city}_UHI_ANN_sim_veg2.csv'
print(f'Saving to {predictions_path}')
full_data_df.to_csv(predictions_path)


Saving to predictions/MILANO_UHI_ANN_sim_veg2.csv
