In [7]:
import sys
sys.path.append("../..")

#Defining libraries
import os
import math
from datetime import date, timedelta
import pandas as pd
import xarray as xr
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from shapely.geometry import box
from scipy.interpolate import griddata, interpn
import datacube
from copy import deepcopy
import statsmodels.api as sm

import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import rioxarray as rxr
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier

from modules import processing_module as processing
from modules import ai_module as ai

2023-11-29 11:33:02.892520: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-29 11:33:02.914162: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-29 11:33:03.019844: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-29 11:33:03.020428: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Mapping the Urban atlas classes (Corine 2018 classes) 
#  to 5 classes (Urban, Rural, Vegetation, Water, and Bareland)
"""
11100 Continuous Urban Fabric (S.L. &amp;gt; 80%)
11210 Discontinuous Dense Urban Fabric (S.L. : 50% - 80%)
11220 Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)
11230 Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)
11240 Discontinuous Very Low Density Urban Fabric (S.L. &amp;lt; 10%)
11300 Isolated Structures
12100 Industrial, commercial, public, military and private units
12210 Fast transit roads and associated land
12220 Other roads and associated land
12230 Railways and associated land
12300 Port areas
12400 Airports
13100 Mineral extraction and dump sites
13300 Construction sites
13400 Land without current use
14100 Green urban areas
14200 Sports and leisure facilities

21000 Arable land (annual crops)
22000 Permanent crops (vineyards, fruit trees, olive groves)
23000 Pastures
24000 Complex and mixed cultivation patterns
25000 Orchards at the fringe of urban classes

31000 Forests
32000 Herbaceous vegetation associations (natural grassland, moors...)
33000 Open spaces with little or no vegetations (beaches, dunes, bare rocks, glaciers)

40000 Wetland

50000 Water bodies    
"""

#Convert from copernicus code 2018 to an internal code
URBAN = 1
RURAL = 2
VEGETATION = 3
WATER = 4
BARELAND = 5
def map_urban_atlas_class(code_18):
    if code_18 == -9999: return -9999
    if code_18 == 11100: return URBAN
    if code_18 == 11210: return URBAN
    if code_18 == 11220: return URBAN
    if code_18 == 11230: return URBAN
    if code_18 == 11240: return URBAN
    if code_18 == 11300: return URBAN
    if code_18 == 12100: return URBAN
    if code_18 == 12210: return URBAN
    if code_18 == 12220: return URBAN
    if code_18 == 12230: return URBAN
    if code_18 == 12300: return URBAN
    if code_18 == 12400: return URBAN
    if code_18 == 13100: return URBAN
    if code_18 == 13300: return URBAN
    if code_18 == 13400: return URBAN
    if code_18 == 14100: return URBAN
    if code_18 == 14200: return URBAN
    
    if code_18 == 21000: return RURAL
    if code_18 == 22000: return RURAL
    if code_18 == 23000: return RURAL
    if code_18 == 24000: return RURAL
    if code_18 == 25000: return RURAL
    
    if code_18 == 31000: return VEGETATION
    if code_18 == 32000: return VEGETATION
    if code_18 == 33000: return BARELAND
    
    if code_18 == 40000: return WATER
    if code_18 == 50000: return WATER
    
    return -9999
    
# Function to check if the file is a tiff and must be read.
def check_wrong_files(file_name):
    if f == 'clip': return True #avoid entering the "clip" folder
    if 'ipynb' in f: return True #avoid entering the "ipynb_checkpoint" file
    if 'tar' in f: return True #avoid entering "tar" files
    if 'aux' in f: return True #avoid entering "aux" files
    return False


In [9]:
def fetch_from_odc(odc_datasets, samples, x=None, y=None):
    odc_df = None
    for df_name in odc_datasets:
        print(f"Sampling {df_name}")
        #odc datasets to be merged
        odc_product = df_name

        datasets = dc.find_datasets(product=odc_product)
        cf_data = dc.load(datasets=datasets)
        if x is not None and y is not None:
            cf_sel = cf_data.squeeze().sel(
                y=y, 
                x=x, 
                method='nearest'
            )
        else:
            cf_sel = cf_data.squeeze()

        cf_var_name = list(cf_data.data_vars.keys())[0]
        cf_df = cf_sel.to_dataframe()
        del cf_sel
        
        cf_df.rename(columns={cf_var_name:odc_product},inplace=True)
        cf_df.drop(['time','spatial_ref'],axis=1,inplace=True)
        if 'x' in list(cf_df.columns): cf_df.drop(['x'],axis=1,inplace=True)
        if 'y' in list(cf_df.columns): cf_df.drop(['y'],axis=1,inplace=True)

        del cf_data
        
        if odc_df is None:
            odc_df = cf_df.copy()
            print(len(samples), len(odc_df))
            odc_df = pd.concat([samples, odc_df], axis=1)
        else:
            odc_df = pd.concat([odc_df, cf_df[odc_product].astype('float32')], axis=1)

    odc_df = odc_df.dropna()

    print('odc_df Ready!')
    return odc_df


In [10]:
city = "MILANO"
city_epsg = 32632
data_folder = "data"
landcover_path = f"{data_folder}/MILANO_landcover.tif"
encode = True
normalize = True
train_model = False
model = 'ANN'

In [11]:
#Example of datacube config file:
#datacube_config_path = "/home/user/datacube.conf"

datacube_config_path = "path_to_datacube_config_file"
dc = datacube.Datacube(app = "my_app", config = datacube_config_path)
products = dc.list_products()
for p in products.name.values:
    print(p)

accelerazione_suolo
aspect
building_height
corine_urban_atlas_milan
densita_popolazione
dtm_milan
dusaf
dusaf15
dusaf99
fattori_amplificazione
flood_extent
flood_extent_year
geologia
hillshade
ixelles_dem
ixelles_distance_to_roads
ixelles_distance_to_tracks
ixelles_distance_to_water
ixelles_imperviousness
ixelles_landcover
ixelles_population
ixelles_slope
litologia_superficiale
main_road_distance
metropolitana
ndvi_2000
ndvi_2002
ndvi_2014
ndvi_2019
piezometrie_profondo
piezometrie_superficiale
piraeus_building_height
piraeus_dem
piraeus_distance_to_roads
piraeus_distance_to_tracks
piraeus_imperviousness
piraeus_landcover
piraeus_landcover_for_uhi
piraeus_population
piraeus_slope
plan_curvature
profile_curvature
reticolo_idrografico
river_distance
sabbie_falda
slope
sofia_building_height
sofia_dem
sofia_distance_to_road
sofia_distance_to_train_tracks
sofia_distance_to_water
sofia_imperviousness
sofia_landcover
sofia_population
sofia_slope
soggiacenza_falda
spi
strade_ferrovie
temperatu

In [12]:
#The datasets from the ODC from which data is sampled
odc_datasets = [
    'building_height', 'densita_popolazione',
    'main_road_distance', 'river_distance', 
    'water_distance'
]

In [13]:
importer = processing.HarmoniaProcessor()

# get raster parameters
with rasterio.open(landcover_path, driver="GTiff") as base_raster:
    transform = base_raster.transform
    init_x = transform[2]
    init_y = transform[5]
    step_x = transform[0]
    step_y = transform[4]

#import samples
base_path = f'training_samples'
samples_path = f'{base_path}/{city}_samples.csv'
samples = importer.import_df(samples_path, date_format=None)

samples['x'] = samples['x'].apply(
    lambda x: init_x + (x * step_x)
)
samples['y'] = samples['y'].apply(
    lambda y: init_y + (y * step_y)
)
samples


Unnamed: 0,x,y,landcover,uhi,ndvi,lst,raster
0,504755.0,5044760.0,1,0,0.340190,305.97598,20180815
1,527405.0,5032805.0,1,0,0.365114,304.33533,20180815
2,512730.0,5026165.0,1,0,0.485528,305.89053,20180815
3,516430.0,5043370.0,1,0,0.337092,304.33533,20180815
4,524665.0,5040390.0,1,0,0.405153,305.80850,20180815
...,...,...,...,...,...,...,...
1099995,525765.0,5031595.0,2,1,0.373092,306.35880,20200719
1099996,512985.0,5029820.0,2,1,0.324245,311.44140,20200719
1099997,512790.0,5024110.0,2,1,0.286099,308.65570,20200719
1099998,498990.0,5031685.0,2,1,0.471453,305.72305,20200719


In [14]:
single_positions = samples.drop_duplicates(subset=['x', 'y']).reset_index(drop=True)[['x','y']]
single_x_positions = single_positions.x.values
single_y_positions = single_positions.y.values

odc_df = fetch_from_odc(
    odc_datasets, 
    samples,
    x=xr.DataArray(single_x_positions, dims=['index']),
    y=xr.DataArray(single_y_positions, dims=['index'])
)

# set UHI column as integer
odc_df['uhi'] = odc_df['uhi'].apply(
    lambda x: int(x),
).astype('int8')

#coordinates as float32 to reduce size in disk
odc_df['x'] = odc_df['x'].astype('float64')
odc_df['y'] = odc_df['y'].astype('float64')
odc_df['lst'] = odc_df['lst'].astype('float32')
odc_df['ndvi'] = odc_df['ndvi'].astype('float32')
odc_df['landcover'] = odc_df['landcover'].astype('int32')
odc_df['uhi'] = odc_df['uhi'].astype('uint8')

print("ready")
odc_df
    

Sampling building_height
1100000 1083227
Sampling densita_popolazione
Sampling main_road_distance
Sampling river_distance
Sampling water_distance
odc_df Ready!
ready


Unnamed: 0,x,y,landcover,uhi,ndvi,lst,raster,building_height,densita_popolazione,main_road_distance,river_distance,water_distance
0,504755.0,5044760.0,1,0,0.340190,305.975983,20180815,0.0,0.000,749.016052,94.339813,94.339813
1,527405.0,5032805.0,1,0,0.365114,304.335327,20180815,0.0,0.000,10.000000,138.293167,138.293167
2,512730.0,5026165.0,1,0,0.485528,305.890533,20180815,0.0,0.000,93.941467,120.000000,120.000000
3,516430.0,5043370.0,1,0,0.337092,304.335327,20180815,0.0,0.000,526.165405,150.000000,150.000000
4,524665.0,5040390.0,1,0,0.405153,305.808502,20180815,0.0,0.000,320.663391,55.226803,55.226803
...,...,...,...,...,...,...,...,...,...,...,...,...
1083222,508730.0,5022140.0,2,1,0.264716,309.838348,20200719,0.0,0.000,336.823395,187.416656,187.416656
1083223,526985.0,5024355.0,2,1,0.107714,322.122711,20200719,0.0,0.000,511.590637,46.097721,46.097721
1083224,502505.0,5044070.0,2,1,0.188039,313.461456,20200719,0.0,0.000,919.959229,191.637695,191.637695
1083225,489140.0,5039245.0,2,1,0.444726,305.846100,20200719,0.0,0.000,99.624290,14.142136,14.142136


In [15]:
#Encode Columns
train_df = odc_df.copy()
if encode:
    #encode categorical columns
    encoding_columns = ['landcover']
    encoders = {
        "landcover": OneHotEncoder(sparse=False, dtype='uint16', handle_unknown='ignore')
    }

    for enc in encoding_columns:
        enc_list = train_df[enc].values.reshape(-1, 1)
        encoded_data = encoders[enc].fit_transform(enc_list)

        encoded_columns = [f"{enc}_{category}" for category in encoders[enc].get_feature_names_out([enc])]
        train_df = pd.concat(
            [train_df, pd.DataFrame(encoded_data, columns=encoded_columns, dtype='int32')], 
            axis=1,
            join='inner'
        )
        train_df = train_df.drop(columns=[enc])

    train_df = train_df.dropna()

    #drop resulting _nan columns
    _nan_columns = list(filter(lambda x: '_nan' in x, list(train_df.columns)))
    train_df = train_df.drop(columns=_nan_columns)

    
#drop raster, x, and y columns
train_df_complete = train_df.drop(columns=['raster','x','y'])

for col in odc_datasets:
    train_df_complete = train_df_complete.loc[
        train_df_complete[col] != -9999
    ]

train_df_complete


Unnamed: 0,uhi,ndvi,lst,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_5
0,0,0.340190,305.975983,0.0,0.0,749.016052,94.339813,94.339813,1,0,0,0
1,0,0.365114,304.335327,0.0,0.0,10.000000,138.293167,138.293167,1,0,0,0
2,0,0.485528,305.890533,0.0,0.0,93.941467,120.000000,120.000000,1,0,0,0
3,0,0.337092,304.335327,0.0,0.0,526.165405,150.000000,150.000000,1,0,0,0
4,0,0.405153,305.808502,0.0,0.0,320.663391,55.226803,55.226803,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1079535,1,0.502171,306.970642,0.0,0.0,326.840942,41.231056,41.231056,0,1,0,0
1079536,1,0.377483,312.135254,0.0,0.0,150.000000,1937.588623,1937.588623,0,1,0,0
1079537,1,0.215878,311.796875,0.0,0.0,1017.951355,36.400551,36.400551,0,1,0,0
1079538,1,0.183535,316.510315,0.0,0.0,79.056938,26.925823,26.925823,0,1,0,0


In [16]:
#Prepare data for model and normalize
train_df = train_df_complete.copy()

X_train, X_test, y_train, y_test = train_test_split(train_df.drop('uhi', axis=1), train_df['uhi'], test_size=0.7, random_state=42, stratify=train_df['uhi'])

columns_list = list(X_train.columns)
X_train_df = pd.DataFrame(X_train, columns=columns_list)
X_test_df = pd.DataFrame(X_test, columns=columns_list)

scaler = None
if normalize:    
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
ai_model = None

In [17]:
%%time
if train_model:
    if model == 'RF':
        #Random forest Model
        ai_model = RandomForestClassifier(
            n_estimators = 1000, 
            max_depth=7,
            random_state = 42,
            n_jobs=-1
        )

        ai_model.fit(X_train, y_train)

        score = ai_model.score(X_test, y_test)
        print(f'Score for RF: {score}')

    elif model == 'ANN':

        hidden_layer_sizes = (30,20,2)
        ai_model = MLPClassifier(
            solver='adam', 
            activation='relu',
            alpha=1e-6, 
            hidden_layer_sizes=hidden_layer_sizes,         
            max_iter=10000,
            batch_size=200,
            learning_rate='constant',

            random_state=42,
            verbose=True
        )

        ai_model.fit(X_train, y_train)

        score = ai_model.score(X_test, y_test)
        print(f'Score for ANN: {score}')

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


In [18]:
if train_model:
    if model == 'RF':
        columns_list = list(X_train_df.columns)
        importances = pd.DataFrame(ai_model.feature_importances_)
        importances['label'] = pd.Series(X_train_df.columns.values)
        importances = importances.sort_values(by=0, ascending=False).reset_index(drop=True)
        importer.show_plot(importances, 'label', 0)
    elif model == 'ANN':
        columns_list = list(X_train_df.columns)
        importances = pd.DataFrame(ai_model.coef_)
        importances['label'] = pd.Series(X_train_df.columns.values)
        importances = importances.sort_values(by=0, ascending=False).reset_index(drop=True)
        importer.show_plot(importances, 'label', 0)

In [19]:
if train_model:
    #save model
    model_score = str(score)[2:4]
    model_file = f'model/model_{model}_{model_score}.pkl'
    print(model_file)
    with open(model_file, 'wb') as file:
        pickle.dump(ai_model, file)

# PREDICTION

In [20]:
if not train_model:
    model_score = '86'
    # Load the trained model using pickle
    model_file = f'model/model_{model}_86.pkl'
    with open(model_file, 'rb') as file:
        ai_model = pickle.load(file)

In [21]:
# predict the whole image
importer = processing.HarmoniaProcessor()

# get raster parameters from landcover
with rasterio.open(landcover_path, driver="GTiff") as base_raster:
    transform = base_raster.transform
    init_x = transform[2]
    init_y = transform[5]
    step_x = transform[0]
    step_y = transform[4]

#import samples
base_path = f'training_samples'
predict_path = f'{base_path}/{city}_predict.csv'
predict = importer.import_df(predict_path, date_format=None)

predict['lst'] = predict['lst'].astype('float32')
predict['ndvi'] = predict['ndvi'].astype('float32')
predict['landcover'] = predict['landcover'].astype('int32')
predict['x'] = predict['x'].astype('float64')
predict['y'] = predict['y'].astype('float64')

predict['landcover'] = predict['landcover'].apply(map_urban_atlas_class).astype('int32')

predict['x'] = predict['x'].apply(
    lambda x: init_x + (x * step_x)
)
predict['y'] = predict['y'].apply(
    lambda y: init_y + (y * step_y)
)

predict_x_positions = predict.x.values
predict_y_positions = predict.y.values

full_data_df = fetch_from_odc(
    odc_datasets, 
    predict,
    x=xr.DataArray(predict_x_positions, dims=['index']),
    y=xr.DataArray(predict_y_positions, dims=['index'])
)

#remove water pixels
full_data_df = full_data_df.loc[
    full_data_df['landcover'] != WATER
].reset_index(drop=True)
print("ready")

#remove nodata from odc datasets
for col in odc_datasets:
    full_data_df = full_data_df.loc[
        full_data_df[col] != -9999
    ].reset_index(drop=True)

full_data_df

Sampling building_height
62989129 62989129
Sampling densita_popolazione
Sampling main_road_distance
Sampling river_distance
Sampling water_distance
odc_df Ready!
ready


Unnamed: 0,x,y,lst,ndvi,landcover,building_height,densita_popolazione,main_road_distance,river_distance,water_distance
0,495340.0,5054280.0,309.75,0.209839,3,0.0,0.0,3550.003662,916.856079,916.856079
1,495345.0,5054280.0,310.25,0.296875,3,0.0,0.0,3551.936035,920.570435,920.570435
2,495350.0,5054280.0,310.25,0.296875,3,0.0,0.0,3553.874512,924.296997,924.296997
3,495320.0,5054275.0,309.25,0.309814,3,0.0,0.0,3537.711182,898.721313,898.721313
4,495325.0,5054275.0,309.25,0.309814,3,0.0,0.0,3539.622070,902.399597,902.399597
...,...,...,...,...,...,...,...,...,...,...
61868991,537980.0,5001005.0,314.50,0.306152,1,0.0,0.0,5.000000,2205.362793,2205.362793
61868992,537985.0,5001005.0,314.50,0.306152,1,0.0,0.0,5.000000,2201.419922,2201.419922
61868993,537990.0,5001005.0,314.50,0.306152,1,0.0,0.0,0.000000,2197.481445,2197.481445
61868994,537995.0,5001005.0,314.50,0.306152,1,0.0,0.0,0.000000,2193.547363,2193.547363


In [22]:
if encode:
    #encode categorical columns
    encoding_columns = ['landcover']
    all_encoded_columns = []
    for enc in encoding_columns:
        enc_list = full_data_df[enc].values.reshape(-1, 1)
        encoded_data = encoders[enc].transform(enc_list)

        encoded_columns = [f"{enc}_{category}" for category in encoders[enc].get_feature_names_out([enc])]
        print(encoded_columns)
        full_data_df = pd.concat([full_data_df, pd.DataFrame(encoded_data, columns=encoded_columns)], axis=1)
        all_encoded_columns += encoded_columns.copy()
        full_data_df = full_data_df.drop(columns=[enc])

    full_data_df = full_data_df.dropna()
    for enc_col in all_encoded_columns:
        full_data_df[enc_col] = full_data_df[enc_col].astype('uint16')

    #drop resulting _nan columns
    _nan_columns = list(filter(lambda x: '_nan' in x, list(full_data_df.columns)))
    full_data_df = full_data_df.drop(columns=_nan_columns)

full_data_df

['landcover_landcover_1', 'landcover_landcover_2', 'landcover_landcover_3', 'landcover_landcover_5']


Unnamed: 0,x,y,lst,ndvi,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_5
0,495340.0,5054280.0,309.75,0.209839,0.0,0.0,3550.003662,916.856079,916.856079,0,0,1,0
1,495345.0,5054280.0,310.25,0.296875,0.0,0.0,3551.936035,920.570435,920.570435,0,0,1,0
2,495350.0,5054280.0,310.25,0.296875,0.0,0.0,3553.874512,924.296997,924.296997,0,0,1,0
3,495320.0,5054275.0,309.25,0.309814,0.0,0.0,3537.711182,898.721313,898.721313,0,0,1,0
4,495325.0,5054275.0,309.25,0.309814,0.0,0.0,3539.622070,902.399597,902.399597,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61868991,537980.0,5001005.0,314.50,0.306152,0.0,0.0,5.000000,2205.362793,2205.362793,1,0,0,0
61868992,537985.0,5001005.0,314.50,0.306152,0.0,0.0,5.000000,2201.419922,2201.419922,1,0,0,0
61868993,537990.0,5001005.0,314.50,0.306152,0.0,0.0,0.000000,2197.481445,2197.481445,1,0,0,0
61868994,537995.0,5001005.0,314.50,0.306152,0.0,0.0,0.000000,2193.547363,2193.547363,1,0,0,0


In [23]:
#Convert all odc dataset columns to float32 to save disk
for col in odc_datasets:
    full_data_df[col] = full_data_df[col].astype('float32')

full_data_df

Unnamed: 0,x,y,lst,ndvi,building_height,densita_popolazione,main_road_distance,river_distance,water_distance,landcover_landcover_1,landcover_landcover_2,landcover_landcover_3,landcover_landcover_5
0,495340.0,5054280.0,309.75,0.209839,0.0,0.0,3550.003662,916.856079,916.856079,0,0,1,0
1,495345.0,5054280.0,310.25,0.296875,0.0,0.0,3551.936035,920.570435,920.570435,0,0,1,0
2,495350.0,5054280.0,310.25,0.296875,0.0,0.0,3553.874512,924.296997,924.296997,0,0,1,0
3,495320.0,5054275.0,309.25,0.309814,0.0,0.0,3537.711182,898.721313,898.721313,0,0,1,0
4,495325.0,5054275.0,309.25,0.309814,0.0,0.0,3539.622070,902.399597,902.399597,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61868991,537980.0,5001005.0,314.50,0.306152,0.0,0.0,5.000000,2205.362793,2205.362793,1,0,0,0
61868992,537985.0,5001005.0,314.50,0.306152,0.0,0.0,5.000000,2201.419922,2201.419922,1,0,0,0
61868993,537990.0,5001005.0,314.50,0.306152,0.0,0.0,0.000000,2197.481445,2197.481445,1,0,0,0
61868994,537995.0,5001005.0,314.50,0.306152,0.0,0.0,0.000000,2193.547363,2193.547363,1,0,0,0


In [24]:
predict_df = full_data_df.copy()

data_coord = pd.concat([predict_df[col] for col in ['y', 'x']], axis=1)
predict_df = predict_df.drop(columns=['x','y'])

#Fix order column for model
column_order = ["ndvi","lst"]
column_order += odc_datasets
column_order += ["landcover_landcover_1","landcover_landcover_2","landcover_landcover_3","landcover_landcover_5"]

predict_df = predict_df[column_order]

# ignore the warnings for feature names. The important thing is that the dataset has the same order of the training one
# to remove the warnings create a dataframe with the normalized dataset and the column list
batch_size = 1000000

if normalize:
    full_data_predict = scaler.transform(predict_df)
else:
    full_data_predict = predict_df

# predict probabilities in batches
probs = []
for i in range(0, len(full_data_predict), batch_size):
    batch = full_data_predict[i:i+batch_size]
    batch_probs = ai_model.predict_proba(batch)[:,1]
    probs.append(batch_probs)
    print(f'Done {i}')

full_data_prob = np.concatenate(probs, axis=0)
full_data_prob


Done 0
Done 1000000
Done 2000000
Done 3000000
Done 4000000
Done 5000000
Done 6000000
Done 7000000
Done 8000000
Done 9000000
Done 10000000
Done 11000000
Done 12000000
Done 13000000
Done 14000000
Done 15000000
Done 16000000
Done 17000000
Done 18000000
Done 19000000
Done 20000000
Done 21000000
Done 22000000
Done 23000000
Done 24000000
Done 25000000
Done 26000000
Done 27000000
Done 28000000
Done 29000000
Done 30000000
Done 31000000
Done 32000000
Done 33000000
Done 34000000
Done 35000000
Done 36000000
Done 37000000
Done 38000000
Done 39000000
Done 40000000
Done 41000000
Done 42000000
Done 43000000
Done 44000000
Done 45000000
Done 46000000
Done 47000000
Done 48000000
Done 49000000
Done 50000000
Done 51000000
Done 52000000
Done 53000000
Done 54000000
Done 55000000
Done 56000000
Done 57000000
Done 58000000
Done 59000000
Done 60000000
Done 61000000


array([0.53443959, 0.2184923 , 0.21880085, ..., 0.88678893, 0.88665401,
       0.86201955])

In [25]:
#append the x,y coordinates to the probabilities
full_data_df = pd.DataFrame(full_data_prob).reset_index(drop=True)
full_data_df['x'] = data_coord['x'].reset_index(drop=True)
full_data_df['y'] = data_coord['y'].reset_index(drop=True)

In [26]:
full_data_df.rename(columns={0:'probability'}, inplace=True)
full_data_df = full_data_df.round({'probability': 4})
full_data_df['probability'] = full_data_df['probability'].astype('float32')


In [27]:
base_path = 'predictions'
predictions_path = f'{base_path}/{city}_UHI_{model}_predictions_acc_{model_score}.csv'
print(f'Saving to {predictions_path}')
full_data_df.to_csv(predictions_path)


Saving to predictions/MILANO_UHI_ANN_predictions_acc_86.csv
