In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import rasterio as rio
import rioxarray as rxr
import datacube
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import gc

In [2]:
# Load the training samples and keep only the predictors used by the model:
# coordinates, the raw categorical layers (their one-hot geo_*/lc_* columns stay)
# and the per-year NDVI columns (ndvi_year stays) are dropped.
flood_df = pd.read_csv('Training sets/flood_training_cf_year_no_river_bed.csv').drop(
    columns=[
        'y', 'x',
        'geologia',
        'dusaf99', 'dusaf15', 'dusaf', 'dusaf_year',
        'ndvi_2000', 'ndvi_2002', 'ndvi_2014', 'ndvi_2019',
    ]
)
# Cap the label at 1 so the target is binary (any value above 1 becomes 1).
flood_df['flooded'] = flood_df['flooded'].clip(upper=1)
flood_df

Unnamed: 0,twi,tri,spi,slope,water_distance,profile_curvature,plan_curvature,hillshade,aspect,dtm_milan,...,lc_14,lc_21,lc_22,lc_23,lc_31,lc_32,lc_33,lc_41,lc_51,flooded
0,10.121225,0.271741,1.796980,0.921623,74.330345,0.274237,-0.129694,181.0,30.754047,71.10500,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,6.480947,0.596306,-10.170015,2.193870,22.360680,0.290859,-0.842747,187.0,317.719330,107.76089,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,10.975347,0.275029,0.666227,0.343284,28.284271,0.298350,-0.093618,180.0,144.860950,141.87100,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,5.851149,1.874699,-9.540217,4.113334,20.000000,-4.963612,2.188366,191.0,286.229280,140.51300,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,6.548541,0.468189,-10.237609,2.050606,10.000000,-0.424163,0.419770,175.0,167.089020,128.95200,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,4.716925,2.704538,-8.405999,12.602149,395.379580,-0.547879,0.292087,186.0,30.427158,143.40000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,12.361214,0.206014,3.574872,0.711363,143.178220,0.424424,-0.139602,179.0,160.105320,116.32400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,9.499601,0.533417,1.820767,1.286905,20.615528,1.159952,-1.036093,177.0,167.138980,137.82500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,8.602279,0.113127,-1.471528,0.526209,323.109900,-0.048511,0.011975,179.0,136.804200,157.60910,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# List the columns left after the drop above: the predictors plus the `flooded` target.
flood_df.columns

Index(['twi', 'tri', 'spi', 'slope', 'water_distance', 'profile_curvature',
       'plan_curvature', 'hillshade', 'aspect', 'dtm_milan', 'ndvi_year',
       'geo_0', 'geo_1', 'geo_2', 'geo_3', 'geo_4', 'geo_5', 'geo_6', 'lc_11',
       'lc_12', 'lc_14', 'lc_21', 'lc_22', 'lc_23', 'lc_31', 'lc_32', 'lc_33',
       'lc_41', 'lc_51', 'flooded'],
      dtype='object')

In [3]:
%%time
X_train, X_test, y_train, y_test = train_test_split(flood_df.drop('flooded', axis=1), flood_df['flooded'], test_size=0.3, random_state=42,stratify=flood_df['flooded'])
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)
columns_list = list(X_train.columns)
X_train_normalized = pd.DataFrame(X_train_normalized,columns=columns_list)
X_test_normalized = pd.DataFrame(X_test_normalized,columns=columns_list)
rf_model = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf_model.fit(X_train_normalized, y_train)
score = rf_model.score(X_test_normalized, y_test)
score

CPU times: user 11.5 s, sys: 7.71 ms, total: 11.5 s
Wall time: 11.5 s


0.9773333333333334

In [4]:
# Rank the predictors by the fitted forest's feature importance, least important first
# (sort_values(0) sorts ascending on the single unnamed column).
columns_list = list(X_train.columns)
pd.DataFrame(rf_model.feature_importances_,index=columns_list).sort_values(0)


Unnamed: 0,0
geo_6,3e-06
geo_3,1.9e-05
lc_41,5.5e-05
geo_5,0.000131
lc_51,0.000178
lc_32,0.000333
lc_33,0.000898
geo_2,0.001264
lc_23,0.001543
geo_4,0.002557


In [5]:
# ROC AUC on the held-out split, computed from the probability in the second
# predict_proba column (presumably the flooded == 1 class, since the labels are 0/1).
test_pos_prob = rf_model.predict_proba(X_test_normalized)[:,1]
auc_roc = roc_auc_score(y_test, test_pos_prob)
auc_roc

0.9973612777777778

This part creates the susceptibility maps from the fitted model

In [6]:
# Connect to the datacube instance from which the raster layers are loaded below.
# Replace the placeholder with the path to your datacube configuration file,
# for example:
#datacube_config_path = "/home/user/datacube.conf"
datacube_config_path = "path_to_datacube_config_file"
dc = datacube.Datacube(app = "my_app", config = datacube_config_path)

In [7]:
# First dataset to have a base to merge: the 'dtm_milan' product is flattened into a
# table so that the other layers can be merged onto it by (y, x) in the next cell.
datasets = dc.find_datasets(product='dtm_milan')
cf_data = dc.load(datasets=datasets)
full_data_df = cf_data.squeeze().to_dataframe()
# Release the raster as soon as it has been tabulated to limit peak memory.
del cf_data
# Turn the index levels into ordinary columns (presumably y and x, the later merge keys).
full_data_df.reset_index(inplace=True)
full_data_df.drop(['time','spatial_ref'],axis=1,inplace=True)
# Use the same column name as the training set.
full_data_df.rename(columns={'elevation':'dtm_milan'},inplace=True)
# Discard cells with a missing value.
full_data_df.dropna(how='any',inplace=True)
#full_data_df

In [8]:
#Dask could be used to parallelize the computations and reduce memory usage

# Load each remaining layer, tabulate it like the base table, and merge it onto
# full_data_df one layer at a time.
cf_list = ['aspect','dusaf','geologia','hillshade','ndvi_2019','plan_curvature','profile_curvature',
           'water_distance','slope','spi','tri','twi']
for cf in cf_list:
    datasets = dc.find_datasets(product=cf)
    cf_data = dc.load(datasets=datasets)
    # Only the first data variable of the product is used; it is renamed to the product name below.
    cf_var_name = list(cf_data.data_vars.keys())[0]
    cf_df = cf_data.squeeze().to_dataframe()
    # Release the raster before the table is processed to limit peak memory.
    del cf_data
    cf_df.reset_index(inplace=True)
    cf_df.rename(columns={cf_var_name:cf},inplace=True)
    cf_df.drop(['time','spatial_ref'],axis=1,inplace=True)
    cf_df.dropna(how='any',inplace=True)
    # merge defaults to an inner join: only (y, x) cells present in both tables survive,
    # and the result gets a fresh 0..n-1 index.
    full_data_df = cf_df.merge(full_data_df,on=['y','x'])
    print(cf + ' done')
#full_data_df

aspect done
dusaf done
geologia done
hillshade done
ndvi_2019 done
plan_curvature done
profile_curvature done
water_distance done
slope done
spi done
tri done
twi done


Encoding of geology and land cover. The encoding must be the same as the one used for the training set.

In [9]:
# Collect the distinct raw category codes present in the geology and land-cover
# rasters. They are regrouped and used to fit the one-hot encoders in the next cells.
#
# Missing cells are removed with an explicit NaN mask rather than by slicing off the
# last element of np.unique's result: the slice silently drops a real category when
# the raster holds no NaN, and leaves NaNs behind on NumPy versions that do not
# collapse repeated NaNs into a single entry.
geo_datasets = dc.find_datasets(product='geologia')
geo = dc.load(datasets=geo_datasets)
geo_data = geo.codice.squeeze().values
geo_data = geo_data.flatten()
geo_cat = np.unique(geo_data)
geo_cat = geo_cat[~np.isnan(geo_cat)].reshape(-1,1)
del geo_data

lc_datasets = dc.find_datasets(product='dusaf')
lc = dc.load(datasets=lc_datasets)
lc_data = lc.codice.squeeze().values
lc_data = lc_data.flatten()
lc_cat = np.unique(lc_data)
lc_cat = lc_cat[~np.isnan(lc_cat)].reshape(-1,1)
del lc_data


In [10]:
"""
0 - ghiaie, sabbie e limi
1 - ghiaie, sabbie
2 - ghiaie, sabbie e argille ferrettizzate
3 - argille, calcari, conglomerati
4 - ghiaie, limi e argille fortemente ferrettizzati
5 - conglomerati, sabbie, argille
6 - marne
"""

# Raw geology code -> regrouped class. The trailing comments give an English gloss
# of the Italian legend in the string above.
geo_cat_dict = {
    101: 0, 201: 0, 301: 0,  # gravels, sands and silts
    205: 1,                  # gravels, sands
    206: 2,                  # gravels, sands and ferrettized clays
    76: 3,                   # clays, limestones, conglomerates
    207: 4,                  # gravels, silts and strongly ferrettized clays
    8: 5,                    # conglomerates, sands, clays
    81: 6,                   # marls
}

# Regroup the raw codes found in the raster and keep the distinct classes as an
# (n, 1) column, the layout the one-hot encoder is fitted on.
mapped_geo_cat = np.vectorize(geo_cat_dict.get)(geo_cat.ravel())
mapped_geo_cat = np.unique(mapped_geo_cat).reshape(-1, 1)
mapped_geo_cat

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6]])

In [11]:
def map_land_cat(value):
    """Reduce a land-cover code to the class given by its first two characters.

    The value is truncated to an integer, written in decimal, and its first two
    characters are read back as an integer. Class 13 is folded into class 12.

    Parameters
    ----------
    value : int or float
        Raw land-cover code. It must be convertible with ``int()``, so NaN
        raises ``ValueError``.

    Returns
    -------
    int
        The regrouped class.
    """
    leading_class = int(str(int(value))[:2])
    return 12 if leading_class == 13 else leading_class

In [12]:
# Regroup every raw land-cover code with map_land_cat and keep the distinct classes
# as an (n, 1) column for fitting the one-hot encoder.
mapped_lc_cat = np.vectorize(map_land_cat)(lc_cat.flatten())
mapped_lc_cat = np.unique(mapped_lc_cat).reshape(-1,1)
mapped_lc_cat

array([[11],
       [12],
       [14],
       [21],
       [22],
       [23],
       [31],
       [32],
       [33],
       [41],
       [51]])

In [13]:
# Fit one one-hot encoder per categorical layer on the regrouped class lists.
# NOTE(review): the encoded columns must match the geo_* / lc_* columns of the
# training set — confirm the class lists above are the ones used to build it.
geo_enc = OneHotEncoder()
geo_enc.fit(mapped_geo_cat)
lc_enc = OneHotEncoder()
lc_enc.fit(mapped_lc_cat)

In [14]:
# Replace the raw geology and land-cover codes with their regrouped classes, element by element.
# NOTE: both columns are overwritten in place, so this cell must run exactly once —
# a second run would look up values that have already been regrouped.
full_data_df['geologia'] = np.vectorize(geo_cat_dict.get)(full_data_df['geologia'])
full_data_df['dusaf'] = np.vectorize(map_land_cat)(full_data_df['dusaf'])

In [15]:
# One-hot encode the regrouped geology class. The encoder output is densified with
# toarray() and labelled with the encoder's feature names using the 'geo' prefix.
encoded_geo = geo_enc.transform(full_data_df['geologia'].to_numpy().reshape(-1,1))
encoded_geo_df = pd.DataFrame(encoded_geo.toarray(),columns=geo_enc.get_feature_names_out(['geo']))
# encoded_geo_df

In [16]:
# One-hot encode the regrouped land-cover class. The encoder output is densified with
# toarray() and labelled with the encoder's feature names using the 'lc' prefix.
encoded_lc = lc_enc.transform(full_data_df['dusaf'].to_numpy().reshape(-1,1))
encoded_lc_df = pd.DataFrame(encoded_lc.toarray(),columns=lc_enc.get_feature_names_out(['lc']))
# encoded_lc_df

In [17]:
# Attach the one-hot columns by row index.
# NOTE(review): this relies on full_data_df carrying the same 0..n-1 index as the
# freshly built encoded frames (the index it gets from the last merge) — confirm
# no row filtering happens in between.
full_data_df = full_data_df.merge(encoded_geo_df,left_index=True,right_index=True)
full_data_df = full_data_df.merge(encoded_lc_df,left_index=True,right_index=True)
# The encoded frames are no longer needed once merged.
del encoded_lc_df
del encoded_geo_df
# full_data_df

In [18]:
# The categorical columns are now represented by their one-hot columns, so drop them.
full_data_df.drop(['geologia','dusaf'],axis=1,inplace=True);

In [19]:
# Move the coordinates out of the feature table (pop removes them from full_data_df)
# and keep them aside; they are re-attached to the predicted probabilities later.
data_coord = pd.concat([full_data_df.pop(x) for x in ['y', 'x']], axis=1)

In [20]:
# Give the NDVI layer its training-set name, then select the columns in exactly the
# training order (columns_list) so the scaler and the model see the same layout.
full_data_df.rename(columns={'ndvi_2019':'ndvi_year'},inplace=True)
full_data_df = full_data_df[columns_list]

In [21]:
# Apply the scaler that was fitted on the training split; it is not refitted here.
full_data_normalized = scaler.transform(full_data_df)

In [22]:
# The unscaled table is no longer needed; drop it and collect garbage before predicting.
del full_data_df
gc.collect()

1327

In [23]:
# Predict the probability for the whole study area in batches to bound peak memory.
# Each batch is wrapped in a DataFrame carrying the training column names, so
# scikit-learn no longer warns about missing feature names on every call.
# copy=False keeps the wrapper a view on the slice instead of duplicating it.
batch_size = 10_000_000

probs = []
for i in range(0, len(full_data_normalized), batch_size):
    batch = pd.DataFrame(full_data_normalized[i:i+batch_size], columns=columns_list, copy=False)
    # Keep only the second predict_proba column, as in the AUC evaluation.
    batch_probs = rf_model.predict_proba(batch)[:,1]
    probs.append(batch_probs)
    print('Done')

# Stitch the per-batch results back together in row order.
full_data_prob = np.concatenate(probs, axis=0)



Done




Done




Done




Done




Done




Done




Done


In [24]:
# Wrap the probabilities in a single-column DataFrame; the column is labelled 0
# and renamed in a later cell.
prob_df = pd.DataFrame(full_data_prob)
prob_df

Unnamed: 0,0
0,0.047
1,0.044
2,0.066
3,0.044
4,0.027
...,...
62916645,0.024
62916646,0.021
62916647,0.021
62916648,0.016


In [25]:
# Re-attach the coordinates by row index.
# NOTE(review): this assumes prob_df rows are in the same order as data_coord and
# share its index — confirm nothing reorders the rows between the two.
prob_df_coord = prob_df.merge(data_coord,left_index=True,right_index=True)
prob_df_coord

Unnamed: 0,0,y,x
0,0.047,5054282.5,495342.5
1,0.044,5054282.5,495347.5
2,0.066,5054282.5,495352.5
3,0.044,5054282.5,495357.5
4,0.027,5054282.5,495362.5
...,...,...,...
62916645,0.024,5000997.5,537917.5
62916646,0.021,5000997.5,537922.5
62916647,0.021,5000997.5,537927.5
62916648,0.016,5000997.5,537932.5


In [26]:
# Give the unnamed probability column (label 0) a descriptive name.
prob_df_coord.rename(columns={0:'probability'},inplace=True)

In [27]:
# Save the probabilities with their coordinates as CSV, without the row index.
# NOTE: to_csv does not create missing directories, so 'predictions/' must already exist.
prob_df_coord.to_csv('predictions/rf_flood_prob_cf_year_no_river_bed.csv', index=False)

In [30]:
# Convert the table to an xarray Dataset.
# NOTE(review): prob_df_coord still has its default integer index here, so the
# Dataset is 1-D along `index` with probability, y and x as data variables. If a
# 2-D (y, x) grid is intended, the frame needs set_index(['y', 'x']) first — confirm.
prob_xr = prob_df_coord.to_xarray()
prob_xr

In [58]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Render a size as a human-readable string with binary (1024-based) prefixes.

    Adapted from Fred Cirera, https://stackoverflow.com/a/1094933/1870254.

    Parameters
    ----------
    num : int or float
        Size to format, e.g. a byte count.
    suffix : str
        Unit symbol appended after the prefix.

    Returns
    -------
    str
        ``num`` scaled to the first prefix under which its magnitude is below
        1024, with one decimal (e.g. ``"13.5 GiB"``); ``Yi`` is used when no
        smaller prefix fits.
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    for prefix in binary_prefixes:
        fits_this_prefix = abs(scaled) < 1024.0
        if fits_this_prefix:
            return "%3.1f %s%s" % (scaled, prefix, suffix)
        scaled /= 1024.0
    # Still 1024 or more after eight divisions: report it in the largest unit.
    return "%.1f %s%s" % (scaled, 'Yi', suffix)

# Debugging aid: print the ten largest objects in the notebook namespace, using the
# size reported by sys.getsizeof. The namespace is snapshotted with list() before
# iterating, presumably because the loop variables are themselves added to it.
# NOTE(review): the recorded output lists full_data_df, which an earlier cell
# deletes, so this cell was run out of order; consider removing it from the
# final notebook.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                  full_data_df: 13.5 GiB
                    data_coord:  3.4 GiB
                         cf_df:  1.6 GiB
                      flood_df:  4.6 MiB
                       X_train:  3.2 MiB
            X_train_normalized:  3.1 MiB
                        X_test:  1.4 MiB
             X_test_normalized:  1.3 MiB
                       y_train: 218.8 KiB
                        y_test: 93.8 KiB


In [52]:
# Debugging aid: size in bytes reported for the full feature table.
# NOTE(review): full_data_df is deleted right after scaling, so this cell raises
# NameError on a top-to-bottom run; it was executed out of order.
sys.getsizeof(full_data_df)

14495924320

In [51]:
# Flush IPython's output cache and input history without prompting (-f); cached
# outputs can otherwise keep large objects alive.
%reset -f out
%reset -f in

Flushing output cache (3 entries)
Flushing input history


In [22]:
# Force a garbage-collection pass; the displayed value is what gc.collect() returns.
# gc is already imported in the first cell, so this import is redundant.
import gc
gc.collect()

0