## Visualise the data and the summary statistics.

In [1]:
from os.path import join
from itertools import compress
import numpy as np
import pandas as pd
import tensorflow as tf
import geopandas as gpd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns

# helper functions
from importlib import reload
import model_utils
import viz_utils

# --- run configuration ---
viz = True          # plotting toggle; later cells re-assign it before use
storm = ""          # e.g., "roanu" or "" for all events
temporal = False    # forwarded to model_utils.get_data
binary = False      # forwarded to model_utils.get_data; also used in output filenames
features = [
    'elevation',
    'jrc_permwa',
    'precip',
    'ndvi',
    'soilcarbon',
    'dist_pw',
    'slope_pw',
    'aqueduct',
    'aqueduct_25',
    'aqueduct_50',
    'aqueduct_100',
]

# --- paths (relative to the notebook's working directory) ---
wd = join("..", "data")
imdir = join("..", "data", 'images')

In [2]:
# load the GeoDataFrame; get_data also hands back the feature list,
# which replaces the one defined in the settings cell
gdf, features = model_utils.get_data(wd, features, temporal, binary, storm)

# binarise aqueduct also: any positive value becomes 1, everything else 0.
# The result is written back to the existing 'aqueduct' column (the previous
# misspelt key created a new, unused column and left 'aqueduct' untouched).
if binary:
    gdf['aqueduct'] = (gdf['aqueduct'] > 0).astype(int)

nfeatures = len(features)

print("Number of unique images (i.e., subregions):", gdf['event'].nunique())

Number of storms: 9
Number of regions: 19
Number of unique images (i.e., subregions): 30


In [3]:
# examine gdf: class balance and missing data
# NOTE(review): all three printouts only run in binary mode — confirm the
# null check is not meant to run unconditionally.
n = len(gdf)

if binary:
    # presumably floodfrac is 0/1 in binary mode, so its sum is the count of ones
    n1 = gdf['floodfrac'].sum()
    n0 = n - n1
    print("Zeros:", n0)
    print("Ones:", n1)
    # count individual null cells; summing `.any()` would only count the
    # number of columns that contain at least one null
    print(f"Null values: {gdf.isnull().sum().sum()}")

In [4]:
gdf.describe()

Unnamed: 0,elevation,jrc_permwa,precip,ndvi,soilcarbon,dist_pw,slope_pw,aqueduct,aqueduct_25,aqueduct_50,aqueduct_100,floodfrac,wind_avg
count,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0,122880.0
mean,-4.445891,53.488151,23.386936,2823.867862,3.052012,1293.837147,0.016267,0.128115,0.097748,0.117517,0.13164,0.049493,7.660423
std,138.083744,45.565557,13.63839,2966.811578,4.034765,2719.794429,0.47983,0.321105,0.282233,0.307752,0.324864,0.166794,2.854621
min,-1083.769231,0.0,0.686867,-1740.313043,0.0,0.0,-120.260621,0.0,0.0,0.0,0.0,0.0,2.834749
25%,-11.0,0.022727,9.066144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.808935
50%,0.531331,72.155027,26.548207,1756.119257,2.116592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.319067
75%,8.16549,98.987755,32.879065,5826.124531,4.824786,1285.433999,0.005837,0.0,0.0,0.0,0.0,0.0,10.510922
max,1496.661987,100.0,68.086121,9047.517551,46.908108,27253.237375,53.523497,1.0,1.0,1.0,1.0,1.0,11.435117


## Floodfrac-aqueduct correlations

In [5]:
# Single-row pairplot: floodfrac on the y-axis against each feature on the x-axis.
# Switched off here; set viz = True to regenerate the figure.
viz = False

if viz:
    sns.set(style="ticks")
    pplot = sns.pairplot(
        data=gdf,
        x_vars=features,
        y_vars=['floodfrac'],
        hue="floodfrac",
        palette='YlGnBu',
        kind='scatter',
    )
    outfile = join(imdir, "eda", f"horiz_pairplot_binary{binary}.png")
    fig = pplot.fig
    fig.savefig(outfile)

In [6]:
# For every event, draw a column of maps (floodfrac first, then one panel per
# feature) and write it to <imdir>/datasets/<event>.png.
runme = False

# re-import viz_utils so edits to the helper module are picked up
reload(viz_utils)

events = list(gdf["event"].unique())
nevents = len(events)

if runme:
    npanels = nfeatures + 1  # one extra panel for floodfrac
    pbar = tqdm(events)
    for event in pbar:
        pbar.set_description("Processing %s" % event)
        gdf_event = gdf[gdf['event'] == event]
        fig, axs = plt.subplots(npanels, figsize=(2, 2 * npanels))

        # top panel: observed flood fraction
        flood_ax = axs[0]
        flood_cmap = viz_utils.cmap_key["floodfrac"][0]
        gdf_event.plot("floodfrac", ax=flood_ax, legend=True, cmap=flood_cmap)
        flood_ax.set_xticks([])
        flood_ax.set_yticks([])
        flood_ax.set_title(f"Flood: {event}")

        # remaining panels: one per feature, in feature-list order
        for panel_idx, feature in enumerate(features, start=1):
            panel = axs[panel_idx]

            # colormap name plus the colours used below / above the plotted range
            cmap_name, under, over = viz_utils.cmap_key[feature]
            cmap = plt.get_cmap(cmap_name).copy()
            cmap.set_under(under, 1.0)
            cmap.set_over(over, 1.0)

            vmin, vmax = viz_utils.cmap_range[feature]

            gdf_event.plot(feature, ax=panel, cmap=cmap,
                           vmin=vmin, vmax=vmax, legend=True)

            panel.set_title(feature)
            panel.set_xticks([])
            panel.set_yticks([])

        outfile = join(imdir, "datasets", f"{event}.png")
        fig.savefig(outfile, bbox_inches="tight")
        # release the figure so memory does not grow across events
        plt.close(fig)

In [None]:
# Correlation heatmap of the numeric columns of gdf.
# Switched off here; set viz = True to regenerate the figure.
viz = False

if viz:
    # Correlate numeric/bool columns only: gdf also carries non-numeric
    # columns (e.g. 'event'), which DataFrame.corr() rejects on pandas >= 2.0.
    corr = gdf.select_dtypes(include=["number", "bool"]).corr()

    # tick labels, taken from the correlated columns so they always match the matrix
    column_strs = [str(col) for col in corr.columns]

    fig, ax = plt.subplots(1, 1, figsize=(12, 10))

    cmat = sns.heatmap(corr, annot=True, ax=ax, linewidths=.5, fmt='.1g', cmap="YlGnBu_r")
    cmat.set(title="Correlation matrix\n", xticklabels=column_strs, yticklabels=column_strs)

    fig.savefig(join(imdir, f"correlation matrix binary{binary}.png"))

In [None]:
viz = False

if viz:
    sns.set(style="ticks")
    pplot = sns.pairplot(gdf, hue="floodfrac", palette='YlGnBu', kind='scatter', diag_kind='kde');

    fig = pplot.fig
    fig.savefig(join(imdir, f"pairplot_binary{binary}.png"))
    
    !say finished making pairplot

## Flood fraction

In [35]:
from sklearn.metrics import confusion_matrix as cm

def csi(y_true, y_pred):
    """Critical success index for binary (0/1) labels.

    CSI = TP / (TP + FP + FN); true negatives do not enter the score.

    Parameters
    ----------
    y_true : array-like of 0/1
        Observed labels.
    y_pred : array-like of 0/1
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        The critical success index, or ``nan`` when there are no true
        positives, false positives or false negatives (score undefined).
    """
    # labels=[0, 1] pins the matrix to 2x2 even if only one class occurs in
    # the inputs; without it a single-class input yields a 1x1 matrix and the
    # indexing below fails. Layout is (row, col) = (true, pred).
    (_, fp), (fn, tp) = cm(y_true, y_pred, labels=[0, 1])

    denominator = tp + fp + fn
    if denominator == 0:
        # nothing observed or predicted as positive: CSI is undefined
        return float("nan")
    return tp / denominator


# Threshold both columns at 0.5 to get 0/1 labels, then score the
# aqueduct_100 layer against the observed flood fraction.
floodfrac = list(gdf['floodfrac'].gt(0.5).astype(int))
aqueduct_100 = list(gdf['aqueduct_100'].gt(0.5).astype(int))

csi_score = csi(y_true=floodfrac, y_pred=aqueduct_100)
print(f"CSI: {csi_score:.4f}")

CSI: 0.1855
