# Features - statistical metrics - DEV

This code processes all variables at once. 

Advantage: The QA layer has to be loaded only once

Disadvantage: Parallelization, and checking for / skipping over existing results of single bands, is harder to do with Snakemake. 

**TODO**: Create a Snakemake task.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# - - - - - - - - - - - - - - - - - - - - 
# DEFAULT IMPORTS - IN ALL NOTEBOOKS
from src import configs

prjconf = configs.ProjectConfigParser()

# - - - - - - - - - - - - - - - - - - - - 
# NOTEBOOK SPECIFIC IMPORTS
import numpy as np
from pathlib import Path
import pandas as pd
import rasterio
from tqdm import tqdm

from eobox.raster import cube
from eobox.raster import gdalutils

# Tile names as stored in the project config (one space-separated string).
tilenames = prjconf.get("Params", "tiles").split(" ")

# The configured value is replaced right away by this fixed list of tiles.
tilenames = [
    "32UNU",
    "32UPU",
    "32UQU",
    "33UUP",
    "32TPT",
    "32TQT",
    "33TUN",
]

# Display the tiles that will be processed.
tilenames

['32UNU', '32UPU', '32UQU', '33UUP', '32TPT', '32TQT', '33TUN']

## Inputs

### Parameters for the script

In [2]:
# Scene collection to process; the name is "scoll" plus the zero-padded id.
scoll_id = 1
scoll_name = "scoll{:02d}".format(scoll_id)

# Variables to compute the statistical metrics for.
variables = ["Red", "NIR", "SWIR1", "SWIR2"]
# QA layer name and QA values; both are handed to the scene collection
# as its `qa` / `qa_valid` arguments further down in this notebook.
qa = "CLEAR"
qa_valid = [1]

# Echo the main settings as aligned "name: value" lines.
for _label, _value in (
    ("scoll_name", scoll_name),
    ("variables", variables),
    ("qa", qa),
):
    print(f"{_label:12s}: {_value}")


scoll_name  : scoll01
variables   : ['Red', 'NIR', 'SWIR1', 'SWIR2']
qa          : CLEAR


### Files

In Snakemake the list would be created by wildcard catching the tiles.

In [3]:
# Map each tile to the CSV file describing its scene collection:
# <scene_colls dir>/<tile>/df_<scoll_name>.csv
scoll_csv_name = f"df_{scoll_name}.csv"
scolls = {
    tile: prjconf.get_path("Raw", "scene_colls") / tile / scoll_csv_name
    for tile in tilenames
}
scolls

{'32UNU': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/32UNU/df_scoll01.csv'),
 '32UPU': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/32UPU/df_scoll01.csv'),
 '32UQU': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/32UQU/df_scoll01.csv'),
 '33UUP': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/33UUP/df_scoll01.csv'),
 '32TPT': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/32TPT/df_scoll01.csv'),
 '32TQT': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/32TQT/df_scoll01.csv'),
 '33TUN': PosixPath('/home/ben/Devel/Projects/classify-hls/data/raw/scene_collections/33TUN/df_scoll01.csv')}

## Outputs

    # NOTE(review): this snippet appears to be indented markdown text rather
    # than an executed cell (there is no `In [n]:` marker in front of it).
    # NOTE(review): `metrics` is not defined in any cell shown in this
    # notebook - it has to be defined before this snippet can run.
    # For every tile: fetch the destination paths from the project config
    # helper (indexed below by variable) and print the first and last path
    # of each variable.
    dst_paths = {}
    for tile in tilenames:
        dst_paths[tile] = prjconf.get_paths_features_stats_regular_raster(scoll_name, tile, variables, metrics, as_dict=True)
        for var in variables:
            print(f"First and last file (of {len(dst_paths[tile][var])}) of {(tile + ' ' + var)}")
            # NOTE(review): the string concatenation assumes the paths are
            # `str`, not `Path` objects - TODO confirm.
            print("   " + dst_paths[tile][var][0])
            print("   " + dst_paths[tile][var][-1])
            print()

In [4]:
# Glob templates selecting the metric layers of one variable. Their order is
# the band order of the stacked VRT: min, percentiles, max, percentile
# differences, mean, std.
layer_glob_templates = (
    "*_min_*{var}*",
    "*_p??_*{var}*",
    "*_max_*{var}*",
    "*_p??-p??_*{var}*",
    "*_mean_*{var}*",
    "*_std_*{var}*",
)

# For every tile: build the scene collection cube, create the statistical
# metric layers for all variables at once, then stack the metric layers of
# each variable into one VRT.
for tile in tilenames:
    print("*" * 100)
    print(tile)

    # Layers of all variables plus the QA layer for this tile.
    scoll_layers = prjconf.get_layer_df_of_scene_collection(scoll_name, variables + [qa], tile)
    # Sanity checks: the layer table must cover exactly one tile and one product.
    assert len(scoll_layers["tile"].unique()) == 1
    assert len(scoll_layers["product"].unique()) == 1
    # here L30 & S30 would also make sense but then we need to change the product string below
    # tile = scoll_layers["tile"].unique()[0]
    # product = scoll_layers["product"].unique()[0]

    # NOTE(review): the keyword is spelled `return_patter`; the recorded cell
    # output shows that this call succeeds, so it presumably matches the
    # project API - confirm before renaming.
    dst_pattern = prjconf.get_paths_features_stats_regular_raster(
        scoll_name, tile, variables, as_dict=True, return_patter=True
    )
    print(dst_pattern)

    scoll = cube.EOCubeSceneCollection(
        df_layers=scoll_layers,
        chunksize=2**9,
        variables=variables,
        qa=qa,
        qa_valid=qa_valid,
    )

    # According to the recorded output, chunks that were already processed
    # are skipped, so an interrupted run can be resumed.
    scoll.create_statistical_metrics(
        percentiles=[.05, .1, .25, .5, .75, .9, .95],
        iqr=True,
        diffs=True,
        dst_pattern=dst_pattern,
        dtypes="int16",
        compress='lzw',
        nodata=None,
        num_workers=6)

    ## Create VRTs
    # Create a layer stack (VRT) of all statistical metrics for each variable.
    dst_dir = prjconf.get_path("Processed", "raster", tile=tile) / scoll_name
    dst_dir_vsts_stack = prjconf.get_path("Processed", "raster", tile=tile) / "VRTs" / "ts_per_band" / tile / f"{scoll_name}"
    dst_dir_vsts_stack.mkdir(parents=True, exist_ok=True)

    for var in scoll.variables:
        print("*" * 80)
        print(var)
        # Collect the metric layers group by group. Every group is sorted
        # because glob yields matches in arbitrary order; sorting keeps the
        # band order of the VRT reproducible.
        input_file_list = []
        for template in layer_glob_templates:
            input_file_list += sorted(Path(dst_dir).glob(template.format(var=var)))
        output_file = Path(dst_dir_vsts_stack) / f"{tile.lower()}__{scoll_name}__stats__{var}.vrt"
        print(output_file)
        print("Layers:")
        print("\n".join(p.stem for p in input_file_list))
        gdalutils.buildvrt(input_file_list, output_file, relative=True, separate=True)


****************************************************************************************************
32UNU
/home/ben/Devel/Projects/classify-hls/data/processed/L3/raster/32UNU/scoll01/32unu__scoll01__stats__{metric}__{var}.vrt
Red: 23 / 64 chunks already processed and skipped.
NIR: 23 / 64 chunks already processed and skipped.


  0%|          | 0/4 [00:00<?, ?it/s]

SWIR1: 23 / 64 chunks already processed and skipped.
SWIR2: 22 / 64 chunks already processed and skipped.


5it [59:37, 778.40s/it]                       

KeyboardInterrupt: 

In [None]:
1