# Calculating virtual time series with ``EOCube``

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from pathlib import Path
import rasterio
import seaborn as sns

# from eobox.raster import MultiRasterIO
from eobox import sampledata
from eobox.raster import cube



def get_sampledata(year):
    dataset = sampledata.get_dataset("lsts")
    layers_paths = [Path(p) for p in dataset["raster_files"]]
    layers_df = pd.Series([p.stem for p in layers_paths]).str.split("_", expand=True) \
    .rename({0: "sceneid", 1:"band"}, axis=1)

    layers_df["date"] = pd.to_datetime(layers_df.sceneid.str[9:16], format="%Y%j")
    layers_df["uname"] = layers_df.sceneid.str[:3] + "_" + layers_df.date.dt.strftime("%Y-%m-%d") + "_" + layers_df.band.str[::] 
    layers_df["path"] = layers_paths

    layers_df = layers_df.sort_values(["date", "band"])
    layers_df = layers_df.reset_index(drop=True)

    layers_df_year = layers_df[(layers_df.date >= str(year)) & (layers_df.date < str(year+1))]
    layers_df_year = layers_df_year.reset_index(drop=True)
    return layers_df_year


## Input data constraints

With ``EOCube`` we can for example calculate temporal features from stacks of data that do not fit in memory.

We accept the following data constrains:

* All layers, i.e. a single band of an acquisition, are available as single band GDAL readable raster files.
* All files represent raster that are exactly spatially aligned with the same resolution, extend, projection, number of rows and columns etc. 
* (If all the above holds true you might band-subset or cut the data as VRTs to meet the spatial extend and single layer file constraint.
* The data needs to come in a pandas dataframe containing at least the columns *path* containing the respective information. 

For calculating temporal features *date*, *band*, *sceneid* - as in the following sample dataset - are useful additional columns that we can access later in your custom code.

In [2]:
df_layers = get_sampledata(2008)
display(df_layers.head())
df_layers.band.value_counts()

Unnamed: 0,sceneid,band,date,uname,path
0,LT50350322008110PAC01,b3,2008-04-19,LT5_2008-04-19_b3,/home/ben/anaconda/envs/eocube/lib/python3.6/s...
1,LT50350322008110PAC01,b4,2008-04-19,LT5_2008-04-19_b4,/home/ben/anaconda/envs/eocube/lib/python3.6/s...
2,LT50350322008110PAC01,b5,2008-04-19,LT5_2008-04-19_b5,/home/ben/anaconda/envs/eocube/lib/python3.6/s...
3,LT50350322008110PAC01,fmask,2008-04-19,LT5_2008-04-19_fmask,/home/ben/anaconda/envs/eocube/lib/python3.6/s...
4,LE70350322008118EDC00,b3,2008-04-27,LE7_2008-04-27_b3,/home/ben/anaconda/envs/eocube/lib/python3.6/s...


b3       23
b5       23
b4       23
fmask    23
Name: band, dtype: int64

## Processing Steps

Often we want to work with dataframes.
For example, we have the following function which calculates virtual time series from a dataframe where the pixels /samples are in the rows and the layers are in the columns. 

In [3]:
def create_virtual_time_series(df, rule="2W", verbose=False):
    
    if verbose:
        print("Shape of input dataframe:" ,df.shape)
    
    transpose_before_return = False
    if isinstance(df.columns, pd.DatetimeIndex):
        transpose_before_return = True
        df = df.transpose()
        
    # define the virtual points in the time series index
    idx_virtual = df.resample(rule).asfreq().index
    if verbose:
        print("Length of virtual time series:" ,len(idx_virtual))
    # add the existing time series points to the virtual time series index 
    idx_virtual_and_data = idx_virtual.append(df.index).unique()
    idx_virtual_and_data = idx_virtual_and_data.sort_values()
    if verbose:
        print("Length of virtual and data time series:" ,len(idx_virtual_and_data))
    # extend the time series data such that it contains all existing and virtual time series points 
    df = df.reindex(index=idx_virtual_and_data)
    # interpolate between dates and forward/backward fill edges with closest values
    df = df.interpolate(method='time')
    df = df.bfill()
    df = df.ffill()
    df = df.loc[idx_virtual]
    
    if transpose_before_return:
        df = df.transpose()
    if verbose:
        print("Shape of output dataframe:" ,df.shape)
    return df

If we want to use this data with ``EOCube`` we have to do the following processing steps:

1. get a ``EOCubeChunk`` object read the data with ``EOCubeChunk.read_data()``

2. convert the data with ``EOCubeChunk.convert_data_to_dataframe()``

3. run custom code, here: calculate the virtual time series with the following steps
    
    1. create a subset such that the fmask data is in its own dataframe

    2. get a dataframe with booleans such that valid pixels (clear sky observations) are ``True`` and others are ``False``
    
    3. create a subsets of the dataframe such that the data of each band is in its own dataframe 
    
    4. apply the boolean mask to the band dataframes such that invalid pixels are nan
    
    5. calculate the virtual time series for each band, i.e. apply the ``create_virtual_time_series`` function above
    
    6. stack the results together with meaningful column / layer names

4. create destination files paths

5. save the results with ``EOCubeChunk.write_dataframe()``

Let's do it step by step based on one chunk. Later we will then wrap everythin in one function and run it for all chunks.

## Development of code for one chunk

In [4]:
# later we will make this more generic and this will be some of the arguments
variable_names = ["b3", "b4", "b5"]
qa_name = "fmask"
qa_valids = [0, 1]

eoc = cube.EOCube(df_layers, chunksize=2**5)

# 1.
eoc_chunk = eoc.get_chunk(0)
eoc_chunk.read_data()
# 2.
eoc_chunk = eoc_chunk.convert_data_to_dataframe()
eoc_chunk.data.head(3)

uname,LT5_2008-04-19_b3,LT5_2008-04-19_b4,LT5_2008-04-19_b5,LT5_2008-04-19_fmask,LE7_2008-04-27_b3,LE7_2008-04-27_b4,LE7_2008-04-27_b5,LE7_2008-04-27_fmask,LT5_2008-05-05_b3,LT5_2008-05-05_b4,...,LT5_2008-10-28_b5,LT5_2008-10-28_fmask,LE7_2008-11-21_b3,LE7_2008-11-21_b4,LE7_2008-11-21_b5,LE7_2008-11-21_fmask,LE7_2008-12-07_b3,LE7_2008-12-07_b4,LE7_2008-12-07_b5,LE7_2008-12-07_fmask
0,4467,5026,548,3,3907,4862,584,3,3707,4329,...,1788,0,-9999,-9999,-9999,255,2376,3708,665,3
1,4297,4686,571,3,16000,4683,584,3,3680,4297,...,1650,0,-9999,-9999,-9999,255,2340,3657,711,3
2,4242,4720,548,3,16000,4358,562,3,3574,4133,...,1512,0,-9999,-9999,-9999,255,2233,3251,665,3


In [5]:
# 3.A.
ilocs_qa = np.where((eoc_chunk.df_layers["band"] == qa_name).values)[0]
df_qa = eoc_chunk.data.iloc[:, ilocs_qa]
df_qa.columns = eoc_chunk.df_layers["date"].iloc[ilocs_qa]
# 3.B.
df_clearsky = df_qa.isin(qa_valids)
df_clearsky.head(2)

date,2008-04-19 00:00:00,2008-04-27 00:00:00,2008-05-05 00:00:00,2008-05-21 00:00:00,2008-05-29 00:00:00,2008-06-06 00:00:00,2008-06-14 00:00:00,2008-06-22 00:00:00,2008-06-30 00:00:00,2008-07-08 00:00:00,...,2008-08-09 00:00:00,2008-08-17 00:00:00,2008-08-25 00:00:00,2008-09-02 00:00:00,2008-09-18 00:00:00,2008-09-26 00:00:00,2008-10-12 00:00:00,2008-10-28 00:00:00,2008-11-21 00:00:00,2008-12-07 00:00:00
0,False,False,False,True,False,True,False,True,True,True,...,True,False,True,True,True,False,False,True,False,False
1,False,False,True,True,False,True,False,True,True,True,...,True,False,True,True,True,False,False,True,False,False


In [6]:
# 3.C. - F.
result = []
result_layernames = []
for var in variable_names:
    ilocs_var = np.where((eoc_chunk.df_layers["band"] == var).values)[0]
    df_var = eoc_chunk.data.iloc[:, ilocs_var]
    df_var.columns = eoc_chunk.df_layers["date"].iloc[ilocs_var]
    df_var = df_var.where(df_clearsky, other=np.nan)
    
    df_result = create_virtual_time_series(df_var, rule="2W", verbose=False)
    df_result = df_result.astype(np.int16)
    layernames_result = "ls-icoll-2008-a" + "__" + "vts" + "__" + df_result.columns.strftime("%Y-%m-%d") + "__" + var
    df_result.columns = layernames_result
    result.append(df_result)

# can we do now if we have a lot of data:
# del eoc_chunk.data
    
result = pd.concat(result, axis=1)
result.head(2)

Unnamed: 0,ls-icoll-2008-a__vts__2008-04-20__b3,ls-icoll-2008-a__vts__2008-05-04__b3,ls-icoll-2008-a__vts__2008-05-18__b3,ls-icoll-2008-a__vts__2008-06-01__b3,ls-icoll-2008-a__vts__2008-06-15__b3,ls-icoll-2008-a__vts__2008-06-29__b3,ls-icoll-2008-a__vts__2008-07-13__b3,ls-icoll-2008-a__vts__2008-07-27__b3,ls-icoll-2008-a__vts__2008-08-10__b3,ls-icoll-2008-a__vts__2008-08-24__b3,...,ls-icoll-2008-a__vts__2008-08-10__b5,ls-icoll-2008-a__vts__2008-08-24__b5,ls-icoll-2008-a__vts__2008-09-07__b5,ls-icoll-2008-a__vts__2008-09-21__b5,ls-icoll-2008-a__vts__2008-10-05__b5,ls-icoll-2008-a__vts__2008-10-19__b5,ls-icoll-2008-a__vts__2008-11-02__b5,ls-icoll-2008-a__vts__2008-11-16__b5,ls-icoll-2008-a__vts__2008-11-30__b5,ls-icoll-2008-a__vts__2008-12-14__b5
0,1274,1274,1274,845,524,263,258,436,882,376,...,1161,1246,1247,1495,1606,1716,1788,1788,1788,1788
1,3680,3680,1790,832,500,247,301,436,770,368,...,1072,1287,1214,1437,1517,1598,1650,1650,1650,1650


In [7]:
# 4.
dst_basedir = Path("./xxx_uncontrolled_vts_develop") / "ls-icoll-2008-a" / "vts"
dst_paths = [dst_basedir / (col + ".vrt") for col in result.columns]

In [8]:
# 5.
eoc_chunk.write_dataframe(result, dst_paths)

What we now got is the following files:

In [9]:
created_files = list(dst_basedir.rglob("*.tif"))
print(f"{len(created_files)} files created. Showing the three first paths:")
created_files[:3]

54 files created. Showing the three first paths:


[PosixPath('xxx_uncontrolled_vts_develop/ls-icoll-2008-a/vts/xchunks_ls-icoll-2008-a__vts__2008-10-19__b3/ls-icoll-2008-a__vts__2008-10-19__b3_ji-0.tif'),
 PosixPath('xxx_uncontrolled_vts_develop/ls-icoll-2008-a/vts/xchunks_ls-icoll-2008-a__vts__2008-11-02__b4/ls-icoll-2008-a__vts__2008-11-02__b4_ji-0.tif'),
 PosixPath('xxx_uncontrolled_vts_develop/ls-icoll-2008-a/vts/xchunks_ls-icoll-2008-a__vts__2008-08-10__b5/ls-icoll-2008-a__vts__2008-08-10__b5_ji-0.tif')]

The paths and chunk files are somehow not the ones that we defined. 
But this because we only processed one chunk. 
Actually we want to wrap all the steps 1-5 in a function and pass it to ``EOCube`` which will then calculate all the chunks and also create the VRTs for us.

## Wrapping all steps in one function

 Process all chunks with ``EOCube``

Now we have used the ``EOCubeChunk`` class to develop our code and check if it works for the first chunk. 
Now it is time to wrap-up everything and run it for the whole data.

So lets wrap our custom steps in a function that takes a ``EOCubeChunk`` as a first argument and an arbitrary number of key word arguments and runs through the whole processing steps including the writing. 

Note, that it is important that the paths of the resulting layers are returned by the function such that the merged layers (VRTs) can be written. 

In [10]:
def eoc_chunk_calculate_vts(eoc_chunk, variable_names, qa_name, qa_valids, vts_resolution, fun_dst_path):

    def create_virtual_time_series(df, rule="2W", verbose=False):

        if verbose:
            print("Shape of input dataframe:", df.shape)

        transpose_before_return = False
        if isinstance(df.columns, pd.DatetimeIndex):
            transpose_before_return = True
            df = df.transpose()

        # define the virtual points in the time series index
        idx_virtual = df.resample(rule).asfreq().index
        if verbose:
            print("Length of virtual time series:" ,len(idx_virtual))
        # add the existing time series points to the virtual time series index 
        idx_virtual_and_data = idx_virtual.append(df.index).unique()
        idx_virtual_and_data = idx_virtual_and_data.sort_values()
        if verbose:
            print("Length of virtual and data time series:" ,len(idx_virtual_and_data))
        # extend the time series data such that it contains all existing and virtual time series points 
        df = df.reindex(index=idx_virtual_and_data)
        # interpolate between dates and forward/backward fill edges with closest values
        df = df.interpolate(method='time')
        df = df.bfill()
        df = df.ffill()
        df = df.loc[idx_virtual]

        if transpose_before_return:
            df = df.transpose()
        if verbose:
            print("Shape of output dataframe:", df.shape)
        return df
    # 1.
    eoc_chunk.read_data()
    
    # 2.
    eoc_chunk = eoc_chunk.convert_data_to_dataframe()
                      
    # 3.A.
    ilocs_qa = np.where((eoc_chunk.df_layers["band"] == qa_name).values)[0]
    df_qa = eoc_chunk.data.iloc[:, ilocs_qa]
    df_qa.columns = eoc_chunk.df_layers["date"].iloc[ilocs_qa]
    # 3.B.
    df_clearsky = df_qa.isin(qa_valids)
    df_clearsky.head(2)
                      
    # 3.C.- 3.F.
    result = []
    result_layernames = []
    for var in variable_names:
        ilocs_var = np.where((eoc_chunk.df_layers["band"] == var).values)[0]
        df_var = eoc_chunk.data.iloc[:, ilocs_var]
        df_var.columns = eoc_chunk.df_layers["date"].iloc[ilocs_var]
        df_var = df_var.where(df_clearsky, other=np.nan)

        df_result = create_virtual_time_series(df_var, rule=vts_resolution, verbose=False)
        df_result = df_result.astype(np.int16)
        df_result.columns = df_result.columns.strftime("%Y-%m-%d") + "__" + var
        result.append(df_result)
    result = pd.concat(result, axis=1)

    # 4.
    dst_paths = [fun_dst_path(colname) for colname in result.columns]
    
    # 5.
    eoc_chunk.write_dataframe(result, dst_paths)
    return dst_paths

## Process all chunks

In [11]:
variable_names = ["b3", "b4", "b5"]
qa_name = "fmask"
qa_valids = [0, 1]
vts_resolution = "2W"
def fun_dst_path(result_df_column_name):
    dst_basedir = Path("./xxx_uncontrolled_vts") / "ls-icoll-2008-a" / "vts"
    dst_path = dst_basedir / ("ls-icoll-2008-a" + "__" + "vts" + "__" + result_df_column_name + ".vrt")
    return dst_path

eoc.apply_and_write(fun=eoc_chunk_calculate_vts, 
                   variable_names = ["b3", "b4", "b5"],
                   qa_name = "fmask",
                   qa_valids = [0, 1],
                   vts_resolution = "2W",
                   fun_dst_path = fun_dst_path)


100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


What we get are the VRTs, each of which being a mosaic of the chunks.

In [12]:
created_vrts = list(Path("./xxx_uncontrolled_vts").rglob("*.vrt"))
print(f"{len(created_vrts)} files created. Showing the three first paths:")
created_vrts[:3]

54 files created. Showing the three first paths:


[PosixPath('xxx_uncontrolled_vts/ls-icoll-2008-a/vts/ls-icoll-2008-a__vts__2008-09-21__b4.vrt'),
 PosixPath('xxx_uncontrolled_vts/ls-icoll-2008-a/vts/ls-icoll-2008-a__vts__2008-06-01__b4.vrt'),
 PosixPath('xxx_uncontrolled_vts/ls-icoll-2008-a/vts/ls-icoll-2008-a__vts__2008-09-07__b4.vrt')]

## Running a classification

Code similar to the *Image Classification with MultiRasterIO* tutorial but prediction based on ``EOCube``.

In [13]:
from eobox import raster

import geopandas as gpd

from sklearn.ensemble import RandomForestClassifier

### Extract reference data

In [15]:
src_vector = "./refdata.gpkg"
burn_attribute = "pid"  # should be unique for the polygons and not contain zero
src_raster = created_vrts
feature_names = [Path(src).stem for src in src_raster]
extraction_dir = Path("./xxx_uncontrolled_vts_classi/traindata")

raster.extract(src_vector=src_vector,
                    burn_attribute=burn_attribute,
                    src_raster=src_raster,
                    dst_names=feature_names,
                    dst_dir=extraction_dir)

0

In [16]:
df_extracted = raster.load_extracted(extraction_dir)
gdf = gpd.read_file(src_vector)
df_extracted = df_extracted.merge(gdf[["pid", "cid"]], how="left", left_on="aux_vector_pid", right_on="pid")
df_extracted.head(2)

Unnamed: 0,ls-icoll-2008-a__vts__2008-05-18__b3,ls-icoll-2008-a__vts__2008-08-10__b4,ls-icoll-2008-a__vts__2008-10-19__b4,ls-icoll-2008-a__vts__2008-05-18__b5,ls-icoll-2008-a__vts__2008-11-02__b4,ls-icoll-2008-a__vts__2008-04-20__b3,ls-icoll-2008-a__vts__2008-11-02__b3,ls-icoll-2008-a__vts__2008-07-27__b4,ls-icoll-2008-a__vts__2008-09-21__b3,ls-icoll-2008-a__vts__2008-06-01__b3,...,ls-icoll-2008-a__vts__2008-05-04__b5,ls-icoll-2008-a__vts__2008-11-30__b4,ls-icoll-2008-a__vts__2008-11-16__b5,ls-icoll-2008-a__vts__2008-07-13__b5,ls-icoll-2008-a__vts__2008-09-07__b5,ls-icoll-2008-a__vts__2008-08-10__b3,ls-icoll-2008-a__vts__2008-05-04__b4,ls-icoll-2008-a__vts__2008-11-30__b5,pid,cid
0,971,1625,1088,728,869,1169,296,2022,336,717,...,572,869,475,842,683,282,2165,475,6,2
1,1209,1561,1237,836,1023,1277,381,2064,343,897,...,572,1023,510,938,777,284,2329,510,6,2


### Build a Model

Lets train a Random Forest model with the whole reference data on four Sentinel-2 bands.

With the extracted data that can be done as follows:

In [17]:
y = df_extracted["cid"].values
X = df_extracted[feature_names].values
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Create a prediction function

Often we want as an outcome the class or prediction layer, probability layers and confidence layers.
We can derive these classification outcomes as follows: 

In [22]:
df = df_extracted[feature_names]
def predict_extended(df, clf):
    
    def convert_to_uint8(arr):
        return arr.astype(np.uint8)
    
    probs = clf.predict_proba(df.values)
    pred_idx = probs.argmax(axis=1)
    pred = np.zeros_like(pred_idx).astype(np.uint8)
    for i in range(probs.shape[1]):
        pred[pred_idx==i] = clf.classes_[i]
    # get reliability layers (maximum probability and margin, i.e. maximum probability minus second highest probability)
    probs_sorted = np.sort(probs, axis=1)
    max_prob = probs_sorted[:,probs_sorted.shape[1]-1]
    margin = probs_sorted[:,probs_sorted.shape[1]-1] - probs_sorted[:,probs_sorted.shape[1]-2]
    
    probs = convert_to_uint8(probs*100)
    max_prob = convert_to_uint8(max_prob*100)
    margin = convert_to_uint8(margin*100)
    
    ndigits = len(str(max(clf.classes_)))
    prob_names = [f"prob_{cid:0{ndigits}d}" for cid in clf.classes_]
    df_result = pd.concat([pd.DataFrame({"pred":pred, "max_prob":max_prob, "margin":margin}),
                           pd.DataFrame(probs, columns=prob_names)], axis=1)
    return df_result

df_result = predict_extended(df_extracted[feature_names], clf)
df_result.head()              

Unnamed: 0,pred,max_prob,margin,prob_1,prob_2
0,2,93,86,7,93
1,2,96,92,4,96
2,2,100,100,0,100
3,2,98,96,2,98
4,2,86,72,14,86


### Predict a chunk

First we need an ``EOCube`` object with the feature used in the model.
Then we get the data of a chunk to check if our function works.

In [30]:
df_features = pd.DataFrame({"uname": feature_names,
                            "path": created_vrts})
eoc_vts = cube.EOCube(df_features, chunksize=2**5)

eoc_chunk_vts = eoc_vts.get_chunk(0)
eoc_chunk_vts.read_data()
eoc_chunk_vts = eoc_chunk_vts.convert_data_to_dataframe()
display(eoc_chunk_vts.data.head(3))
print("Shape of eoc_chunk_vts.data : ", eoc_chunk_vts.data.shape)
df_result = predict_extended(eoc_chunk_vts.data, clf)
display(df_result.head())

uname,ls-icoll-2008-a__vts__2008-09-21__b4,ls-icoll-2008-a__vts__2008-06-01__b4,ls-icoll-2008-a__vts__2008-09-07__b4,ls-icoll-2008-a__vts__2008-12-14__b3,ls-icoll-2008-a__vts__2008-07-13__b4,ls-icoll-2008-a__vts__2008-04-20__b3,ls-icoll-2008-a__vts__2008-09-21__b5,ls-icoll-2008-a__vts__2008-08-10__b3,ls-icoll-2008-a__vts__2008-04-20__b4,ls-icoll-2008-a__vts__2008-06-15__b4,...,ls-icoll-2008-a__vts__2008-06-01__b5,ls-icoll-2008-a__vts__2008-05-04__b4,ls-icoll-2008-a__vts__2008-05-18__b3,ls-icoll-2008-a__vts__2008-07-13__b5,ls-icoll-2008-a__vts__2008-11-30__b5,ls-icoll-2008-a__vts__2008-11-16__b5,ls-icoll-2008-a__vts__2008-07-27__b5,ls-icoll-2008-a__vts__2008-10-05__b3,ls-icoll-2008-a__vts__2008-08-10__b5,ls-icoll-2008-a__vts__2008-08-24__b5
0,3013,2679,3119,894,2696,1274,1495,882,2215,3024,...,1795,2215,1274,959,1788,1788,1227,655,1161,1246
1,2791,2676,2940,767,2658,3680,1437,770,4297,2972,...,1704,4297,1790,1025,1650,1650,1226,590,1072,1287
2,2654,2794,2733,680,2631,3574,1243,543,4133,2925,...,1653,4133,2182,1089,1512,1512,1217,771,932,1254


Shape of eoc_chunk_vts.data :  (1024, 54)


Unnamed: 0,pred,max_prob,margin,prob_1,prob_2
0,2,72,43,28,72
1,2,83,65,17,83
2,2,85,70,15,85
3,2,77,54,23,77
4,2,78,56,22,78


In [35]:
def fun_dst_path(colname):
    dst_path = Path(f"./xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__{colname}.vrt")
    return dst_path

def eoc_chunk_classifier(eoc_chunk, clf, fun_dst_path):

    def predict_extended(df, clf):
        probs = clf.predict_proba(df.values)
        pred_idx = probs.argmax(axis=1)
        pred = np.zeros_like(pred_idx).astype(np.uint8)
        for i in range(probs.shape[1]):
            pred[pred_idx==i] = clf.classes_[i]
        # get reliability layers (maximum probability and margin, i.e. maximum probability minus second highest probability)
        probs_sorted = np.sort(probs, axis=1)
        max_prob = probs_sorted[:,probs_sorted.shape[1]-1]
        margin = probs_sorted[:,probs_sorted.shape[1]-1] - probs_sorted[:,probs_sorted.shape[1]-2]

        ndigits = len(str(max(clf.classes_)))
        prob_names = [f"prob_{cid:0{ndigits}d}" for cid in clf.classes_]
        df_result = pd.concat([pd.DataFrame({"pred":pred, "max_prob":max_prob*100, "margin":margin*100}),
                               pd.DataFrame(probs*100, columns=prob_names)], axis=1)
        df_result = df_result.astype(np.uint8)  # BE CAREFUL IF THIS DOES NOT FIT YOUR CLASS IDs
        return df_result
                              
    eoc_chunk.read_data()
    eoc_chunk = eoc_chunk.convert_data_to_dataframe()
    result = predict_extended(eoc_chunk.data, clf)
    dst_paths = [fun_dst_path(colname) for colname in result.columns]
    eoc_chunk.write_dataframe(result, dst_paths)
    return dst_paths


In [36]:
eoc_chunk_classifier(eoc_chunk_vts, clf, fun_dst_path)

[PosixPath('xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__pred.vrt'),
 PosixPath('xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__max_prob.vrt'),
 PosixPath('xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__margin.vrt'),
 PosixPath('xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__prob_1.vrt'),
 PosixPath('xxx_uncontrolled_vts_classi/ls-icoll-2008-a__vts__b3-4-5__prob_2.vrt')]

In [37]:
eoc_vts.apply_and_write(fun=eoc_chunk_classifier,
                        clf=clf, 
                        fun_dst_path=fun_dst_path)

100%|██████████| 4/4 [00:00<00:00, 10.08it/s]
