# Performance comparisons

Comparison of reead and computational performances of dask dataframes derived from npy files and from parquet.

For these comparisons it is assumed that some profiling magics are available. 
The following links might be helpeful:

* https://pynash.org/2013/03/06/timing-and-profiling/
* http://gouthamanbalaraman.com/blog/profiling-python-jupyter-notebooks.html


## Load packages, functions and configs

In [25]:
%load_ext line_profiler
%load_ext memory_profiler

%load_ext autoreload
%autoreload 2
%matplotlib inline

# - - - - - - - - - - - - - - - - - - - - 
# DEFAULT IMPORTS - IN ALL NOTEBOKS
from src import configs

prjconf = configs.ProjectConfigParser()

# - - - - - - - - - - - - - - - - - - - - 
# NOTEBOOK SPECIFIC IMPORTS
import dask.dataframe as dd
import numpy as np
from pathlib import Path
import pandas as pd
import rasterio
import shutil
import sys
from tqdm import tqdm

from eobox.raster.extraction import load_extracted_partitions_dask

def sizeof_fmt(num, suffix='B'):
    """Get a string of the filesize given in bytes in a human readable format."""
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Define the refset to play with

In [2]:
refset_id = "Refset01"
id_vectordata, tilenames = prjconf.get_clean_refset_parameters(refset_id)
dir_refset = prjconf.get_path(refset_id, "rootdir")

print("ALWAYS USE THESE THREE TOGETHER:")
print("id_vectordata  :", id_vectordata)
print("tilenames      :", tilenames)
print("dir_refset     :", dir_refset)

ALWAYS USE THESE THREE TOGETHER:
id_vectordata  : clc_lte50ha
tilenames      : ['32TPT', '32TQT', '32UNU', '32UPU', '32UQU', '33TUN', '33UUP']
dir_refset     : /home/ben/Devel/Projects/classify-hls/data/processed/refset01


## Load dask dataframes

In [76]:
dir_parquet_aux = dir_refset / "optimized_refsets" / "parquet" / "aux"
dir_parquet_features_bands_vts4w = dir_refset / "optimized_refsets" / "parquet" / "features_bands_vts4w"
dir_parquet = dir_refset / "optimized_refsets" / "parquet" / "aux_u_features_bands_vts4w"

In [77]:
src_dirs_tiles = {tile:prjconf.get_path(refset_id, "extracted", tile=tile) for tile in tilenames}
df_npy_aux = load_extracted_partitions_dask(src_dir=src_dirs_tiles,
                                                 global_index_col="aux_index_global", # e.g. "aux_index_global",
                                                 patterns=["aux_*.npy"],
                                                 verbosity=0)
df_npy_features = load_extracted_partitions_dask(src_dir=src_dirs_tiles,
                                                 global_index_col="aux_index_global", # e.g. "aux_index_global",
                                                 patterns=["aux_index_global.npy", "*_vts4w_*.npy"],
                                                 verbosity=0)
df_npy = load_extracted_partitions_dask(src_dir=src_dirs_tiles,
                                                 global_index_col="aux_index_global", # e.g. "aux_index_global",
                                                 patterns=["*.npy"],
                                                 verbosity=0)

df_parquet_aux = dd.read_parquet(str(dir_parquet_aux))
df_parquet_features = dd.read_parquet(str(dir_parquet_features_bands_vts4w))
df_parquet_concat = dd.concat([df_parquet_aux, df_parquet_features], axis=1)
df_parquet = dd.read_parquet(str(dir_parquet))

# df_npy_repart = df_full.repartition(divisions=df_parquet_aux.divisions)

In [27]:
max_index = df_parquet_aux.index.max().compute()

## Loading ramdom rows

### Load a single row

In [104]:
np.random.seed(seed=123)
sample_indices = np.random.randint(0, max_index + 1, 1)

#### AUX

In [86]:
%timeit df_npy_aux.loc[sample_indices].compute()

%timeit df_parquet_aux.loc[sample_indices].compute()

# A test of exchanging the two lines resulted in very similar times:
# PARQUET: 84.7 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# NPY: 126 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# conclusion - sequence of timeit calls in one cell does not matter.

127 ms ± 5.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
87.1 ms ± 5.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### FEATURES

In [87]:
%timeit df_npy_features.loc[sample_indices].compute()

%timeit df_parquet_features.loc[sample_indices].compute()

229 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
95.1 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### AUX + FEATURES

In [88]:
%timeit df_npy.loc[sample_indices].compute()

%timeit df_parquet.loc[sample_indices].compute()

%timeit df_parquet_concat.loc[sample_indices].compute()

306 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
174 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
213 ms ± 7.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load 10 rows

In [103]:
np.random.seed(seed=123)
sample_indices = np.random.randint(0, max_index + 1, 10)

#### AUX

In [90]:
%timeit df_npy_aux.loc[sample_indices].compute()

%timeit df_parquet_aux.loc[sample_indices].compute()

# A test of exchanging the two lines resulted in very similar times:
# PARQUET: 84.7 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# NPY: 126 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# conclusion - sequence of timeit calls in one cell does not matter.

666 ms ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
630 ms ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### FEATURES

In [91]:
%timeit df_npy_features.loc[sample_indices].compute()

%timeit df_parquet_features.loc[sample_indices].compute()

704 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
569 ms ± 21.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### AUX + FEATURES

In [92]:
%timeit df_npy.loc[sample_indices].compute()

%timeit df_parquet.loc[sample_indices].compute()

%timeit df_parquet_concat.loc[sample_indices].compute()

1.21 s ± 21.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.19 s ± 73.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.39 s ± 24.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load 100 rows

In [102]:
np.random.seed(seed=123)
sample_indices = np.random.randint(0, max_index + 1, 100)

#### AUX

In [94]:
%timeit df_npy_aux.loc[sample_indices].compute()

%timeit df_parquet_aux.loc[sample_indices].compute()

# A test of exchanging the two lines resulted in very similar times:
# PARQUET: 84.7 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# NPY: 126 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# conclusion - sequence of timeit calls in one cell does not matter.

734 ms ± 18.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
748 ms ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### FEATURES

In [95]:
%timeit df_npy_features.loc[sample_indices].compute()

%timeit df_parquet_features.loc[sample_indices].compute()

756 ms ± 24.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
680 ms ± 49.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### AUX + FEATURES

In [96]:
%timeit df_npy.loc[sample_indices].compute()

%timeit df_parquet.loc[sample_indices].compute()

%timeit df_parquet_concat.loc[sample_indices].compute()

1.31 s ± 40.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.39 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.71 s ± 48.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load 1000 rows

In [101]:
np.random.seed(seed=123)
sample_indices = np.random.randint(0, max_index + 1, 1000)

#### AUX

In [98]:
%timeit df_npy_aux.loc[sample_indices].compute()

%timeit df_parquet_aux.loc[sample_indices].compute()

# A test of exchanging the two lines resulted in very similar times:
# PARQUET: 84.7 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# NPY: 126 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
# conclusion - sequence of timeit calls in one cell does not matter.

758 ms ± 45.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
771 ms ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### FEATURES

In [100]:
%timeit df_npy_features.loc[sample_indices].compute()

%timeit df_parquet_features.loc[sample_indices].compute()

742 ms ± 9.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
700 ms ± 34.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### AUX + FEATURES

In [99]:
%timeit df_npy.loc[sample_indices].compute()

%timeit df_parquet.loc[sample_indices].compute()

%timeit df_parquet_concat.loc[sample_indices].compute()

1.32 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.39 s ± 22.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.72 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Conclusions

When loading few pixels npy seems slower than parquet.

When loading from concatenated dask dataframes time increases significantly.

When loading more pixels dask is faster.

## Loading pixels of particular polygons

## Machine Learning

In [36]:
df = dd.read_parquet(str(dir_parquet))
df

Unnamed: 0_level_0,pred,max_prob,margin,prob_111,prob_112,prob_121,prob_122,prob_123,prob_124,prob_131,prob_132,prob_133,prob_141,prob_142,prob_211,prob_221,prob_222,prob_231,prob_242,prob_243,prob_311,prob_312,prob_313,prob_321,prob_322,prob_324,prob_331,prob_332,prob_333,prob_411,prob_412,prob_511,prob_512
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
,uint16,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


Let us not care about best practices and splitting the data but just get a small training data set to be able to buld a model and predict everything.

In [17]:
tmp = df[["aux_vector_cid_l3"]].compute()

In [18]:
sample = tmp.groupby("aux_vector_cid_l3").apply(lambda x: x.sample(n=100))
sample.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,aux_vector_cid_l3
aux_vector_cid_l3,aux_index_global,Unnamed: 2_level_1
111,2989015,111
111,913383,111
111,976,111
111,913427,111
111,2987386,111


In [19]:
indices = sample.index.get_level_values("aux_index_global")

In [20]:
feature_cols = [col for col in df.columns if "vts4w" in col]
feature_cols[:3]

['scoll01__vts4w__2018-01-07__NIR',
 'scoll01__vts4w__2018-01-07__Red',
 'scoll01__vts4w__2018-01-07__SWIR1']

In [21]:
trainset = df.loc[indices.values, ["aux_vector_cid_l3"] + feature_cols].compute()

assert set(trainset.index.values) == set(indices.values)
trainset.head()

Unnamed: 0_level_0,aux_vector_cid_l3,scoll01__vts4w__2018-01-07__NIR,scoll01__vts4w__2018-01-07__Red,scoll01__vts4w__2018-01-07__SWIR1,scoll01__vts4w__2018-01-07__SWIR2,scoll01__vts4w__2018-02-04__NIR,scoll01__vts4w__2018-02-04__Red,scoll01__vts4w__2018-02-04__SWIR1,scoll01__vts4w__2018-02-04__SWIR2,scoll01__vts4w__2018-03-04__NIR,...,scoll01__vts4w__2018-10-14__SWIR1,scoll01__vts4w__2018-10-14__SWIR2,scoll01__vts4w__2018-11-11__NIR,scoll01__vts4w__2018-11-11__Red,scoll01__vts4w__2018-11-11__SWIR1,scoll01__vts4w__2018-11-11__SWIR2,scoll01__vts4w__2018-12-09__NIR,scoll01__vts4w__2018-12-09__Red,scoll01__vts4w__2018-12-09__SWIR1,scoll01__vts4w__2018-12-09__SWIR2
aux_index_global,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
976,111,1050,765,1360,1249,1249,876,1518,1386,1599,...,2094,1762,1696,1059,2094,1762,1696,1059,2094,1762
6437,111,1154,506,901,734,1507,638,1253,971,1902,...,1602,1100,1810,612,1389,1005,1489,634,1175,910
409,111,49,71,33,31,49,71,33,31,49,...,282,217,461,267,282,217,461,267,282,217
7223,111,1211,679,1276,1030,1376,754,1423,1178,1562,...,1240,842,1475,659,1274,958,1360,719,1308,1073
830,111,965,527,1054,923,1113,609,1246,1089,1373,...,1997,1637,1696,859,1997,1637,1696,859,1997,1637


In [22]:
X = trainset.loc[:, feature_cols].values
y = trainset.loc[:, "aux_vector_cid_l3"].values

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
clf = RandomForestClassifier()
clf = clf.fit(X, y)



Build and test your custom function.

In [25]:
def predict_extended(df, clf):
    def convert_to_uint8(arr):
        return arr.astype(np.uint8)
    
    dtype_classes = rasterio.dtypes.get_minimum_dtype(clf.classes_)
    
    probs = clf.predict_proba(df.values)
    pred_idx = probs.argmax(axis=1)
    pred = np.zeros_like(pred_idx).astype(dtype_classes)
    for i in range(probs.shape[1]):
        pred[pred_idx == i] = clf.classes_[i]
    # get reliability layers (maximum probability and margin, i.e. maximum probability minus second highest probability)
    probs_sorted = np.sort(probs, axis=1)
    max_prob = probs_sorted[:, probs_sorted.shape[1] - 1]
    margin = (
        probs_sorted[:, probs_sorted.shape[1] - 1] - probs_sorted[:, probs_sorted.shape[1] - 2]
    )

    probs = convert_to_uint8(probs * 100)
    max_prob = convert_to_uint8(max_prob * 100)
    margin = convert_to_uint8(margin * 100)

    ndigits = len(str(max(clf.classes_)))
    prob_names = [f"prob_{cid:0{ndigits}d}" for cid in clf.classes_]
    df_result = pd.concat(
        [
            pd.DataFrame({"pred": pred, "max_prob": max_prob, "margin": margin}),
            pd.DataFrame(probs, columns=prob_names),
        ],
        axis=1,
    )
    return df_result

In [26]:
tmp = df.loc[:, feature_cols].head()
predict_extended(tmp, clf)

Unnamed: 0,pred,max_prob,margin,prob_111,prob_112,prob_121,prob_122,prob_123,prob_124,prob_131,...,prob_321,prob_322,prob_324,prob_331,prob_332,prob_333,prob_411,prob_412,prob_511,prob_512
0,142,20,10,0,0,10,10,10,10,0,...,10,0,0,10,0,0,0,0,0,0
1,331,20,10,0,10,10,0,0,0,0,...,0,0,0,20,0,0,0,10,0,0
2,112,30,9,0,30,20,0,0,0,0,...,0,0,0,20,0,0,0,0,0,0
3,121,20,0,0,0,20,10,0,0,10,...,10,0,0,0,0,0,0,0,0,0
4,112,30,9,10,30,20,0,0,0,10,...,0,0,0,10,0,0,0,0,0,0


In [27]:
df[feature_cols]

Unnamed: 0_level_0,scoll01__vts4w__2018-01-07__NIR,scoll01__vts4w__2018-01-07__Red,scoll01__vts4w__2018-01-07__SWIR1,scoll01__vts4w__2018-01-07__SWIR2,scoll01__vts4w__2018-02-04__NIR,scoll01__vts4w__2018-02-04__Red,scoll01__vts4w__2018-02-04__SWIR1,scoll01__vts4w__2018-02-04__SWIR2,scoll01__vts4w__2018-03-04__NIR,scoll01__vts4w__2018-03-04__Red,scoll01__vts4w__2018-03-04__SWIR1,scoll01__vts4w__2018-03-04__SWIR2,scoll01__vts4w__2018-04-01__NIR,scoll01__vts4w__2018-04-01__Red,scoll01__vts4w__2018-04-01__SWIR1,scoll01__vts4w__2018-04-01__SWIR2,scoll01__vts4w__2018-04-29__NIR,scoll01__vts4w__2018-04-29__Red,scoll01__vts4w__2018-04-29__SWIR1,scoll01__vts4w__2018-04-29__SWIR2,scoll01__vts4w__2018-05-27__NIR,scoll01__vts4w__2018-05-27__Red,scoll01__vts4w__2018-05-27__SWIR1,scoll01__vts4w__2018-05-27__SWIR2,scoll01__vts4w__2018-06-24__NIR,scoll01__vts4w__2018-06-24__Red,scoll01__vts4w__2018-06-24__SWIR1,scoll01__vts4w__2018-06-24__SWIR2,scoll01__vts4w__2018-07-22__NIR,scoll01__vts4w__2018-07-22__Red,scoll01__vts4w__2018-07-22__SWIR1,scoll01__vts4w__2018-07-22__SWIR2,scoll01__vts4w__2018-08-19__NIR,scoll01__vts4w__2018-08-19__Red,scoll01__vts4w__2018-08-19__SWIR1,scoll01__vts4w__2018-08-19__SWIR2,scoll01__vts4w__2018-09-16__NIR,scoll01__vts4w__2018-09-16__Red,scoll01__vts4w__2018-09-16__SWIR1,scoll01__vts4w__2018-09-16__SWIR2,scoll01__vts4w__2018-10-14__NIR,scoll01__vts4w__2018-10-14__Red,scoll01__vts4w__2018-10-14__SWIR1,scoll01__vts4w__2018-10-14__SWIR2,scoll01__vts4w__2018-11-11__NIR,scoll01__vts4w__2018-11-11__Red,scoll01__vts4w__2018-11-11__SWIR1,scoll01__vts4w__2018-11-11__SWIR2,scoll01__vts4w__2018-12-09__NIR,scoll01__vts4w__2018-12-09__Red,scoll01__vts4w__2018-12-09__SWIR1,scoll01__vts4w__2018-12-09__SWIR2
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
0,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16,int16
500000,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3500000,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3585912,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [28]:
from sklearn.model_selection import train_test_split

In [29]:
dst_predictions = dir_parquet = dir_refset / "optimized_refsets" / "aux_vsts_parquet_pred_dummy"

In [192]:
df.loc[:, feature_cols].map_partitions(predict_extended, clf=clf).to_parquet(str(dst_predictions))

In [33]:
list(dst_predictions.glob("*"))

[PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.4.parquet'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.2.parquet'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.5.parquet'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.3.parquet'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/_common_metadata'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.6.parquet'),
 PosixPath('/home/ben/Devel/Projects/classify-hls/data/processed/refset00/optimized_refsets/aux_vsts_parquet_pred_dummy/part.0.parquet'),
 PosixPath('/home/ben/Devel/Proj

In [37]:
df_pred = dd.read_parquet(str(dst_predictions))
df_pred

Unnamed: 0_level_0,pred,max_prob,margin,prob_111,prob_112,prob_121,prob_122,prob_123,prob_124,prob_131,prob_132,prob_133,prob_141,prob_142,prob_211,prob_221,prob_222,prob_231,prob_242,prob_243,prob_311,prob_312,prob_313,prob_321,prob_322,prob_324,prob_331,prob_332,prob_333,prob_411,prob_412,prob_511,prob_512
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
,uint16,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


## APPENDIX

This was an approach to create smaller partitions but it takes ages to build and compute...

    counts_partitions = df_aux.partition.value_counts()
    counts_partitions

    def create_chunk_indices(n_pixels_per_chunk, n_pixels):
        indices_start = list(range(0, n_pixels, n_pixels_per_chunk))
        indices_end = [i for i in indices_start[1::]] + [n_pixels]
        return {"start": indices_start, "end": indices_end}
        print(indices_start)
        print(indices_end)

    chunk_sizes = {tile:create_chunk_indices(300000, counts_partitions[tile]) for tile in tilenames} 
    chunk_sizes

    import numpy as np
    import dask.delayed
    from dask import delayed
    import dask.dataframe as dd

    @delayed
    def _load_column(path, index=None):
        """Load a single dataframe column given a numpy file path."""
        if index is None:
            arr = np.load(str(path), allow_pickle=True)
        else:
            arr = np.load(str(path), mmap_mode="r", allow_pickle=True)[index]
        df = pd.DataFrame(arr)
        df.columns = [path.stem]
        # if index is not None:  ### SET THIS ???
        #     df.index = index
        return df

    @delayed
    def _concat_columns(column_list):
        """Concatenate single dataframe columns."""
        return pd.concat(column_list, axis=1)


    # @delayed
    def load_extracted_dask(npy_path_list, index=None):
        column_list = []
        for npy_path in npy_path_list:
            column_list.append(_load_column(npy_path, index=index))
        df = _concat_columns(column_list)
        df = dd.from_delayed(df)
        return df

    patterns = ["aux_index_global.npy", 
                "aux_vector_pid.npy", 
                "aux_vector_cid_l3.npy", 
                "aux_coord_y.npy", 
                "aux_coord_x.npy"]

    paths_npy = {tile: get_paths_of_extracted(src_dirs_tiles[tile], patterns) for tile in tilenames}

    dfs = []
    for i, tile in enumerate(tilenames):
        print(tile, " - first file")
        chunk_sizes_this_tile = chunk_sizes[tile]
        for start, end in zip(chunk_sizes_this_tile["start"], chunk_sizes_this_tile["end"]):
            print(start, end)
            dfs.append(load_extracted_dask(paths_npy[tile], 
                                           index=list(range(start, end))))

    df_chunked = dd.concat(dfs)
    df_chunked 
    df_chunked.compute()