#### [00] Importing packages

Import the packages used for acquiring the sample data.

In [1]:
import pandas as pd 
import numpy as np

import geopandas as gpd
import rasterio 
from rasterstats import zonal_stats

from multiprocessing import Pool,Process

import gcsfs

from glob import glob

import pathlib

import warnings
warnings.filterwarnings("ignore")

from omegaconf import DictConfig,OmegaConf

#### To change the PIC, edit `config/config.yaml`

In [2]:
# Load the notebook configuration from disk.
conf = OmegaConf.load('config/config.yaml')

# pic_ picks which sample file is read and names the exported CSV.
config_section_ = conf['config']
pic_ = config_section_['pic_']

#### [01] Read data

In [3]:
# Read this PIC's sample polygons, reproject them to EPSG:4326 and keep only
# the columns used downstream.  Replace the path with your own gpkg file.
sample_path_ = "gs://bps-gcp-bucket/MLST2023/sample/"+pic_+"_sample2022_edit.gpkg"
sample_cols_ = ['ID_GRID','geometry','kdPIC','nama_valid','sub_grid']
gdf_ = gpd.read_file(sample_path_).to_crs("EPSG:4326")[sample_cols_]

# Store each polygon's centroid coordinates as plain columns.
centroids_ = gdf_.geometry.centroid
gdf_['cen_x'] = centroids_.x
gdf_['cen_y'] = centroids_.y
gdf_.head(2)

Unnamed: 0,ID_GRID,geometry,kdPIC,nama_valid,sub_grid,cen_x,cen_y
0,ID-3043,"POLYGON ((114.48113 -8.11080, 114.48131 -8.110...",F,100201.0,ID-3043,114.481219,-8.110705
1,ID-3043,"POLYGON ((114.45973 -8.11225, 114.45991 -8.112...",F,100201.0,ID-3043,114.459815,-8.112159


In [5]:
# Group the sample polygons by sub-grid; every group key becomes one task.
group_gdf_ = gdf_.groupby('sub_grid')
y_ = list(group_gdf_.groups)

In [7]:
# Skip sub-grids whose result file already exists so that an interrupted run
# can be resumed.  Result files are named 'calc_<sub_grid>.csv'.
list_file_done = []
for done_path_ in glob('ml_output/01_acquisition_data/sample_calc/*.csv'):
    # .stem drops the directory and the '.csv' suffix regardless of how deep
    # the path is or which separator the platform uses.
    done_name_ = pathlib.Path(done_path_).stem
    # Strip only the leading 'calc_' prefix, never a match inside the name.
    if done_name_.startswith('calc_'):
        done_name_ = done_name_[len('calc_'):]
    list_file_done.append(done_name_)

done_lookup_ = set(list_file_done)
y_ = [u for u in y_ if u not in done_lookup_]
print('Total file undone: ', len(y_))
print(y_)

Total file undone:  14
['ID-3496', 'ID-3497_1', 'ID-3964', 'ID-3965', 'ID-3981', 'ID-4092', 'ID-4215', 'ID-4262', 'ID-4311', 'ID-4362', 'ID-4363', 'ID-4696', 'ID-4921', 'ID-5122']


#### [02] Define user-specified inputs

Specify the `kdPIC` codes and the bands for which statistics will be calculated.

In [8]:
# Distinct PIC codes present in the sample.
kode_pic_ = gdf_['kdPIC'].unique()

# Output column names in raster band order: raster band k is stored under
# band_[k-1].  NOTE(review): the _p15/_p50 suffixes presumably denote
# percentile composites - confirm against the raster export.
band_ = [
    'B1_p15', 'B2_p15', 'B3_p15', 'B4_p15',
    'B5_p15', 'B6_p15', 'B7_p15', 'B8_p15',
    'B8A_p15', 'B11_p15', 'B12_p15',
] + [
    'NDVI_p50', 'NDWI_p50', 'NDBI_p50',
    'SAVI_p50', 'EVI_p50', 'GNDVI_p50',
]

Define helper functions that build the raster file name and compute zonal statistics from that raster.

In [9]:
def get_name(grid, pic):
    """Build the raster file name and its full bucket path for one grid.

    Parameters
    ----------
    grid : str
        Grid identifier embedded in the file name.
    pic : str
        PIC code used as the folder name inside the bucket.

    Returns
    -------
    tuple of (str, str)
        ``(filename, filename_full)``: the bare file name and the complete
        ``gs://`` path.
    """
    filename = "_".join(("duatahun", grid, "QALPN1_PakKus_sentinel2_10m.tif"))
    filename_full = "/".join(("gs://bps-gcp-bucket/citra-sentinel2", pic, filename))
    return filename, filename_full


def get_index(gdf_, filename_full):
    """Compute the per-polygon mean of every raster band and attach it to the rows.

    Parameters
    ----------
    gdf_ : GeoDataFrame
        Polygons to summarise; its ``geometry`` column is passed to
        ``zonal_stats``.
    filename_full : str
        Path of the raster to read.

    Returns
    -------
    DataFrame
        ``gdf_`` with its index reset (the previous index is kept as a
        column), followed by one mean-value column per entry of the
        module-level ``band_`` list.
    """
    band_frames = []
    # Raster bands are 1-based: band number k is stored under band_[k-1].
    # Iterating band_ keeps the band count and the column names in sync.
    for band_no, band_name in enumerate(band_, start=1):
        band_stats = zonal_stats(gdf_.geometry, filename_full,
                                 stats='mean', band=band_no, all_touched=True)
        band_frames.append(
            pd.DataFrame(band_stats).rename(columns={"mean": band_name}))

    # One side-by-side concat instead of growing a frame inside the loop.
    res = pd.concat(band_frames, axis=1)
    # reset_index aligns the input rows with the positional stats rows.
    result_ = pd.concat([gdf_.reset_index(), res], axis=1)
    return result_

def split_dataframe(df, chunk_size = 2000):
    """Split ``df`` into consecutive slices of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : DataFrame (or any sliceable sequence with ``len``)
        Data to split.
    chunk_size : int, default 2000
        Maximum number of rows per chunk.

    Returns
    -------
    list
        The slices in their original order.  No empty trailing chunk is
        produced when ``len(df)`` is an exact multiple of ``chunk_size``,
        and empty input yields an empty list.
    """
    # Stepping over the start offsets gives exactly ceil(len/chunk_size) chunks.
    return [df[start:start + chunk_size]
            for start in range(0, len(df), chunk_size)]

def parallel_get_index(i):
    """Compute band statistics for one sub-grid and write them to its result CSV.

    Used as the worker function for ``Pool.map``.  Reads the module-level
    ``gdf_`` and ``group_gdf_``.

    Parameters
    ----------
    i : str
        Sub-grid key, i.e. a key of ``group_gdf_.groups``.

    Returns
    -------
    None
        The result is written, semicolon-separated, to
        ``ml_output/01_acquisition_data/sample_calc/calc_<i>.csv``.
    """
    # groupby(...).groups maps each key to index *labels*, so select with
    # .loc; .iloc is only equivalent while gdf_ has a default RangeIndex.
    te_ = gdf_.loc[group_gdf_.groups[i]]
    p = te_.kdPIC.unique()[0]
    _, filename_full = get_name(i, p)

    # Process the polygons chunk by chunk.
    chunk_results = [get_index(chunk, filename_full)
                     for chunk in split_dataframe(te_)]
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    # Later chunks go first, matching the previous prepend-style row order.
    temp = pd.concat(chunk_results[::-1])

    out_cols = ['ID_GRID','sub_grid','kdPIC','nama_valid',
                'B1_p15','B2_p15','B3_p15','B4_p15','B5_p15','B6_p15','B7_p15',
                'B8_p15','B8A_p15','B11_p15','B12_p15',
                'NDVI_p50','NDWI_p50','NDBI_p50','SAVI_p50','EVI_p50','GNDVI_p50',
                'cen_x','cen_y']
    temp[out_cols].to_csv(
        'ml_output/01_acquisition_data/sample_calc/calc_'+i+'.csv', sep=';')

#### [03] Parallel processing for acquiring data

In [10]:
# Process every pending sub-grid in its own worker task.  The pool is capped
# at the number of pending tasks so no idle processes are spawned.
# NOTE(review): workers read gdf_/group_gdf_ from module globals, which
# presumably relies on the 'fork' start method - confirm on the target platform.
if y_:
    with Pool(min(28, len(y_))) as p:
        p.map(parallel_get_index, y_)

#### [04] Exporting data

In [11]:
# Merge all per-sub-grid result files into one table and upload it.
export_cols_ = ['ID_GRID','sub_grid','kdPIC','nama_valid',
                'B1_p15','B2_p15','B3_p15','B4_p15','B5_p15','B6_p15','B7_p15',
                'B8_p15','B8A_p15','B11_p15','B12_p15',
                'NDVI_p50','NDWI_p50','NDBI_p50','SAVI_p50','EVI_p50','GNDVI_p50',
                'cen_x','cen_y']

# glob order is arbitrary; sorting makes the exported row order reproducible.
list_data_ = sorted(glob('ml_output/01_acquisition_data/sample_calc/*.csv'))

# Read every file once and concatenate in a single call; DataFrame.append
# no longer exists in pandas 2.x and growing a frame in a loop is quadratic.
frames_ = [pd.read_csv(path_, sep=';')[export_cols_] for path_ in list_data_]
if frames_:
    df_ = pd.concat(frames_)
else:
    # No result files: export a header-only table rather than failing.
    df_ = pd.DataFrame(columns=export_cols_)
df_.to_csv("gs://bps-gcp-bucket/MLST2023/sample/sample_"+str(pic_) + ".csv",index=False)

In [12]:
# Delete every per-sub-grid result file listed in list_data_.
for csv_path_ in map(pathlib.Path, list_data_):
    csv_path_.unlink()