In [7]:
import pystac_client
import planetary_computer as pc
import pandas as pd
from datetime import datetime, timedelta
from odc.stac import stac_load
import numpy as np
from PIL import Image
import os
from glob import glob
import multiprocessing
from pandarallel import pandarallel
import math

# from dotenv import load_dotenv
# load_dotenv()
# pc.settings.set_subscription_key(os.getenv('PC_SDK_SUBSCRIPTION_KEY'))

# Data-extraction constants
# SIZE: how the bounding box around each field is sized -- 'fixed' or 'adaptative'
SIZE = 'fixed'
# DEGREE: box side (in degrees) used when SIZE == 'fixed'.
# Equals ha_to_degree(2.622685); 2.622685 ha is the mean field size (train + test)
DEGREE = 0.0014589825157734703

In [8]:
def explode_sar(row):
    """Spread the entries of ``row['SAR data']`` onto the row itself.

    Every key of the mapping stored under 'SAR data' becomes a field of
    ``row`` holding the matching value. The row is modified in place and
    returned; the 'SAR data' field itself is left as it was.
    """
    for name, value in row['SAR data'].items():
        row[name] = value
    return row


def ha_to_degree(field_size): # Field_size (ha)
    '''
    Side length, in degrees, of a square covering `field_size` hectares.

    1 ha = 0.01 km2, so the side of the square is sqrt(0.01 * field_size) km.
    Using 1° ~= 111 km, that side is divided by 111 to express it in degrees.
    '''
    return math.sqrt(0.01 * field_size) / 111


def create_folders():
    """Create the output directory for the current SIZE mode and return its path.

    The directory is '../data/processed/fixed_<degree>' when SIZE is 'fixed'
    (<degree> is DEGREE rounded to 5 decimals with '.' replaced by '-') and
    '../data/processed/adaptative' when SIZE is 'adaptative'.

    Returns:
        str: path of the (possibly pre-existing) output directory.

    Raises:
        ValueError: if SIZE is neither 'fixed' nor 'adaptative'. The check
            happens before anything is created on disk.
    """
    if SIZE == 'fixed':
        degree = str(round(DEGREE, 5)).replace(".", "-")
        save_folder = f'../data/processed/fixed_{degree}'
    elif SIZE == 'adaptative':
        save_folder = '../data/processed/adaptative'
    else:
        raise ValueError(f"Unknown SIZE {SIZE!r}; expected 'fixed' or 'adaptative'")

    # makedirs also creates the missing parents, so '../data/processed'
    # does not need a separate call.
    os.makedirs(save_folder, exist_ok=True)
    return save_folder


def band_to_name(band):
    """Map a band code to the readable name used for the output columns.

    'B05', 'B06' and 'B07' become 'rededge1', 'rededge2' and 'rededge3',
    'B11' becomes 'swir'. Any other value is returned unchanged.
    """
    readable = {
        'B05': 'rededge1',
        'B06': 'rededge2',
        'B07': 'rededge3',
        'B11': 'swir',
    }
    return readable.get(band, band)


def get_bbox(longitude, latitude, field_size=None):
    """Return a square bounding box centred on (longitude, latitude).

    The side of the box, in degrees, is DEGREE when SIZE is 'fixed' and
    ha_to_degree(field_size) when SIZE is 'adaptative'.

    Args:
        longitude: longitude of the box centre.
        latitude: latitude of the box centre.
        field_size: field size in hectares; only read (and then required)
            when SIZE is 'adaptative'. Converted with float().

    Returns:
        tuple: (min_longitude, min_latitude, max_longitude, max_latitude).

    Raises:
        ValueError: if SIZE is 'adaptative' and no field_size was given, or
            if SIZE is neither 'fixed' nor 'adaptative'.
    """
    if SIZE == 'fixed':
        degree = DEGREE
    elif SIZE == 'adaptative':
        # The field size has to come from the caller: this function has no
        # access to the dataframe row.
        if field_size is None:
            raise ValueError("get_bbox needs field_size (ha) when SIZE is 'adaptative'")
        degree = ha_to_degree(float(field_size))
    else:
        raise ValueError(f"Unknown SIZE {SIZE!r}; expected 'fixed' or 'adaptative'")

    min_longitude = longitude - degree / 2
    min_latitude = latitude - degree / 2
    max_longitude = longitude + degree / 2
    max_latitude = latitude + degree / 2
    return (min_longitude, min_latitude, max_longitude, max_latitude)


def get_time_period(havest_date, history_days):
    """Build the 'start/end' date range that ends on the harvest date.

    Args:
        havest_date: harvest date as a 'dd-mm-YYYY' string.
        history_days: number of days to go back from the harvest date.

    Returns:
        str: 'YYYY-mm-dd/YYYY-mm-dd', start date first, harvest date last.
    """
    havest_datetime = datetime.strptime(havest_date, '%d-%m-%Y')
    sowing_datetime = havest_datetime - timedelta(days=history_days)
    return f'{sowing_datetime.strftime("%Y-%m-%d")}/{havest_datetime.strftime("%Y-%m-%d")}'


def get_data(bbox, time_period, bands, scale):
    """Search the 'sentinel-2-l2a' collection and load the requested bands.

    Opens the Planetary Computer STAC catalog (items are signed through
    pc.sign_inplace), searches it for the given bbox and datetime range, and
    loads the matching items with stac_load in EPSG:4326.

    Args:
        bbox: (min_lon, min_lat, max_lon, max_lat) search/load extent.
        time_period: datetime range string passed to the STAC search.
        bands: list of band names to load.
        scale: value passed as stac_load's `resolution`.

    Returns:
        Whatever stac_load returns (the caller indexes it by band name and
        reads its `time` coordinate).
    """
    catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1", modifier=pc.sign_inplace)
    search = catalog.search(collections=["sentinel-2-l2a"], bbox=bbox, datetime=time_period)
    items = search.item_collection()
    data = stac_load(items, bands=bands, crs="EPSG:4326", resolution=scale, bbox=bbox)
    return data


def save_data(row, df=None, history_days=130, history_dates=24, resolution=10):
    """Fetch per-date band means around one field, newest acquisition first.

    Args:
        row: mapping/Series with 'Longitude', 'Latitude' and
            'Date of Harvest' ('dd-mm-YYYY'); 'Field size (ha)' is read too
            when SIZE is 'adaptative'.
        df: unused; kept (now optional) so existing callers keep working.
        history_days: length, in days, of the window ending at the harvest date.
        history_dates: maximum number of most recent time steps to summarise.
        resolution: divided by 111320.0 to get the value passed to get_data
            as `scale` (presumably metres -> degrees; confirm).

    Returns:
        list[dict]: one dict per time step, starting from the latest one,
        with a 'date' ('dd-mm-YYYY') entry and one np.nanmean value per band
        (keys renamed through band_to_name). The list is shorter than
        `history_dates` when fewer time steps were loaded.
    """
    scale = resolution / 111320.0
    bands = ['red', 'green', 'blue', 'B05', 'B06', 'B07', 'nir', 'B11']

    longitude = row['Longitude']
    latitude = row['Latitude']
    # Only touch the field-size column in adaptive mode so that fixed-size
    # runs do not require it to exist.
    field_size = row['Field size (ha)'] if SIZE == 'adaptative' else None
    bbox = get_bbox(longitude, latitude, field_size)

    havest_date = row['Date of Harvest']
    time_period = get_time_period(havest_date, history_days)

    data = get_data(bbox, time_period, bands, scale)

    # Never index past the number of time steps actually loaded; negative
    # indexing beyond it would raise IndexError.
    available_dates = min(history_dates, len(data.time))

    sar_data_list = []
    for i in range(1, available_dates + 1):
        timestamp = data.time[-i].values
        sar_data_dict = {'date': pd.to_datetime(timestamp).strftime('%d-%m-%Y')}

        for band in bands:
            array = data[band][-i].to_numpy()
            sar_data_dict[band_to_name(band)] = np.nanmean(array)

        sar_data_list.append(sar_data_dict)

    return sar_data_list


def make_data(path, save_folder):
    """Enrich one CSV with per-date band values and write it to `save_folder`.

    Reads the CSV at `path`, calls save_data on every row in parallel, turns
    each returned list entry into its own row (explode), spreads the
    per-date dicts into columns with explode_sar, drops the helper
    'SAR data' column and saves the result under the same file name in
    `save_folder`. Progress messages are printed along the way.
    """
    file_name = path.split("/")[-1]

    table = pd.read_csv(path)
    print(f'\nRetrieve SAR data from {file_name}...')
    table['SAR data'] = table.parallel_apply(lambda row: save_data(row, table), axis=1)

    # One output row per (field, date) pair.
    table = table.explode('SAR data')
    print(f'\nExplode SAR data from {file_name}...')
    table = table.parallel_apply(explode_sar, axis=1).drop(columns='SAR data')

    print(f'\nSave SAR data from {file_name}...')
    table.to_csv(f'{save_folder}/{file_name}', index=False)
    print(f'\nSAR data from {file_name} saved!')

In [9]:
# Start the pandarallel workers, prepare the output folder, then process
# the train and test CSVs in that order.
pandarallel.initialize(progress_bar=True, nb_workers=16)
save_folder = create_folders()

train_path = '../data/raw/train.csv'
test_path = '../data/raw/test.csv'

for csv_path in (train_path, test_path):
    make_data(csv_path, save_folder)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

Retrieve SAR data from train.csv...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=18), Label(value='0 / 18'))), HBox…


Explode SAR data from train.csv...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=418), Label(value='0 / 418'))), HB…


Save SAR data from train.csv...

Retrieve SAR data from test.csv...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…


Explode SAR data from test.csv...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=75), Label(value='0 / 75'))), HBox…


Save SAR data from test.csv...
