In [25]:
import os
import math
from tqdm import tqdm
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import xarray as xr

import pystac_client
import planetary_computer as pc
from odc.stac import stac_load

import multiprocessing as mp
from random import uniform, random


# Make data constants
SIZE = 'adaptative' # 'fixed'
FACTOR = 1 # for 'adaptative' 
NUM_AUGMENT = 2
MAX_AUGMENT = 5
DEGREE = 0.0014589825157734703 # = ha_to_degree(2.622685) # Field size (ha) mean = 2.622685 (train + test)

dict_band_name = {
    'B05': 'rededge1',
    'B06': 'rededge2',
    'B07': 'rededge3',
    'B11': 'swir'
}


def ha_to_degree(field_size): # Field_size (ha)
    ''' 
    1° ~= 111km
    1ha = 0.01km2
    then, side_size = sqrt(0.01 * field_size) (km)
    so, degree = side_size / 111 (°)
    '''
    side_size = math.sqrt(0.01 * field_size) 
    degree = side_size / 111
    return degree


def create_folders() -> str:
    if NUM_AUGMENT > 1:
        save_folder = f'../data/processed/augment_{NUM_AUGMENT}_{MAX_AUGMENT}'
    elif SIZE == 'fixed':
        degree = str(round(DEGREE, 5)).replace(".", "-")
        save_folder = f'../data/processed/fixed_{degree}'
    elif SIZE == 'adaptative':
        save_folder = f'../data/processed/adaptative_factor_{FACTOR}'
        
    os.makedirs(save_folder, exist_ok=True)
    return save_folder


def get_factors():
    factors = []
    for _ in range(4):
        factor = uniform(1, MAX_AUGMENT)
        if random() < 0.5: factor = 1 / factor
        factors.append(factor)

    return factors


def get_bbox(longitude, latitude, field_size):
    if SIZE == 'fixed':
        degree = DEGREE
    elif SIZE == 'adaptative':
        degree = ha_to_degree(field_size) * FACTOR
        
    length = degree / 2
    factors = get_factors()
    min_longitude = longitude - factors[0] * length
    min_latitude = latitude - factors[1] * length
    max_longitude = longitude + factors[2] * length
    max_latitude = latitude + factors[3] * length
    
    return (min_longitude, min_latitude, max_longitude, max_latitude)

def get_time_period(havest_date: str, history_days: int)->str:
    havest_datetime = datetime.strptime(havest_date, '%d-%m-%Y')
    sowing_datetime = havest_datetime - timedelta(days=history_days)
    return f'{sowing_datetime.strftime("%Y-%m-%d")}/{havest_datetime.strftime("%Y-%m-%d")}'


def get_data(bbox, time_period: str, bands: list[str], scale: float):
    catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1", modifier=pc.sign_inplace)
    search = catalog.search(collections=["sentinel-2-l2a"], bbox=bbox, datetime=time_period)
    items = search.item_collection()
    data = stac_load(items, bands=bands, crs="EPSG:4326", resolution=scale, bbox=bbox)
    return data


def process_data(xds: xr.Dataset, row: pd.Series, history_dates:int)->xr.Dataset:
    xds = xds.drop(['spatial_ref', 'SCL'])
    xds = xds.mean(dim=['latitude', 'longitude'], skipna=True)
    xds = xds.sortby('time', ascending=False)
    xds = xds.isel(time=slice(None, history_dates))
    xds['time'] = xds['time'].dt.strftime("%Y-%m-%d")
    xds['state_dev'] =  ('time', np.arange(history_dates)[::-1])
    xds = xds.swap_dims({'time': 'state_dev'})
    xds = xds.rename_vars(dict_band_name)
    xds = xds.expand_dims({'ts_id': 1, 'ts_obs': 1})
    xds['ts_obs'] = [row.name]
    return xds


def save_data(row, history_days, history_dates, resolution):
    scale = resolution / 111320.0
    bands = ['red', 'green', 'blue', 'B05', 'B06', 'B07', 'nir', 'B11', 'SCL']
    
    longitude = row['Longitude']
    latitude = row['Latitude']
    field_size = float(row['Field size (ha)'])
    bbox = get_bbox(longitude, latitude, field_size)

    havest_date = row['Date of Harvest']
    time_period = get_time_period(havest_date, history_days)
    
    data = get_data(bbox, time_period, bands, scale)

    cloud_mask = ((data.SCL != 0) & 
                  (data.SCL != 1) & 
                  (data.SCL != 3) & 
                  (data.SCL != 6) & 
                  (data.SCL != 8) & 
                  (data.SCL != 9) & 
                  (data.SCL != 10))

    data = data.where(cloud_mask)
    data = process_data(data, row, history_dates)
    
    return data


def save_data_app(index_row, history_days=130, history_dates=24, resolution=10):
    for _ in range(index_row[0]):
        data = save_data(index_row[1][1], history_days, history_dates, resolution)
    return data


def get_index_count(df: pd.DataFrame, path: str)->pd.Index:
    list_data = []
    index_count = df.index.value_counts().sort_index(ascending=True) + NUM_AUGMENT - 1

    if os.path.exists(path=path):
        xdf = xr.open_dataset(path, engine='scipy')
        unique, counts = np.unique(xdf['ts_obs'].values, return_counts=True)
        index_count -= pd.Series(counts, index=unique).sort_index(ascending=True)
        list_data.append(xdf)
    
    index_count = index_count[index_count != 0]
    return index_count, list_data


def make_data(path, save_folder, augment):
    save_file = f'{save_folder}/{path.split("/")[-1].split(".")[0]}.nc'

    df: pd.DataFrame = pd.read_csv(path)
    df.index.name = 'ts_obs'
    # df.reset_index(inplace=True)
    # df.index.name = 'ts_id'

    index_count, list_data = get_index_count(df, save_file)
    df = df.loc[index_count.index]
    # print(f'\nRetrieve SAR data from {path.split("/")[-1]}...')
    # try:
        # with mp.Pool(8) as pool:
    for data in tqdm(zip(index_count[:2], df.iloc[:2].iterrows())):
        data = save_data_app(data)
        return data
        # list_data.append(data)
    # except:
    #     "Error occure during the data retrieval."
    # finally:
    #     data = xr.concat(list_data, dim='ts_obs')
    #     data = data.merge(df.to_xarray())

    # print(f'\nSave SAR data from {path.split("/")[-1]}...')
    # data.to_netcdf(save_file, engine='scipy')
    # print(f'\nSAR data from {path.split("/")[-1]} saved!')

In [26]:
save_folder = create_folders()
train_path = '../../data/raw/train.csv'
data = make_data(train_path, save_folder, augment=NUM_AUGMENT)

0it [00:00, ?it/s]

In [24]:
data[0]

10

In [2]:
path = '../../data/raw/train.csv'
df = pd.read_csv(path)

# data, data_filter = save_data_app((df.iloc[0].index, df.iloc[0]))
data, data_filter = save_data_app((df.iloc[0].index, df.iloc[0]))
# data_2 = save_data_app((df.iloc[1].index, df.iloc[30]))
# data_3 = save_data_app((df.iloc[1].index, df.iloc[50]))

In [3]:
data

In [400]:
data.to_netcdf('./test.nc', engine='scipy')

In [368]:
xdf = data.copy(deep=True)
xdf = xdf.drop(['spatial_ref', 'SCL'])
xdf = xdf.mean(dim=['latitude', 'longitude'], skipna=True)
xdf = xdf.sortby('time', ascending=False)
xdf = xdf.isel(time=slice(None, history_dates))
xdf['time'] = xdf['time'].dt.strftime("%Y-%m-%d")
xdf['state_dev'] =  ('time', np.arange(history_dates)[::-1])
xdf = xdf.swap_dims({'time': 'state_dev'})
xdf = xdf.rename_vars(dict_band_name)
df_t = pd.DataFrame([df.iloc[0]]*history_dates, index=xdf.indexes['state_dev'])
xdf = xdf.merge(df_t.to_xarray())

In [387]:
xdf

In [386]:
xdf_concat.mean('ts_id', skipna=True).to_netcdf('./test.nc')

In [349]:
xdf_concat.fillna(xdf_concat.mean('ts_id', skipna=True))

In [278]:
# index = pd.MultiIndex.from_arrays([xdf['time'].values, np.arange(0, xdf['time'].shape[0])], names=['time', 'state_dev'])
# xdf.assign_coords({'time': index})

ValueError: conflicting multi-index level name 'time' with dimension 'time'

In [136]:
fig = px.line(y=[(data_filter.nir - data_filter.red) / (data_filter.nir + data_filter.red), (data.nir - data.red) / (data.nir + data.red)], x=data.time)
fig.show()

Fill using mean of diff between data and filtered data adding to data to obtain filtered data 

In [132]:
test = (data[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']] - data_filter[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']]).mean()
df_test = data_filter.copy(deep=True)

df_test.loc[:, ['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']] = df_test[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']].fillna(data[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']] - test)

In [135]:
fig = px.line(y=[(df_test.nir - df_test.red) / (df_test.nir + df_test.red), (data.nir - data.red) / (data.nir + data.red)], x=df_test.time)
fig.show()

Fill using pandas interpolation

In [155]:
df_test = data_filter.copy(deep=True)

df_test.loc[:, ['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']] = df_test[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']].interpolate(method="linear", axis="index", limit=4, limit_direction='both')

In [157]:
fig = px.line(y=[(df_test.nir - df_test.red) / (df_test.nir + df_test.red), (data.nir - data.red) / (data.nir + data.red)], x=df_test.time)
fig.show()

Smooth the data and fill nan on filter data

In [172]:
from scipy.signal import savgol_filter

df_test = data_filter.copy(deep=True)

arr = savgol_filter(data[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']], axis=0, window_length=12, polyorder=4)
test = pd.DataFrame(arr, index=df_test.index, columns=['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir'])

df_test.loc[:, ['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']] = df_test[['red','green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']].fillna(test)

In [175]:
fig = px.line(y=[(df_test.nir - df_test.red) / (df_test.nir + df_test.red), (data.nir - data.red) / (data.nir + data.red)], x=df_test.time)
fig.show()

Use mean value of the same time index

In [215]:
def make_timeseries(df):
    df['timeserieindex'] = df['District'].copy(deep=True)
    for col in ['Latitude', 'Longitude', 'Season(SA = Summer Autumn, WS = Winter Spring)', 'Rice Crop Intensity(D=Double, T=Triple)', 'Date of Harvest', 'Field size (ha)', 'Rice Yield (kg/ha)']:
        df['timeserieindex'] += ' ' + df[col].astype('str')

    df['timeserieindex'] = pd.factorize(df['timeserieindex'].astype('str'))[0]
    return df

In [8]:
import xarray as xr
xr.open_dataset('../../data/processed/adaptative_factor_2/train_filter_processed.nc', engine='scipy')
# df = pd.read_csv('../../data/processed/adaptative_factor_3/train.csv')

In [219]:
df = make_timeseries(df)
xarray.DataArray(df, coords=['place', 'index', 'bands'], )

In [1]:
test = [345,
376,
401,
412,
420,
277,
59,
298,
185,
422,
464,
150,
233,
349,
156,
445,
164,
116,
20,
403,
76,
505,
109,
288,
89,
91,
433,
203,
122,
393,
84,
323,
110,
219,
437,
318,
112,
347,
130,
284,
128,
52,
481,
96,
461,
443,
141,
158,
194,
427,
357,
269,
47,
223,
296,
324,
436,
132,
419,
367,
471,
429,
550,
345,
90,
310,
252,
186,
88,
83,
241,
547,
430,
106,
187,
184,
157,
27,
160,
104,
272,
396,
480,
314,
103,
144,
1,
145,
462,
316,
487,
351,
211,
152,
536,
134,
494,
534,
372,
250,
169,
175,
350,
458,
531,
221,
167,
411,
73,
477,
176,
417,
523,
93,
381,
263,
18,
431,
265,
139,
509,
360,
504,
432,
7,
450,
383,
114,
322,
319,
399,
133,
389,
301,
338,
147,
507,
16,
545,
454,
244,
70,
416,
183,
455,
409,
392,
425,
214,
395,
465,
124,
29,
151,
470,
370,
126,
491,
510,
410,
238,
522,
498,
460,
343,
467,
386,
13,
67,
353,
135,
259,
336,
434,
228,
86,
171,
60,
142,
38,
94,
68,
123,
541,
486,
542,
439,
201,
163,
524,
413,
485,
220,
92,
254,
334,
426,
435,
453,
46,
521,
26,
500,
207,
146,
196,
193,
441,
229,
32,
497,
446,
289,
315,
384,
278,
405,
105,
339,
22,
489,
138,
483,
546,
418,
190,
346,
191,
295,
528,
230,
369,
362,
475,
237,
217,
514,
490,
98,
100,
335,
373,
159,
364,
236,
181,
0,
166,
282,
526,
320,
337,
332,
213,
552,
328,
512,
117,
243,
378,
519,
140,
466,
206,
51,
294,
371,
492,
9,
554,
479,
271,
516,
97,
375,
28,
77,
270,
515,
553,
356,
131,
503,
304,
11,
21,
469,
15,
390,
174,
234,
208,
377,
379,
99,
43,
488,
78,
267,
231,
438,
111,
482,
538,
143,
224,
81,
14,
311,
391,
283,
232,
246,
198,
37,
179,
473,
58,
226,
204,
448,
79,
539,
548,
161,
397,
388,
40,
253,
506,
456,
101,
209,
474,
321,
210,
382,
136,
50,
459,
472,
212,
66,
281,
457,
359,
30,
48,
153,
532,
290,
189,
402,
199,
476,
49,
529,
23,
285,
257,
95,
463,
274,
300,
537,
517,
549,
421,
317,
297,
119,
129,
155,
276,
331,
468,
188,
352,
279,
513,
72,
85,
556,
452,
172,
404,
354,
137,
406,
62,
165,
361,
56,
249,
280,
2,
292,
264,
340,
35,
348,
178,
34,
428,
449,
266,
451,
173,
177,
330,
127,
302,
442,
385,
69,
268,
10,
329,
222,
33,
154,
182,
120,
341,
484,
245,
12,
307,
121,
102,
75,
502,
325,
53,
511,
162,
4,
273,
255,
251,
31,
496,
518,
45,
501,
407]

In [9]:
import torch

class DLDataset(Dataset):
    def __init__(self, weather, vi, df, test=False, times=120):
        self.weather = weather
        self.vi = vi
        self.df = df
        self.test = test
        self.times = times

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        district = row['District']
        latitude = row['Latitude']
        longitude = row['Longitude']
        date_of_harvest = row['Date of Harvest']
        
        vi = self.vi[(self.vi['District'] == district) &
                     (self.vi['Latitude'] == latitude) &
                     (self.vi['Longitude'] == longitude) &
                     (self.vi['Date of Harvest'] == date_of_harvest)]
        
        vi['date'] = pd.to_datetime(vi['date'], format='%d-%m-%Y')
        all_dates = pd.date_range(vi['date'].min(), vi['date'].max(), freq='d').strftime('%d-%m-%Y')
        all_dates = all_dates.tolist()[-self.times:]
        
        weather = self.weather[(self.weather['name'] == district) &
                               (self.weather['datetime'].isin(all_dates))]
        
        weather['datetime'] = pd.to_datetime(weather['datetime'], format='%d-%m-%Y')
        
        vi = vi.sort_values('date').reset_index(drop=True)
        not_vi_columns = ['District', 'Latitude', 'Longitude', 'Date of Harvest', 'date']
        vi = vi.drop(columns=not_vi_columns)
        s_input = torch.tensor(vi.values, dtype=torch.float)
        
        weather = weather.sort_values('datetime').reset_index(drop=True)
        not_weather_columns = ['name', 'datetime']
        weather = weather.drop(columns=not_weather_columns)
        m_input = torch.tensor(weather.values, dtype=torch.float)
        
        g_columns = ['Rice Crop Intensity(D=Double, T=Triple)', 'Field size (ha)']
        g_input = torch.tensor(row[g_columns].astype('float64').values, dtype=torch.float)
        
        if self.test:
            label = row['Predicted Rice Yield (kg/ha)']
        else:
            label = row['Rice Yield (kg/ha)']
        
        item = {
            'district': district, 
            'latitude': latitude, 
            'longitude': longitude, 
            'date_of_harvest': date_of_harvest,
            's_input': s_input,
            'm_input': m_input,
            'g_input': g_input,
            'labels': label
        }
        
        return item
    


3
5
6
8
17
19
24
25
36
39
41
42
44
54
55
57
61
63
64
65
71
74
80
82
87
107
108
113
115
118
125
148
149
168
170
180
192
195
197
200
202
205
215
216
218
225
227
235
239
240
242
247
248
256
258
260
261
262
275
286
287
291
293
299
303
305
306
308
309
312
313
326
327
333
342
344
355
358
363
365
366
368
374
380
387
394
398
400
408
414
415
423
424
440
444
447
478
493
495
499
508
520
525
527
530
533
535
540
543
544
551
555
