In [38]:
import pandas as pd
import numpy as np
import os
import sys
import torch
sys.path.append('../../')
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from src.data.data_utils import *
from src.data.dataset import *

In [51]:
# Input folder holding the per-country S1 CSV exports, and output folder for the dataset files
data_root = '/scratch/bbug/ayang1/raw_data/lucas/s1_lucas_2018'
save_path = '/scratch/bbug/ayang1/datasets/lucas_dataset_large/vh_vv_full'

# Collect ee generated time series data: one CSV per country, all sharing one naming scheme
ds = [
    f'S1_point_10days_10m_1Jan-31Dec_{country}_ratio-db.csv'
    for country in (
        'Ireland',
        'Italy',
        'Netherlands',
        'Portugal',
        'Belgium',
        'Bulgaria',
        'Hungary',
        'Slovenia',
    )
]

In [None]:
# LUCAS labels table (filtered 2018 survey CSV)
labels = pd.read_csv('/scratch/bbug/ayang1/raw_data/lucas/lucas_2018/copernicus_filtered/lucas_2018_filtered.csv')

# Read every CSV listed in `ds` from `data_root`, then stack the frames row-wise.
# The original row index of each file is kept (no ignore_index).
dfs = [pd.read_csv(os.path.join(data_root, path)) for path in ds]
data = pd.concat(dfs)

In [None]:
# Add labels to signals based on point id (the matching itself happens inside add_lucas_labels)
crop_data = add_lucas_labels(data, labels)
# The 'system:index' column is not needed downstream
crop_data.drop(columns='system:index', inplace=True)
# Keep only the rows whose LABEL is something other than 'NOT_CROP'
crop_data = crop_data.loc[crop_data['LABEL'].ne('NOT_CROP')]

In [None]:
# Remove every class that has fewer than 1000 samples.
# `counts` is the (unique labels, occurrences) pair returned by np.unique.
counts = np.unique(crop_data['LABEL'], return_counts=True)
to_drop = [label for label, n_samples in zip(*counts) if n_samples < 1000]
crop_data = crop_data.loc[~crop_data['LABEL'].isin(to_drop)]

print('Dropped classes:', to_drop)

In [41]:
# Reload the table from 'lucas_large.csv' in `save_path` and discard the
# 'Unnamed: 0' and 'POINT_ID' columns, leaving signal and label columns only.
# NOTE(review): this replaces any `crop_data` built in earlier cells; the step that
# writes 'lucas_large.csv' is not shown in this notebook - confirm where it is produced.
crop_data = pd.read_csv(os.path.join(save_path, 'lucas_large.csv')).drop(
    columns=['Unnamed: 0', 'POINT_ID']
)

In [42]:
# Preview the first rows of the reloaded table to check the column layout
crop_data.head()

Unnamed: 0,VHVV_20180101,VHVV_20180111,VHVV_20180121,VHVV_20180131,VHVV_20180210,VHVV_20180220,VHVV_20180302,VHVV_20180312,VHVV_20180322,VHVV_20180401,...,VV_20181018,VV_20181028,VV_20181107,VV_20181117,VV_20181127,VV_20181207,VV_20181217,country,LABEL,level1_label
0,0.125109,0.230073,0.177701,0.12745,0.135653,0.119989,0.147544,0.168147,0.102177,0.15739,...,-11.19083,-12.11418,-10.784936,-11.45184,-13.372685,-14.198997,-11.428949,Ireland,B13,1
1,0.200931,0.259788,0.167076,0.24068,0.189085,0.191945,0.293391,0.200305,0.295674,0.238959,...,-11.012028,-10.562534,-9.808117,-12.352945,-12.027524,-13.005736,-11.338793,Ireland,B13,1
2,0.176189,0.230273,0.123326,0.161628,0.16033,0.16222,0.252259,0.183998,0.242298,0.159819,...,-10.563501,-10.399241,-9.947539,-11.762892,-12.379931,-12.977741,-11.503675,Ireland,B13,1
3,0.176559,0.22715,0.124222,0.170073,0.156813,0.137737,0.178656,0.181782,0.187252,0.16833,...,-10.436569,-10.491647,-10.352579,-11.613546,-12.566136,-13.436553,-11.531074,Ireland,B13,1
4,0.169583,0.223531,0.145587,0.159445,0.165361,0.163532,0.173392,0.127245,0.154878,0.173577,...,-10.820527,-10.832357,-11.041098,-11.660205,-12.633287,-13.149514,-12.314212,Ireland,B13,1


In [43]:
# Turn the table into numpy arrays: two 36-step channels plus one label per row.
# Column positions are hard-coded: 36-71 -> vh, 72-107 -> vv, 109 -> labels
# (presumably matching the column order shown by crop_data.head() - verify if the CSV changes).
arr = crop_data.to_numpy()
# vhvv = arr[:, np.newaxis, 0:36]
vh = np.expand_dims(arr[:, 36:72], axis=1).astype(np.float32)
vv = np.expand_dims(arr[:, 72:108], axis=1).astype(np.float32)
labels = arr[:, 109].astype(str)

# Stack the two channels along axis 1 -> shape (n_samples, 2, 36)
data = np.concatenate((vh, vv), axis=1)

In [44]:
# Generate masks
# Each sample gets a (2, seq_len) mask of ones whose tail, from a random cut-off
# index onward, is set to zero. The cut-offs come from a seeded local generator so
# the masks are identical on every run (the split below is also pinned to seed 42).
MASK_SEED = 42
seq_len = vv.shape[2]
mask_rng = np.random.default_rng(MASK_SEED)

# One cut-off per sample, drawn uniformly from [0, seq_len); a cut-off of 0 zeroes
# the whole mask, exactly as the original np.random.randint(0, seq_len) range allowed.
cutoffs = mask_rng.integers(0, seq_len, size=data.shape[0])

masks = []
for cutoff in cutoffs:
    mask = np.ones((2, seq_len))
    mask[:, cutoff:] = 0
    masks.append(mask)
    

In [45]:
# First split: hold out 30% of all samples as the test set.
train_signals, test_signals, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
)
# Second split: take 10% of the remaining training samples as the validation set.
train_signals, val_signals, train_labels, val_labels = train_test_split(
    train_signals,
    train_labels,
    test_size=0.1,
    random_state=42,
)

In [50]:
# Cut `masks` into consecutive train / val / test chunks sized like the signal splits.
train_end = train_signals.shape[0]
val_end = train_end + val_signals.shape[0]

train_masks = np.array(masks[:train_end])
val_masks = np.array(masks[train_end:val_end])
test_masks = np.array(masks[val_end:])

# Every split must hold the same number of labels, masks and signals.
assert train_labels.shape[0] == train_masks.shape[0] == train_signals.shape[0], \
    'train labels/masks/signals differ in length'
assert val_labels.shape[0] == val_masks.shape[0] == val_signals.shape[0], \
    'val labels/masks/signals differ in length'
assert test_labels.shape[0] == test_masks.shape[0] == test_signals.shape[0], \
    'test labels/masks/signals differ in length'

# Every split must contain the same set of classes. Comparing only the number of
# unique classes would let two splits with different classes of equal count pass.
train_classes = np.unique(train_labels)
assert np.array_equal(train_classes, np.unique(val_labels)), \
    'validation split does not contain the same classes as the training split'
assert np.array_equal(train_classes, np.unique(test_labels)), \
    'test split does not contain the same classes as the training split'

In [52]:
# Report the number of samples in the train, validation and test splits (in that order)
print(*(len(split_labels) for split_labels in (train_labels, val_labels, test_labels)))

62528 6948 29776


In [53]:
# Write each split to `save_path` as '<split>_signals.npy', '<split>_labels.npy'
# and '<split>_masks.npy', in train -> val -> test order.
for split_name, split_arrays in (
    ('train', (train_signals, train_labels, train_masks)),
    ('val', (val_signals, val_labels, val_masks)),
    ('test', (test_signals, test_labels, test_masks)),
):
    for array_kind, array in zip(('signals', 'labels', 'masks'), split_arrays):
        np.save(
            os.path.join(save_path, f'{split_name}_{array_kind}.npy'),
            array,
            allow_pickle=True,
        )

In [34]:
# Build the project dataset object for the 'train' split stored under `save_path`, with masks
# requested (presumably it reads the train_*.npy files written to that folder - confirm in src.data.dataset).
# NOTE(review): this rebinds `ds`, which earlier in the notebook held the list of CSV file names.
ds = CropTypeDataset(save_path, 'train', include_masks=True)

In [37]:
# Display the first item returned by the dataset to inspect what it yields
ds[0]

(tensor([[-16.6133, -15.6661, -16.1065, -16.6811, -16.7711, -18.6049, -17.7341,
          -19.2199, -16.9886, -17.6081, -16.9677, -17.4429, -19.6478, -18.3894,
          -17.3364, -16.3715, -15.9568, -17.4481, -18.1410, -17.2195, -17.0704,
          -19.5750, -18.6725, -19.4411, -18.8879, -17.4893, -18.4655, -20.8458,
          -19.9017, -18.1979, -18.5535, -17.7300, -17.9124, -17.0974, -18.3492,
          -17.5191],
         [-10.2536,  -9.7675,  -8.9075, -10.4543, -11.1093, -12.6301, -11.6318,
          -13.1842, -11.2858, -12.7128, -10.9008, -11.0370,  -9.9478, -11.7135,
          -11.3061, -11.0687, -12.1598, -10.4776, -11.8476, -11.9927, -12.7566,
          -13.0993, -12.2197, -10.9117, -11.6222, -10.8223, -11.3570, -11.3515,
          -12.4061, -11.3578, -10.5994, -10.6958, -11.4247, -10.0893, -10.7328,
          -10.6686]]),
 tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
 