In [5]:
import pandas as pd
import numpy as np
import os
import sys
import torch
from torch.utils.data import Dataset, DataLoader

sys.path.append('../../')
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from src.data.data_utils import *
from src.data.dataset import *

In [2]:
# Locations of the raw per-country signal CSVs and of the generated dataset.
data_root = '/scratch/bbug/ayang1/raw_data/lucas/s1_lucas_2018'
save_path = '/scratch/bbug/ayang1/datasets/lucas_dataset_large/'

# Collect ee generated time series data.
# Countries used in an earlier configuration (currently disabled):
#     'Ireland', 'Italy', 'Netherlands', 'Portugal',
#     'Belgium', 'Bulgaria', 'Hungary'
countries = ['Slovenia']

# Every export follows the same naming scheme; only the country part varies.
ds = ['S1_point_10days_10m_1Jan-31Dec_' + country + '_ratio-db.csv' for country in countries]



In [30]:

# Lucas labels
labels = pd.read_csv('/scratch/bbug/ayang1/raw_data/lucas/lucas_2018/copernicus_filtered/lucas_2018_filtered.csv')

# Read every signal CSV listed in `ds` (relative to `data_root`) and stack the
# resulting frames into a single table.
# The stray, indented second read of the labels CSV that used to end this cell
# was removed: it repeated the load above, and its leading indentation is an
# IndentationError in a top-level cell.
dfs = [pd.read_csv(os.path.join(data_root, path)) for path in ds]

data = pd.concat(dfs)


In [31]:
# Add labels to signals based on point id (the matching happens inside add_lucas_labels).
crop_data = add_lucas_labels(data, labels)
crop_data.drop(columns='system:index', inplace=True)

# Show the label values and how many rows carry each of them, before filtering.
label_distribution = np.unique(crop_data['LABEL'], return_counts=True)
print(label_distribution)

# Keep only the rows whose label is something other than 'NOT_CROP'.
is_crop = crop_data['LABEL'] != 'NOT_CROP'
crop_data = crop_data.loc[is_crop]

Loading files
Creating dataset of size 20707


100%|██████████| 20707/20707 [00:05<00:00, 3768.62it/s]


(array(['B11', 'B15', 'B16', 'B18', 'B51', 'B52', 'B55', 'NOT_CROP'],
      dtype=object), array([  125,     4,   234,    69,     8,   100,   126, 20041]))


In [24]:
# Find classes with fewer than MIN_SAMPLES_PER_CLASS samples and, optionally, drop them.
MIN_SAMPLES_PER_CLASS = 1000

# The row filter was commented out in the original cell while the cell still
# printed "Dropped classes". The toggle makes the printed report match what is
# actually done; the default (False) keeps the original behaviour of removing
# nothing. Set it to True to apply the filter.
DROP_RARE_CLASSES = False

# counts[0] holds the label values, counts[1] the number of rows per label.
counts = np.unique(crop_data['LABEL'], return_counts=True)
to_drop = [name for name, n_samples in zip(*counts) if n_samples < MIN_SAMPLES_PER_CLASS]

if DROP_RARE_CLASSES:
    crop_data = crop_data.loc[~crop_data['LABEL'].isin(to_drop)]
    print('Dropped classes:', to_drop)
else:
    print('Classes below threshold (not dropped):', to_drop)
print(len(crop_data))

Dropped classes: ['B11', 'B15', 'B16', 'B18', 'B51', 'B52', 'B55']
666


In [5]:
# Load the table stored as 'lucas_large.csv' under save_path and discard the
# 'Unnamed: 0' and 'POINT_ID' columns.
# (Writing the table is disabled: crop_data.to_csv(os.path.join(save_path, '')))
large_csv = os.path.join(save_path, 'lucas_large.csv')
crop_data = pd.read_csv(large_csv).drop(columns=['Unnamed: 0', 'POINT_ID'])

In [None]:
# Preview the first rows of crop_data.
crop_data.head()

Unnamed: 0,POINT_ID,VHVV_20180101,VHVV_20180111,VHVV_20180121,VHVV_20180131,VHVV_20180210,VHVV_20180220,VHVV_20180302,VHVV_20180312,VHVV_20180322,...,VV_20181008,VV_20181018,VV_20181028,VV_20181107,VV_20181117,VV_20181127,VV_20181207,VV_20181217,.geo,LABEL
6396,46682550,0.161721,0.153333,0.138939,0.15357,0.132272,0.263998,0.220625,0.163274,0.145163,...,-10.978528,-12.056416,-10.774232,-9.553301,-9.440448,-10.040397,-11.787124,-12.719151,"{""type"":""MultiPoint"",""coordinates"":[]}",B11
6397,46682550,0.136804,0.118816,0.12069,0.139844,0.125986,0.257054,0.200515,0.120664,0.139629,...,-11.474958,-10.978122,-10.350073,-8.732989,-9.447535,-9.61753,-11.726073,-11.28205,"{""type"":""MultiPoint"",""coordinates"":[]}",B11
6398,46682550,0.200223,0.123761,0.162866,0.16213,0.185823,0.26306,0.173952,0.106127,0.089005,...,-10.308127,-9.005195,-9.401302,-8.783696,-8.736219,-9.261295,-11.213137,-10.753014,"{""type"":""MultiPoint"",""coordinates"":[]}",B11
6399,46682550,0.186205,0.130751,0.144223,0.180731,0.220439,0.300093,0.21738,0.131332,0.108336,...,-9.953787,-8.617062,-8.267812,-8.489451,-6.921855,-8.259746,-9.347631,-8.49242,"{""type"":""MultiPoint"",""coordinates"":[]}",B11
6400,46682550,0.182978,0.160903,0.219331,0.178556,0.084817,0.099239,0.217228,0.097813,0.153357,...,-11.015224,-11.433616,-9.907618,-10.787681,-9.382615,-11.181036,-11.785312,-12.185155,"{""type"":""MultiPoint"",""coordinates"":[]}",B11


In [6]:
# Turn the table into arrays. Columns are selected by hard-coded position, so
# this depends on the exact column order of crop_data.
arr = crop_data.to_numpy()

# Columns 0-35 are not used (presumably the VH/VV ratio, going by the name):
# vhvv = arr[:, np.newaxis, 0:36]
vh = arr[:, 36:72].astype(np.float32)[:, np.newaxis, :]
vv = arr[:, 72:108].astype(np.float32)[:, np.newaxis, :]
labels = arr[:, 109].astype(str)

# Stack along axis 1: index 0 holds vh, index 1 holds vv.
data = np.concatenate((vh, vv), axis=1)

In [13]:
# Generate masks
# Every sample gets a prefix mask: ones up to a random cut point, zeros from
# the cut point onwards.
# A dedicated, seeded generator makes the masks reproducible across kernel
# restarts (the original drew from the unseeded global NumPy random state).
MASK_SEED = 42
mask_rng = np.random.default_rng(MASK_SEED)

seq_len = vv.shape[2]
n_samples = data.shape[0]

# Cut points are drawn from [0, seq_len), the same range as before: a cut of 0
# zeroes the whole sequence, and the last timestep is always zeroed.
# NOTE(review): confirm that fully-zero masks are intended.
cut_points = mask_rng.integers(0, seq_len, size=n_samples)
mask_matrix = (np.arange(seq_len) < cut_points[:, np.newaxis]).astype(np.float64)

# Kept as a list of 1-D float64 arrays, the same container the loop version built.
masks = list(mask_matrix)

[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [16]:
# Two-stage split with a fixed seed: first hold out 30% as the test set, then
# take 10% of the remainder as the validation set.
split_seed = 42
trainval_signals, test_signals, trainval_labels, test_labels = train_test_split(
    data, labels, test_size=0.3, random_state=split_seed
)
train_signals, val_signals, train_labels, val_labels = train_test_split(
    trainval_signals, trainval_labels, test_size=0.1, random_state=split_seed
)

In [17]:
# Cut the mask list into three contiguous chunks whose sizes match the
# train / val / test signal splits.
n_train = train_signals.shape[0]
n_val = val_signals.shape[0]

train_masks = np.array(masks[:n_train])
val_masks = np.array(masks[n_train:n_train + n_val])
test_masks = np.array(masks[n_train + n_val:])

# Each split must have one label and one mask per signal.
assert train_labels.shape[0] == train_masks.shape[0] == train_signals.shape[0], 'train split sizes disagree'
assert val_labels.shape[0] == val_masks.shape[0] == val_signals.shape[0], 'val split sizes disagree'
assert test_labels.shape[0] == test_masks.shape[0] == test_signals.shape[0], 'test split sizes disagree'

# Compare the label sets themselves rather than only their sizes: two splits can
# have the same number of distinct classes and still contain different classes.
train_classes, val_classes, test_classes = (
    set(np.unique(split_labels)) for split_labels in (train_labels, val_labels, test_labels)
)
assert train_classes == val_classes == test_classes, 'train/val/test do not contain the same classes'

In [18]:
# Report the number of samples in the train, val and test splits (in that order).
split_sizes = [len(split_labels) for split_labels in (train_labels, val_labels, test_labels)]
print(*split_sizes)

62528 6948 29776


In [20]:
# Write the mask array of every split to <save_path>/vh_vv_full/<split>_masks.npy.
# Saving of <split>_signals.npy / <split>_labels.npy under save_path is disabled
# (those calls were commented out); only the masks are written here.
mask_dir = os.path.join(save_path, 'vh_vv_full')

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(mask_dir, exist_ok=True)

for split_name, split_masks in (('train', train_masks), ('val', val_masks), ('test', test_masks)):
    np.save(os.path.join(mask_dir, split_name + '_masks.npy'), split_masks, allow_pickle=True)

In [3]:
# Build the dataset from the vh_vv_full folder, requesting the 'train' split with masks.
dataset_dir = save_path + 'vh_vv_full/'
ds = CropTypeDataset(dataset_dir, 'train', include_masks=True)

In [6]:
# Serve the dataset in batches of 7 samples.
dataloader = DataLoader(dataset=ds, batch_size=7)

In [13]:
# Pull only the first batch out of the loader for inspection, then stop.
with torch.no_grad():
    progress = tqdm(dataloader, total=len(dataloader))
    for batch in progress:
        # Each batch unpacks into three items; m is applied as the mask further down.
        x, y, m = batch
        break

  0%|          | 0/8933 [00:01<?, ?it/s]


In [30]:
# Apply the mask m to every slice of x along dimension 1, then reassemble the
# slices along that same dimension.
channels = torch.unbind(x, dim=1)
masked = torch.stack([channel * m for channel in channels], dim=1)

In [33]:
# Display the unmasked batch for comparison with `masked`.
x

tensor([[[-16.6133, -15.6661, -16.1065, -16.6811, -16.7711, -18.6049, -17.7341,
          -19.2199, -16.9886, -17.6081, -16.9677, -17.4429, -19.6478, -18.3894,
          -17.3364, -16.3715, -15.9568, -17.4481, -18.1410, -17.2195, -17.0704,
          -19.5750, -18.6725, -19.4411, -18.8879, -17.4893, -18.4655, -20.8458,
          -19.9017, -18.1979, -18.5535, -17.7300, -17.9124, -17.0974, -18.3492,
          -17.5191],
         [-10.2536,  -9.7675,  -8.9075, -10.4543, -11.1093, -12.6301, -11.6318,
          -13.1842, -11.2858, -12.7128, -10.9008, -11.0370,  -9.9478, -11.7135,
          -11.3061, -11.0687, -12.1598, -10.4776, -11.8476, -11.9927, -12.7566,
          -13.0993, -12.2197, -10.9117, -11.6222, -10.8223, -11.3570, -11.3515,
          -12.4061, -11.3578, -10.5994, -10.6958, -11.4247, -10.0893, -10.7328,
          -10.6686]],

        [[-21.9999, -21.4414, -22.2465, -21.6359, -21.1325, -20.9929, -16.2642,
          -17.0048, -18.5600, -16.4261, -18.8989, -18.2951, -21.8485, -19.07

In [32]:
# Display the batch after the mask has been applied.
masked

tensor([[[-16.6133, -15.6661, -16.1065, -16.6811, -16.7711, -18.6049, -17.7341,
          -19.2199, -16.9886, -17.6081, -16.9677, -17.4429, -19.6478, -18.3894,
          -17.3364, -16.3715, -15.9568, -17.4481, -18.1410, -17.2195, -17.0704,
          -19.5750, -18.6725, -19.4411,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
           -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
           -0.0000],
         [-10.2536,  -9.7675,  -8.9075, -10.4543, -11.1093, -12.6301, -11.6318,
          -13.1842, -11.2858, -12.7128, -10.9008, -11.0370,  -9.9478, -11.7135,
          -11.3061, -11.0687, -12.1598, -10.4776, -11.8476, -11.9927, -12.7566,
          -13.0993, -12.2197, -10.9117,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
           -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
           -0.0000]],

        [[-21.9999,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
           -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.00