In [13]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides numpy RuntimeWarnings (e.g. division by zero
# in the index formulas) — confirm that suppressing them is intended.
warnings.simplefilter('ignore')

# Create Vegetation Index (VI) Datasets

In [14]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from config import interim_data_dir, raw_data_dir

from src.utils import read_shapefile

# Version tag of the generated dataset; used as the output sub-directory name when saving
dataset_version = "v4"

# Features computed for every farm and date. Each entry `<name>` is resolved by name to a
# function called `calc_<name>`, so every name listed here needs a matching function.
vi_methods = [
    'ndvi',
    'cvi',
    'evi',
    'arvi2',
    'datt1',
    'atsavi',
    'maccioni',
    'band_2',
    'band_3',
    'band_4',
    'band_5',
    'band_6',
    'band_7',
    'band_8',
    'band_8a',
    'band_11',
    'band_12',
]


In [15]:
# Band names grouped under the string keys "60", "20" and "10".
# NOTE(review): the keys are presumably the native spatial resolution of each band in
# metres — confirm. This mapping is not referenced by the code visible in this section.
res_groups = {
    "60": ["B01", "B09", "B10"],
    "20": ["B05", "B06", "B07", "B8A", "B11", "B12"],
    "10": ["B02", "B03", "B04", "B08"],
}

In [16]:
def calc_ndvi(bands_data):
    """
    Normalised difference of bands 8 and 4.

    (Band8 - Band4) / (Band8 + Band4)
    """
    b08 = bands_data['B08']
    b04 = bands_data['B04']

    difference = b08 - b04
    total = b08 + b04
    return difference / total

def calc_cvi(bands_data):
    """
    Product of bands 8 and 4 divided by the square of band 3.

    (Band8 * Band4) / (Band3)^2
    """
    b08 = bands_data['B08']
    b04 = bands_data['B04']
    b03 = bands_data['B03']

    numerator = b08 * b04
    denominator = b03 ** 2
    return numerator / denominator

def calc_datt1(bands_data):
    """
    Ratio of two band differences that share band 8 as the minuend.

    (Band8 - Band5) / (Band8 - Band4)
    """
    b08 = bands_data['B08']

    numerator = b08 - bands_data['B05']
    denominator = b08 - bands_data['B04']
    return numerator / denominator

def calc_evi(bands_data):
    """
    Enhanced vegetation index computed from bands 8, 4 and 2.

    2.5 * (Band8 - Band4) / (Band8 + 6 * Band4 - 7.5 * Band2 + 1)
    """
    b08 = bands_data['B08']
    b04 = bands_data['B04']
    b02 = bands_data['B02']

    # The gain is applied to the numerator before dividing, as in the formula above
    scaled_difference = 2.5 * (b08 - b04)
    denominator = b08 + 6 * b04 - 7.5 * b02 + 1
    return scaled_difference / denominator

def calc_arvi2(bands_data):
    """
    Linear rescaling of the NDVI value returned by calc_ndvi.

    -0.18 + 1.17 * NDVI
    """
    ndvi = calc_ndvi(bands_data)
    return -0.18 + 1.17 * ndvi

def calc_atsavi(bands_data):
    """
    ATSAVI computed from bands 8 and 4 with fixed coefficients.

    a * (Band8 - a * Band4 - b) / (Band8 + Band4 - a * b + X * (1 + a^2))

    a = 1.22, b = 0.03, X = 0.08

    NOTE(review): published ATSAVI formulas are usually written with a * Band8 in the
    denominator; this implementation uses plain Band8, exactly as in the formula
    above — confirm against the intended reference.
    """
    coef_a, coef_b, coef_x = 1.22, 0.03, 0.08

    b08 = bands_data['B08']
    b04 = bands_data['B04']

    numerator = coef_a * (b08 - coef_a * b04 - coef_b)
    denominator = b08 + b04 - coef_a * coef_b + coef_x * (1 + coef_a ** 2)
    return numerator / denominator

def calc_maccioni(bands_data):
    """
    Ratio of two band differences that share band 7 as the minuend.

    (Band7 - Band5) / (Band7 - Band4)
    """
    b07 = bands_data['B07']

    numerator = b07 - bands_data['B05']
    denominator = b07 - bands_data['B04']
    return numerator / denominator


# Just the bands: each calc_band_* returns one entry of bands_data unchanged
def _raw_band(bands_data, band_key):
    """Return the value stored under ``band_key`` in ``bands_data``."""
    return bands_data[band_key]

def calc_band_2(bands_data):
    """Return the 'B02' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B02')

def calc_band_3(bands_data):
    """Return the 'B03' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B03')

def calc_band_4(bands_data):
    """Return the 'B04' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B04')

def calc_band_5(bands_data):
    """Return the 'B05' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B05')

def calc_band_6(bands_data):
    """Return the 'B06' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B06')

def calc_band_7(bands_data):
    """Return the 'B07' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B07')

def calc_band_8(bands_data):
    """Return the 'B08' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B08')

def calc_band_8a(bands_data):
    """Return the 'B8A' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B8A')

def calc_band_11(bands_data):
    """Return the 'B11' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B11')

def calc_band_12(bands_data):
    """Return the 'B12' entry of ``bands_data``."""
    return _raw_band(bands_data, 'B12')


In [17]:
# Number of time stamps
# NOTE(review): not referenced by the code visible in this section — confirm it is still needed
N_ts = 11

# Number of bands
# NOTE(review): not referenced by the code visible in this section — confirm it is still needed
N_b = 10

# Aggregation names accepted by agg_arr; each `<name>` selects the numpy function `np.nan<name>`
VALID_AGG_METHODS = ['mean','median','min','max','std']


def load_for_id(farm_id, dataset, bands_subset=None):
    """
    Load all data for a farm

    Reads every ``*.npy`` file found in the per-date sub-directories of
    ``interim_data_dir / 'masks_resampled' / dataset / str(farm_id)``.

    Arguments:

    farm_id: identifier of the farm; ``str(farm_id)`` is the directory name
    dataset: name of the dataset sub-directory (e.g. 'train' or 'test')
    bands_subset: optional collection of band names (file stems such as 'B04').
        When given and non-empty, only those bands are loaded; ``None`` or an
        empty collection loads every band found.

    Return:

    dictionary: {date: {band: ndarray}, {...}} where ``date`` is the stem of the
    date directory name and ``band`` is the stem of the ``.npy`` file name
    """
    farm_dir = interim_data_dir / 'masks_resampled' / dataset / str(farm_id)

    farm_data = {}

    # glob yields entries in arbitrary order; sort so the result is reproducible
    for date_path in sorted(farm_dir.glob('*')):
        # Filter explicitly: before Python 3.11 a pattern ending in '/' also matched files
        if not date_path.is_dir():
            continue

        date = date_path.stem
        farm_data[date] = {}

        for band_path in sorted(date_path.glob("*.npy")):
            band = band_path.stem

            # Skip bands the caller did not ask for (no filtering when bands_subset is empty/None)
            if bands_subset and band not in bands_subset:
                continue

            farm_data[date][band] = np.load(band_path)

    return farm_data

def agg_arr(arr, agg_method):
    """
    aggregate an array with a nan-aware numpy method

    Arguments:

    arr: array-like of numbers; it is flattened to 1-D before aggregating
    agg_method: one of VALID_AGG_METHODS; ``'<name>'`` selects ``np.nan<name>``

    Return:

    the scalar result of the selected numpy function

    Raises:

    ValueError: if ``agg_method`` is not listed in VALID_AGG_METHODS
    """
    # Explicit check instead of `assert`, which is removed when Python runs with -O
    if agg_method not in VALID_AGG_METHODS:
        raise ValueError(
            f"agg_method must be one of {VALID_AGG_METHODS}, got {agg_method!r}"
        )

    # Attribute lookup on numpy instead of eval()-ing a constructed string
    nan_reducer = getattr(np, f'nan{agg_method}')
    return nan_reducer(np.ravel(arr))

def calc_vi_ts(farm_data, farm_id, agg_methods, which_vi='ndvi'):
    """
    Calculate vegetation index time series for a farm

    Arguments:

    farm_data: dictionary {date: bands_data}; each ``bands_data`` is passed to the
        ``calc_<which_vi>`` function and every ``date`` key must be parseable by
        ``pd.to_datetime``
    farm_id: identifier used as the outer index level of the result
    agg_methods: list of aggregation names, each passed to ``agg_arr``
    which_vi: name of the index; the module-level function ``calc_<which_vi>`` is used

    Return:

    DataFrame indexed by (farm_id, time) with one column per aggregation,
    named ``<which_vi>_<agg_method>``

    Raises:

    ValueError: if no callable named ``calc_<which_vi>`` exists at module level
    """
    calc_name = f'calc_{which_vi}'

    # Resolve the calculator once, by name, instead of eval()-ing a string per date
    calc_vi = globals().get(calc_name)
    if not callable(calc_vi):
        raise ValueError(
            f"Unknown vegetation index {which_vi!r}: no function named {calc_name}"
        )

    columns = [f'{which_vi}_{agg_method}' for agg_method in agg_methods]

    vi_data = {}
    for date, bands_data in farm_data.items():
        # Compute the index once per date and reuse it for every aggregation
        vi_values = calc_vi(bands_data)
        vi_data[date] = [agg_arr(vi_values, agg_method) for agg_method in agg_methods]

    vi = pd.DataFrame.from_dict(vi_data, orient='index', columns=columns)
    vi.index = pd.to_datetime(vi.index)
    vi.index.name = 'time'

    # Wrapping in concat with `keys` adds farm_id as the outer index level
    return pd.concat([vi], keys=[farm_id], names=['farm_id'])

def create_features_dataset(dataset='train', bands_subset=None, agg_methods=['mean','median'], limit_farms=None, add_bands=True):
    """
    Build one long table of aggregated features for every farm of a dataset

    For each farm id in the index of ``read_shapefile(dataset)`` the band data is
    loaded with ``load_for_id`` and one time series per entry of ``vi_methods`` is
    computed with ``calc_vi_ts``; the series are placed side by side and the farms
    stacked on top of each other.

    Arguments:

    dataset: dataset name passed to ``read_shapefile`` and ``load_for_id``;
        for 'train' the ``y`` column of the shapefile table is joined on farm_id
    bands_subset: forwarded to ``load_for_id``
    agg_methods: aggregation names forwarded to ``calc_vi_ts``
    limit_farms: if truthy, only the first ``limit_farms`` farm ids are processed
    add_bands: accepted but not used by this function

    Return:

    DataFrame with columns farm_id, time, the feature columns (and ``y`` for
    'train'), sorted by farm_id then time, with a fresh 0..n-1 index
    """
    farms = read_shapefile(dataset)

    farm_ids = farms.index.to_list()
    if limit_farms:
        farm_ids = farm_ids[:limit_farms]

    def _farm_features(farm_id):
        # All requested features for one farm, one column block per vi_method
        bands_by_date = load_for_id(farm_id, dataset, bands_subset=bands_subset)
        per_vi = [
            calc_vi_ts(bands_by_date, farm_id, agg_methods, vi_method)
            for vi_method in vi_methods
        ]
        return pd.concat(per_vi, axis=1)

    per_farm = [_farm_features(farm_id) for farm_id in tqdm(farm_ids, "farms")]

    # reset_index turns the (farm_id, time) index levels into ordinary columns
    features = pd.concat(per_farm, axis=0).reset_index()

    if dataset == 'train':
        features = features.join(farms[['y']], on='farm_id')

    features['time'] = pd.to_datetime(features['time'])

    # Order each farm chronologically, then discard the shuffled row labels
    features = features.sort_values(by=['farm_id', 'time']).reset_index(drop=True)

    return features
    
    

In [18]:
# Build the feature tables for both splits with the same aggregations
rule = '-' * 50

print(rule, 'Train', rule)
train_features_df = create_features_dataset(dataset='train', agg_methods=['mean', 'median'])

print('\n')
print(rule, 'Test', rule)
test_features_df = create_features_dataset(dataset='test', agg_methods=['mean', 'median'])

-------------------------------------------------- Train --------------------------------------------------


farms: 100%|██████████| 2494/2494 [05:40<00:00,  7.03it/s]




-------------------------------------------------- Test --------------------------------------------------


farms: 100%|██████████| 1074/1074 [02:23<00:00,  7.64it/s]


## Save to disk

In [19]:
from config import processed_data_dir
from src.utils import safe_create_dir

# Each split is written to processed_data_dir / 'VI_datasets' / dataset_version / <file name>
out_dir = processed_data_dir / 'VI_datasets' / dataset_version
safe_create_dir(out_dir)

for file_name, split_df in (('train.csv', train_features_df), ('test.csv', test_features_df)):
    # to_csv is called with its defaults, so the row index is written as the first column
    split_df.to_csv(out_dir / file_name)