In [1]:
import glob
import joblib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xarray as xr

from scipy.signal import savgol_filter

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Folder name; not referenced by any cell visible in this notebook.
FOLDER = 'adaptative_factor_2'
# Spectral band names (the same names compute_vi reads as dataset attributes).
BANDS = ['red', 'green', 'blue', 'rededge1', 'rededge2', 'rededge3', 'nir', 'swir']
# Vegetation-index variable names; same content as S_COLUMNS below.
VI = ['ndvi', 'savi', 'evi', 'rep','osavi','rdvi','mtvi1','lswi']
# Weather ("M") feature columns; 'solarexposure' is derived in features_modification.
M_COLUMNS = ['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'windspeed', 'winddir', 
             'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex', 'moonphase', 'solarexposure']

# Vegetation-index ("S") columns: cleaned, smoothed and scaled as one group.
S_COLUMNS = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']
# Per-field ("G") columns taken from the observation CSV.
G_COLUMNS = ['Field size (ha)', 'Rice Crop Intensity(D=Double, T=Triple)']
# Name of the regression target column (only kept when test is False).
TARGET = 'Rice Yield (kg/ha)'

# Load the pre-processed test dataset and display it.
xdf = xr.open_dataset('../../data/processed/augment_10_5/test.nc')
xdf

In [2]:
def add_observation(xdf: xr.Dataset, test: bool)->xr.Dataset:
    """Merge the raw observation table into ``xdf``.

    Args:
        xdf: Dataset to extend.
        test: When True read ``../../data/raw/test.csv``, otherwise
            ``../../data/raw/train.csv``.

    Returns:
        The result of ``xr.merge`` on ``xdf`` and the CSV content, whose row
        index is exposed as the ``ts_obs`` dimension.
    """
    csv_path = '../../data/raw/test.csv' if test else '../../data/raw/train.csv'
    # Naming the default row index makes to_xarray() use 'ts_obs' as dimension.
    observations = pd.read_csv(csv_path).rename_axis('ts_obs')
    return xr.merge([xdf, observations.to_xarray()])

# test=True, so the observations are read from '../../data/raw/test.csv'.
xdf = add_observation(xdf, True)
xdf

In [3]:
def add_weather(xdf: xr.Dataset)->xr.Dataset:
    """Merge every weather CSV found in ``../../data/raw/weather`` into ``xdf``.

    The CSVs are concatenated, indexed by (``datetime``, ``name``) and turned
    into a Dataset; spaces in ``name`` become underscores and the ``datetime``
    coordinate is stored as ``'%Y-%m-%d'`` strings before merging.

    Args:
        xdf: Dataset to extend.

    Returns:
        The result of ``xr.merge`` on ``xdf`` and the weather Dataset.
    """
    frames = [pd.read_csv(csv_path) for csv_path in glob.glob('../../data/raw/weather/*.csv')]

    df_weather = (
        pd.concat(frames, axis='index')
        .assign(
            datetime=lambda frame: pd.to_datetime(frame['datetime']),
            name=lambda frame: frame['name'].str.replace(' ', '_'),
        )
        .set_index(['datetime', 'name'])
    )

    xdf_weather = df_weather.to_xarray().set_coords(['datetime', 'name'])
    # Store the dates as 'YYYY-MM-DD' strings on the coordinate.
    xdf_weather['datetime'] = xdf_weather['datetime'].dt.strftime('%Y-%m-%d')

    return xr.merge([xdf, xdf_weather])

# Merge the weather CSVs into the dataset and display the result.
xdf = add_weather(xdf)
xdf

In [4]:
def compute_vi(xdf: xr.Dataset)->xr.Dataset:
    """Add eight vegetation-index variables computed from the spectral bands.

    Reads the ``red``, ``green``, ``blue``, ``rededge1``, ``rededge2``,
    ``rededge3``, ``nir`` and ``swir`` variables and stores ``ndvi``, ``savi``,
    ``evi``, ``rep``, ``osavi``, ``rdvi``, ``mtvi1`` and ``lswi`` on ``xdf``.

    Args:
        xdf: Dataset holding the band variables; it is modified in place.

    Returns:
        The same dataset with the index variables added.

    Note:
        Divisions are not guarded: a zero denominator yields inf/NaN, which
        must be cleaned afterwards.
    """

    def compute_ndvi(xdf: xr.Dataset)->xr.Dataset:
        return (xdf.nir - xdf.red) / (xdf.nir + xdf.red)

    def compute_savi(xdf, L=0.5):
        # Standard SAVI: the soil factor multiplies the whole ratio as (1 + L).
        # The previous form `1 + L * ...` parsed as `1 + (L * ratio)`.
        # NOTE(review): this changes the 'savi' values; any scaler or model
        # fitted on the old values must be refit.
        return (1 + L) * (xdf.nir - xdf.red) / (xdf.nir + xdf.red + L)

    def compute_evi(xdf, G=2.5, L=1, C1=6, C2=7.5):
        return G * (xdf.nir - xdf.red) / (xdf.nir + C1 * xdf.red - C2 * xdf.blue + L)

    def compute_rep(xdf: xr.Dataset)->xr.Dataset:
        # Linear-interpolation red-edge position.
        # NOTE(review): some references use a 705 nm offset instead of 704 — confirm.
        rededge = (xdf.red + xdf.rededge3) / 2
        return 704 + 35 * (rededge - xdf.rededge1) / (xdf.rededge2 - xdf.rededge1)

    def compute_osavi(xdf: xr.Dataset)->xr.Dataset:
        return (xdf.nir - xdf.red) / (xdf.nir + xdf.red + 0.16)

    def compute_rdvi(xdf: xr.Dataset)->xr.Dataset:
        return (xdf.nir - xdf.red) / np.sqrt(xdf.nir + xdf.red)

    def compute_mtvi1(xdf: xr.Dataset)->xr.Dataset:
        return 1.2 * (1.2 * (xdf.nir - xdf.green) - 2.5 * (xdf.red - xdf.green))

    def compute_lswi(xdf: xr.Dataset)->xr.Dataset:
        return (xdf.nir - xdf.swir) / (xdf.nir + xdf.swir)

    # Compute all vegetation indices.
    xdf['ndvi'] = compute_ndvi(xdf)
    xdf['savi'] = compute_savi(xdf)
    xdf['evi'] = compute_evi(xdf)
    xdf['rep'] = compute_rep(xdf)
    xdf['osavi'] = compute_osavi(xdf)
    xdf['rdvi'] = compute_rdvi(xdf)
    xdf['mtvi1'] = compute_mtvi1(xdf)
    xdf['lswi'] = compute_lswi(xdf)

    return xdf

# Add the eight vegetation-index variables and display the dataset.
xdf = compute_vi(xdf)
xdf

In [5]:
def statedev_fill(xdf: xr.Dataset)->xr.Dataset:
    """Replace infinities in the vegetation indices and fill missing values.

    Steps:
        1. +/-inf values of the numeric ``S_COLUMNS`` variables become NaN.
        2. NaNs are filled with the mean over the ``ts_aug`` dimension.
        3. Remaining NaNs are filled with the mean of those means over
           ``ts_obs``.

    Args:
        xdf: Dataset containing the ``S_COLUMNS`` variables and the
            ``ts_aug`` / ``ts_obs`` dimensions.

    Returns:
        A new dataset with the values filled; ``xdf`` itself is not modified.
    """
    # Mask infinities and write the result back explicitly. The previous code
    # discarded the return value of xr.apply_ufunc and relied on the callback
    # mutating the dataset's buffers in place.
    cleaned = {
        name: xdf[name].where(np.isfinite(xdf[name]))
        for name in S_COLUMNS
        if np.issubdtype(xdf[name].dtype, np.number)
    }
    xdf = xdf.assign(cleaned)

    # Mean over the augmentation dimension, used as first-level fill value.
    xdf_mean = xdf.mean(dim='ts_aug', skipna=True)
    xdf = xdf.fillna(xdf_mean)

    # Mean of those means over all observations, to fill what is still missing.
    xdf_mean = xdf_mean.mean(dim='ts_obs', skipna=True)
    xdf = xdf.fillna(xdf_mean)

    return xdf

# Replace infinities in the vegetation indices and fill NaNs with means.
xdf = statedev_fill(xdf)
xdf

In [6]:
def app_savgol_filter(arr: np.ndarray, axis, window_length, polyorder, mode):
    """Apply ``scipy.signal.savgol_filter`` and report whether NaNs remain.

    Args:
        arr: Data to filter.
        axis: Axis along which the filter is applied.
        window_length: Length of the filter window.
        polyorder: Order of the fitted polynomial.
        mode: Edge-handling mode forwarded to ``savgol_filter``.

    Returns:
        The filtered array. As a side effect, prints True when the result
        contains at least one NaN and False otherwise.
    """
    filtered = savgol_filter(arr, window_length, polyorder, axis=axis, mode=mode)
    has_nan = np.isnan(filtered).any()
    print(has_nan)
    return filtered

def smooth(xdf: xr.Dataset)->xr.Dataset:
    """Smooth the ``S_COLUMNS`` variables with a Savitzky-Golay filter.

    The filter is applied along axis 2 with a window of 12 samples, a
    polynomial of order 4 and ``'mirror'`` edge handling.

    Args:
        xdf: Dataset containing the ``S_COLUMNS`` variables.

    Returns:
        A merged dataset in which the smoothed variables take precedence over
        the originals; all other variables come from ``xdf``.
    """
    savgol_options = {'axis': 2, 'window_length': 12, 'polyorder': 4, 'mode': 'mirror'}
    smoothed = xr.apply_ufunc(savgol_filter, xdf[S_COLUMNS], kwargs=savgol_options)
    # With compat='override' the first dataset listed wins on name clashes.
    return xr.merge([smoothed, xdf], compat='override')

# Smooth the vegetation indices with a Savitzky-Golay filter.
xdf = smooth(xdf)
xdf

In [7]:
def categorical_encoding(xdf: xr.Dataset)->xr.Dataset:
    """Encode the crop-intensity letters as small integers.

    In ``'Rice Crop Intensity(D=Double, T=Triple)'`` every ``"D"`` is replaced
    by ``"2"`` and every ``"T"`` by ``"3"``; the result is cast to ``int8``.

    Args:
        xdf: Dataset holding the crop-intensity variable; modified in place.

    Returns:
        The same dataset with the encoded variable.
    """
    intensity_key = 'Rice Crop Intensity(D=Double, T=Triple)'
    encoded = xdf[intensity_key].str.replace("D", "2")
    encoded = encoded.str.replace("T", "3")
    xdf[intensity_key] = encoded.astype(np.int8)
    return xdf

# Encode the crop-intensity letters D/T as the integers 2/3.
xdf = categorical_encoding(xdf)
xdf

In [8]:
def features_modification(xdf: xr.Dataset, test: bool)->xr.Dataset:
    """Derive ``solarexposure``, convert date fields and keep model columns.

    ``sunrise``, ``sunset``, ``time`` and ``datetime`` are cast to
    ``np.datetime64``; ``solarexposure`` is the ``.dt.seconds`` component of
    ``sunset - sunrise``; ``time`` is demoted from coordinate to data variable.

    Args:
        xdf: Dataset to transform; the casts are written onto it in place.
        test: When False the ``TARGET`` column is kept as well.

    Returns:
        Dataset restricted to ``S_COLUMNS + G_COLUMNS + M_COLUMNS + ['time']``
        (plus ``TARGET`` when ``test`` is False).
    """
    for name in ('sunrise', 'sunset'):
        xdf[name] = xdf[name].astype(np.datetime64)

    daylight = xdf['sunset'] - xdf['sunrise']
    xdf['solarexposure'] = daylight.dt.seconds

    for name in ('time', 'datetime'):
        xdf[name] = xdf[name].astype(np.datetime64)
    xdf = xdf.reset_coords('time')

    # 'time' is kept because it is the key linking rows to the weather data.
    columns = [*S_COLUMNS, *G_COLUMNS, *M_COLUMNS, 'time']
    if not test:
        columns.append(TARGET)

    return xdf[columns]

# test=True: TARGET is not added to the selected columns.
xdf = features_modification(xdf, True)
xdf

In [11]:
from typing import Union

class DatasetScaler:
    """Fit and apply one scikit-learn scaler per column group of a Dataset.

    After ``fit`` the instance holds:
        scaler_s: StandardScaler fitted on ``S_COLUMNS``.
        scaler_g: StandardScaler fitted on ``G_COLUMNS``.
        scaler_m: StandardScaler fitted on ``M_COLUMNS``.
        scaler_t: MinMaxScaler fitted on ``[TARGET]``.
    """

    def __init__(self) -> None:
        pass

    def fit(self, xdf: xr.Dataset):
        """Fit the four scalers on ``xdf``, which must contain ``TARGET``."""
        def fit_scaler(xdf: xr.Dataset, columns: list[str], mode: str='standard'):
            if mode == 'standard':
                scaler = StandardScaler()
            elif mode == 'minmax':
                scaler = MinMaxScaler()
            else:
                # Previously an unknown mode left `scaler` unbound and failed
                # later with an UnboundLocalError.
                raise ValueError(f"unknown scaler mode {mode!r}; expected 'standard' or 'minmax'")
            df = xdf[columns].to_dataframe()
            scaler.fit(df[columns])
            return scaler

        # Fit S data scaler
        self.scaler_s = fit_scaler(xdf, S_COLUMNS)
        # Fit G data scaler
        self.scaler_g = fit_scaler(xdf, G_COLUMNS)
        # Fit M data scaler
        self.scaler_m = fit_scaler(xdf, M_COLUMNS)
        # Fit Target data scaler
        self.scaler_t = fit_scaler(xdf, [TARGET], 'minmax')

    def transform(self, xdf: xr.Dataset, target: bool=False)->xr.Dataset:
        """Scale the S, G and M column groups, and ``TARGET`` when requested.

        Args:
            xdf: Dataset to scale; ``fit`` must have been called beforehand.
            target: When True the ``TARGET`` column is scaled as well.

        Returns:
            A merged dataset in which the scaled variables replace the originals.
        """
        def transform_data(xdf: xr.Dataset, columns: list[str], scaler: Union[StandardScaler, MinMaxScaler])->xr.Dataset:
            df = xdf[columns].to_dataframe()
            # Replace the columns outright. `df.loc[:, columns] = ...` writes
            # into the existing columns and keeps their dtype where it can,
            # which is unsafe for integer columns receiving scaled floats.
            df[columns] = scaler.transform(df[columns])
            xdf_scale = df.to_xarray()
            # The scaled dataset is listed first so it wins on name clashes.
            xdf = xr.merge([xdf_scale, xdf], compat='override')
            return xdf

        # Scale S data
        xdf = transform_data(xdf, S_COLUMNS, self.scaler_s)
        # Scale G data
        xdf = transform_data(xdf, G_COLUMNS, self.scaler_g)
        # Scale M data
        xdf = transform_data(xdf, M_COLUMNS, self.scaler_m)

        if target:
            # Scale target data
            xdf = transform_data(xdf, [TARGET], self.scaler_t)

        return xdf

    def fit_transform(self, xdf: xr.Dataset)->xr.Dataset:
        """Fit on ``xdf`` and return it scaled, target included."""
        self.fit(xdf)
        xdf = self.transform(xdf, True)
        return xdf
        

def scale_data(xdf: xr.Dataset, path: str, test: bool)->xr.Dataset:
    """Scale ``xdf`` with a ``DatasetScaler`` stored next to ``path``.

    Args:
        xdf: Dataset to scale.
        path: Path of the dataset file; the scaler file is
            ``scaler_dataset.joblib`` in the same directory.
        test: When False a new scaler is fitted on ``xdf`` (target included).
            When True a previously saved scaler is loaded and applied without
            touching the target.

    Returns:
        The scaled dataset.
    """
    import os

    # Path of the scaler file. os.path keeps a bare file name relative to the
    # current directory; the previous '/'.join(path.split('/')[:-1]) pointed
    # at the filesystem root for a path without a directory part.
    path = os.path.join(os.path.dirname(path), "scaler_dataset.joblib")

    if not test:
        scaler = DatasetScaler()
        xdf = scaler.fit_transform(xdf)
        # NOTE(review): saving is disabled here, so test mode depends on a
        # scaler file written elsewhere — confirm this is intended.
        # joblib.dump(scaler, path)
    else:
        # joblib.load unpickles the file: only load trusted files. The class
        # must also be importable under the module it was pickled from.
        scaler: DatasetScaler = joblib.load(path)
        xdf = scaler.transform(xdf)

    return xdf

# test=True: the scaler is loaded from 'scaler_dataset.joblib' next to test.nc.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# 'datascaler'; presumably the saved scaler was pickled from a module of that
# name — confirm and make that module importable.
xdf = scale_data(xdf, '../../data/processed/augment_10_5/test.nc', True)
xdf

ModuleNotFoundError: No module named 'datascaler'

In [None]:
def create_id(xdf: xr.Dataset)->xr.Dataset:
    """Attach a unique integer ``ts_id`` to every (``ts_obs``, ``ts_aug``) pair.

    Ids run from 0 to ``n_obs * n_aug - 1`` in row-major order, so each
    ``ts_obs`` owns a contiguous range of ``n_aug`` ids.

    Args:
        xdf: Dataset with ``ts_obs`` and ``ts_aug`` dimensions.

    Returns:
        A dataset with the extra 2-D ``ts_id`` coordinate.
    """
    # `sizes` is the name -> length mapping; using `Dataset.dims` as a mapping
    # is deprecated in xarray.
    n_obs, n_aug = xdf.sizes['ts_obs'], xdf.sizes['ts_aug']
    ts_id = np.arange(n_obs * n_aug).reshape((n_obs, n_aug))
    return xdf.assign_coords({'ts_id': (('ts_obs', 'ts_aug'), ts_id)})

# Attach a unique integer id to every (ts_obs, ts_aug) pair and display.
xdf = create_id(xdf)
xdf