In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and enable mode 2 so imported modules are
# re-imported before each cell runs (edits to project modules are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings from pandas -- consider narrowing the filter.
warnings.simplefilter('ignore')

# Create tsfresh Dataset


In [16]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from config import interim_data_dir, processed_data_dir

from src.utils import safe_create_dir

# Version tag of the dataset; it selects the input sub-directory below and is
# reused for the output sub-directory when the results are saved.
dataset_version = 'v1'

# Directory the train/test VI csv files are read from.
input_dir = processed_data_dir / 'VI_datasets' / dataset_version

## Load VI Dataset (longform)

In [17]:
def impute_nan_inf(df):
    """
    Impute bad values (nan and inf) using
    the median of that feature on that time stamp
    for all farms

    Only numeric columns are imputed. Non-numeric columns (for example a
    string label column) are passed through untouched; taking their median
    raises a TypeError on recent pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form frame with a 'time' column used as the grouping key.

    Returns
    -------
    pandas.DataFrame
        A new frame with nan/inf replaced; the input frame is not modified.
        Values stay nan when every row of that timestamp is missing for
        the feature.

    TODO: This can be improved
    """

    # replace all nan variants with np.nan (returns a new frame)
    df = df.replace([np.nan, None, np.inf, -np.inf], np.nan)

    # the grouping key itself is never imputed
    numeric_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != 'time'
    ]
    if not numeric_cols:
        return df

    # per-timestamp median broadcast back to the shape of the frame; this is
    # positionally aligned, so it also works with a non-unique index
    medians = df.groupby('time')[numeric_cols].transform('median')
    df[numeric_cols] = df[numeric_cols].fillna(medians)

    return df

# Read each split (first csv column becomes the index) and impute its
# nan/inf values before it is used further.
train_raw = pd.read_csv(input_dir / 'train.csv', index_col=0)
test_raw = pd.read_csv(input_dir / 'test.csv', index_col=0)

train_df, test_df = impute_nan_inf(train_raw), impute_nan_inf(test_raw)
del train_raw, test_raw

In [18]:
# Preview the first rows of the imputed training frame.
train_df.head()

Unnamed: 0,farm_id,time,ndvi_min,ndvi_mean,ndvi_median,ndvi_std,ndvi_max,cvi_min,cvi_mean,cvi_median,...,evi_mean,evi_median,evi_std,evi_max,arvi2_min,arvi2_mean,arvi2_median,arvi2_std,arvi2_max,y
0,1,2017-01-01,0.454174,0.671766,0.692653,0.065668,0.745895,0.001221,3.001651,0.947159,...,2.474928,0.0,2.691551,6.758784,0.351384,0.605966,0.630404,0.076831,0.692697,Vineyard
1,1,2017-01-31,0.586915,0.718396,0.729027,0.046242,0.781602,0.047159,2.744553,1.052384,...,2.208423,0.0,2.330548,5.673348,0.506691,0.660523,0.672961,0.054103,0.734474,Vineyard
2,1,2017-02-10,0.633239,0.746055,0.755768,0.038175,0.79881,0.031403,1.97707,1.065654,...,2.510243,0.0,2.629651,5.792201,0.56089,0.692884,0.704249,0.044665,0.754608,Vineyard
3,1,2017-03-12,0.514993,0.668629,0.684211,0.065733,0.765019,0.00361,2.180335,0.865489,...,3.435664,0.0,3.770609,11.882745,0.422541,0.602296,0.620526,0.076907,0.715073,Vineyard
4,1,2017-03-22,0.511155,0.659258,0.673052,0.055294,0.747845,0.010952,2.783775,1.226074,...,4.197762,0.0,4.720135,14.271617,0.418052,0.591332,0.60747,0.064695,0.694979,Vineyard


## tsfresh

In [19]:
from tsfresh import extract_relevant_features, extract_features

In [41]:
# Column-name fragments of the features to keep.
reduced_features_list = ['ndvi', 'reip', 'dartt3']

# Keep the id/sort columns plus every column whose name matches any fragment
# (the fragments are OR-ed into a single pattern).
feature_pattern = '|'.join(reduced_features_list)
matches_feature = train_df.columns.str.contains(feature_pattern)
reduced_cols_list = ['farm_id', 'time'] + list(train_df.columns[matches_feature])

# One label per farm: the first distinct `y` value seen for that farm.
labels = train_df.groupby('farm_id').apply(lambda farm_rows: farm_rows.y.unique()[0])
labels.name = 'y'

In [None]:
# Restrict both splits to the id, sort and selected feature columns.
train_reduced, test_reduced = (
    frame[reduced_cols_list] for frame in (train_df, test_df)
)

# Extract features per farm (time-ordered) and keep those relevant to `labels`.
train_fresh = extract_relevant_features(
    train_reduced,
    labels,
    column_id='farm_id',
    column_sort='time',
    n_jobs=10,
)

In [42]:
# Attach the per-farm label as column 'y' (DataFrame.join aligns on the index;
# presumably both sides are indexed by farm_id -- verify).
# Built from `train_fresh` because no earlier cell defines `train_selected`:
# the previous self-join raised NameError on a fresh kernel and failed on a
# second run because column 'y' already existed.
train_selected = train_fresh.join(labels)

In [35]:
# Extract the full feature set for the test farms (no labels available, so no
# relevance filtering is possible here).
test_fresh = extract_features(test_reduced, column_id='farm_id', column_sort='time', n_jobs=10)

# Keep only the features retained for the training set. Select on
# `train_fresh.columns`: `train_selected` also carries the label column 'y',
# which does not exist in `test_fresh` and would raise a KeyError.
test_selected = test_fresh[train_fresh.columns]



Feature Extraction:   0%|          | 0/50 [00:00<?, ?it/s][A[A

Feature Extraction:   2%|▏         | 1/50 [00:20<16:54, 20.70s/it][A[A

Feature Extraction:   4%|▍         | 2/50 [00:20<11:39, 14.56s/it][A[A

Feature Extraction:   6%|▌         | 3/50 [00:21<08:01, 10.24s/it][A[A

Feature Extraction:   8%|▊         | 4/50 [00:21<05:32,  7.22s/it][A[A

Feature Extraction:  10%|█         | 5/50 [00:21<03:50,  5.12s/it][A[A

Feature Extraction:  12%|█▏        | 6/50 [00:21<02:40,  3.65s/it][A[A

Feature Extraction:  14%|█▍        | 7/50 [00:21<01:52,  2.61s/it][A[A

Feature Extraction:  16%|█▌        | 8/50 [00:22<01:19,  1.89s/it][A[A

Feature Extraction:  18%|█▊        | 9/50 [00:22<00:56,  1.39s/it][A[A

Feature Extraction:  20%|██        | 10/50 [00:22<00:41,  1.05s/it][A[A

Feature Extraction:  22%|██▏       | 11/50 [00:40<03:59,  6.14s/it][A[A

Feature Extraction:  24%|██▍       | 12/50 [00:40<02:47,  4.41s/it][A[A

Feature Extraction:  26%|██▌       | 13/5

## Save Dataset

In [43]:
from src.utils import safe_create_dir

# Versioned output directory for the extracted feature tables.
output_dir = processed_data_dir / 'ts_fresh' / dataset_version
safe_create_dir(output_dir)

# Write each split next to the other, train first.
for csv_name, feature_frame in (('train.csv', train_selected), ('test.csv', test_selected)):
    feature_frame.to_csv(output_dir / csv_name)