In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Silence every warning category for the whole kernel session.
# NOTE(review): this also hides pandas Deprecation/FutureWarnings raised by
# the cells below, so API breakage on a library upgrade will surface as
# errors without prior notice.
warnings.simplefilter('ignore')

# Create Wide-form Dataset


In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from config import interim_data_dir, processed_data_dir

from src.utils import safe_create_dir

# Version tag of the dataset; used as the sub-directory name for both the
# long-form input read here and the wide-form output written at the end.
dataset_version = 'v8'

# Directory holding the long-form train.csv / test.csv for this version.
input_dir = processed_data_dir / 'VI_datasets' / dataset_version

## Load VI Dataset (longform)

In [3]:
def impute_nan_inf(df):
    """
    Impute bad values (NaN and +/-inf) in a long-form dataset.

    Every missing or infinite numeric entry is replaced with the median
    of that feature over all rows (farms) sharing the same ``time`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form frame with a ``time`` column. Non-numeric columns
        (e.g. the ``y`` label) are never filled with a median.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched. An entry stays NaN when
        its time stamp has no finite value at all for that feature.

    TODO: This can be improved
    """

    # replace all nan variants with np.nan
    df = df.replace([np.nan, None, np.inf, -np.inf], np.nan)

    # Medians are only defined for numeric data. Restricting the computation
    # to numeric columns avoids taking a median over string columns such as
    # 'time' and 'y' (which newer pandas versions reject with a TypeError).
    numeric_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != 'time'
    ]
    if df.empty or not numeric_cols:
        return df

    # Per-row median of each feature within the row's time stamp, aligned to
    # df's index, so all gaps are filled in one vectorised step instead of
    # writing back group by group.
    medians = df.groupby('time')[numeric_cols].transform('median')
    df[numeric_cols] = df[numeric_cols].fillna(medians)

    return df

# Read each split and impute its bad values in a single expression, so
# train_df / test_df only ever name the cleaned frames.
train_df = impute_nan_inf(pd.read_csv(input_dir / 'train.csv', index_col=0))
test_df = impute_nan_inf(pd.read_csv(input_dir / 'test.csv', index_col=0))

In [4]:
train_df.head()

Unnamed: 0,farm_id,time,reip_mean,reip_median,reip_std,bri_mean,bri_median,bri_std,ipvi_mean,ipvi_median,...,gbndvi_mean,gbndvi_median,gbndvi_std,mnsi_mean,mnsi_median,mnsi_std,msbi_mean,msbi_median,msbi_std,y
0,1,2017-01-01,724.649613,717.385581,12.226598,9.620789e-08,9.386547e-08,1.626602e-08,0.430733,0.433335,...,0.352311,0.36362,0.055971,855.420664,0.0,897.213657,2606.382007,0.0,2724.941007,Vineyard
1,1,2017-01-31,723.528583,716.55992,11.197213,1.204282e-07,1.194364e-07,1.727727e-08,0.44937,0.452082,...,0.411538,0.422254,0.046439,908.336686,0.0,951.899592,2666.004071,0.0,2786.599432,Vineyard
2,1,2017-02-10,731.982657,725.883626,16.406339,6.848604e-08,6.636422e-08,7.633275e-09,0.445108,0.448902,...,0.428149,0.438856,0.043234,1029.193979,0.0,1080.842066,3122.322521,0.0,3264.75752,Vineyard
3,1,2017-03-12,721.881252,716.976952,9.816248,1.170843e-07,1.168176e-07,1.857573e-08,0.427278,0.432228,...,0.32858,0.343184,0.062545,753.733093,0.0,790.747738,2333.032314,0.0,2439.22883,Vineyard
4,1,2017-03-22,720.824076,716.020232,9.010497,1.133601e-07,1.127421e-07,1.520321e-08,0.422748,0.424777,...,0.309208,0.315113,0.048871,740.986736,0.0,776.110971,2322.442543,0.0,2426.135581,Vineyard


## Ts-fresh

In [5]:
from tsfresh import extract_relevant_features

In [10]:
# Vegetation-index name fragments used to pick a reduced set of columns.
reduced_features_list = ['ndvi','reip','dartt3']
# The fragments are OR-ed into one regex and matched anywhere in the column
# name (str.contains is an unanchored search).
# NOTE(review): per the recorded output below, 'ndvi' therefore also selects
# the 'gbndvi_*' columns, and 'dartt3' matches no column at all — confirm
# both are intended.
reduced_cols_list = list(train_df.columns[train_df.columns.str.contains('|'.join(reduced_features_list))] )
reduced_cols_list

['reip_mean',
 'reip_median',
 'reip_std',
 'ndvi_mean',
 'ndvi_median',
 'ndvi_std',
 'gbndvi_mean',
 'gbndvi_median',
 'gbndvi_std']

In [11]:
# One label per farm, indexed by farm_id: the first distinct `y` value found
# among that farm's rows.
# NOTE(review): presumably `y` is constant within a farm — verify; any
# additional distinct values are silently ignored here.
labels = train_df.groupby('farm_id').apply(lambda x: x.y.unique()[0])

# Long-form frame restricted to the id column, the time column and the
# reduced feature columns selected above.
train_reduced = train_df[['farm_id','time'] + reduced_cols_list]
train_reduced.head()

Unnamed: 0,farm_id,time,reip_mean,reip_median,reip_std,ndvi_mean,ndvi_median,ndvi_std,gbndvi_mean,gbndvi_median,gbndvi_std
0,1,2017-01-01,724.649613,717.385581,12.226598,0.671766,0.692653,0.065668,0.352311,0.36362,0.055971
1,1,2017-01-31,723.528583,716.55992,11.197213,0.718396,0.729027,0.046242,0.411538,0.422254,0.046439
2,1,2017-02-10,731.982657,725.883626,16.406339,0.746055,0.755768,0.038175,0.428149,0.438856,0.043234
3,1,2017-03-12,721.881252,716.976952,9.816248,0.668629,0.684211,0.065733,0.32858,0.343184,0.062545
4,1,2017-03-22,720.824076,716.020232,9.010497,0.659258,0.673052,0.055294,0.309208,0.315113,0.048871


In [12]:
# tsfresh feature extraction with `labels` as the target: one time series per
# farm_id, ordered by time. The recorded run below took a little over six
# minutes, so consider caching the result if this cell is re-run often.
train_fresh = extract_relevant_features(train_reduced, y=labels, column_id='farm_id', column_sort='time')

Feature Extraction: 100%|██████████| 30/30 [06:14<00:00,  3.79s/it]
 'gbndvi_mean__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"rvalue"'
 'gbndvi_mean__agg_linear_trend__f_agg_"max"__chunk_len_50__attr_"slope"'
 ... 'reip_std__fft_coefficient__coeff_9__attr_"imag"'
 'reip_std__fft_coefficient__coeff_9__attr_"real"'
 'reip_std__spkt_welch_density__coeff_8'] did not have any finite values. Filling with zeros.


In [13]:
train_fresh.head()

variable,"ndvi_median__linear_trend__attr_""rvalue""","ndvi_mean__linear_trend__attr_""rvalue""","reip_mean__linear_trend__attr_""rvalue""","ndvi_median__linear_trend__attr_""pvalue""","ndvi_mean__linear_trend__attr_""pvalue""","reip_std__linear_trend__attr_""rvalue""","reip_mean__linear_trend__attr_""pvalue""","ndvi_mean__agg_autocorrelation__f_agg_""median""__maxlag_40","ndvi_median__agg_autocorrelation__f_agg_""median""__maxlag_40","reip_mean__agg_autocorrelation__f_agg_""median""__maxlag_40",...,"ndvi_std__augmented_dickey_fuller__attr_""teststat""","ndvi_std__augmented_dickey_fuller__attr_""pvalue""","gbndvi_mean__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","gbndvi_median__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_9__w_5","ndvi_mean__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_5__w_5","reip_mean__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4",gbndvi_mean__energy_ratio_by_chunks__num_segments_10__segment_focus_4,reip_median__has_duplicate_min,gbndvi_mean__number_peaks__n_5,ndvi_std__large_standard_deviation__r_0.30000000000000004
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.871779,-0.867431,-0.86984,0.000469,0.000542,-0.871224,0.000501,-0.777269,-0.79686,-0.772142,...,-0.183854,0.940419,-1.337325,9.657427,1.004849,50.578045,0.243072,0.0,0.0,1.0
2,-0.514319,-0.538217,-0.63368,0.105531,0.087643,-0.59592,0.036313,-0.21415,-0.252416,-0.079867,...,-3.541692,0.006979,0.021396,0.394899,1.443421,7.538752,0.126539,0.0,0.0,1.0
3,-0.880835,-0.884816,-0.877775,0.000342,0.000295,-0.829235,0.000382,-0.657397,-0.65185,-0.714854,...,-2.985832,0.036239,0.091526,0.001376,1.133652,5.234283,0.000762,0.0,0.0,0.0
4,-0.931085,-0.93119,-0.921549,3.1e-05,3.1e-05,-0.902284,5.5e-05,-0.815178,-0.818135,-0.726105,...,-2.585181,0.096099,1.202407,4.73608,1.101421,8.15967,0.000669,0.0,0.0,0.0
7,-0.930561,-0.932319,-0.873989,3.2e-05,2.9e-05,-0.841204,0.000436,-0.636413,-0.617364,-0.578332,...,-2.095908,0.246163,-0.006163,0.268544,1.387828,24.324183,0.047885,0.0,0.0,1.0


In [14]:
labels.head()

farm_id
1    Vineyard
2       Pecan
3    Vineyard
4    Vineyard
7    Vineyard
dtype: object

In [8]:
def long_to_wide(df):
    """
    Pivot a long-form dataset (one row per farm and time stamp) into wide
    form (one row per farm).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``farm_id`` and ``time`` columns. Every other column
        is treated as a feature, except an optional ``y`` label column.
        Each (farm_id, time) pair may appear only once; the underlying
        pivot rejects duplicates.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``farm_id``, with one ``<feature>_<time>`` column per
        feature/time combination. When ``df`` has a ``y`` column, a single
        ``y`` column is appended last, holding the label of the first row
        listed for each farm.
    """
    has_labels = 'y' in df

    # The label is attached once per farm further down, so keep it out of
    # the pivot instead of pivoting it and dropping it again.
    features = df.drop(columns='y') if has_labels else df
    df_wide = features.pivot(index='farm_id', columns='time')

    # Flatten the (feature, time) column pairs into "feature_time" names.
    # Formatting (rather than str.join) also works when `time` is not a
    # string, e.g. an integer step or a Timestamp.
    df_wide.columns = [f'{feature}_{time}' for feature, time in df_wide.columns]

    if has_labels:
        # First label seen per farm, aligned to the wide frame by farm_id.
        # This is done after flattening so a single-level label Series is
        # never merged against two-level columns.
        labels = df.drop_duplicates(subset='farm_id').set_index('farm_id')['y']
        df_wide['y'] = labels.reindex(df_wide.index)

    return df_wide
    

In [None]:
# Pivot both splits to one row per farm. train_df carries the `y` label
# column (see its preview above), so the wide training frame keeps a `y`
# column as well.
train_df_wide = long_to_wide(train_df)
test_df_wide = long_to_wide(test_df)

In [None]:
train_df_wide.head()

## Save Dataset

In [None]:
# NOTE(review): safe_create_dir is already imported in the setup cell at the
# top of the notebook; this repeat is redundant but harmless.
from src.utils import safe_create_dir

# Wide-form output lives next to the long-form input, under the same
# dataset version tag.
output_dir = processed_data_dir / 'wide_datasets' / dataset_version
safe_create_dir(output_dir)

# Write both splits; the farm_id index is written as the first CSV column.
train_df_wide.to_csv(output_dir/'train.csv')
test_df_wide.to_csv(output_dir/'test.csv')