In [8]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell execution, so edits to project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation/future warnings raised by the cells below.
warnings.simplefilter('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Create Wide-form Dataset


In [9]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from config import interim_data_dir, processed_data_dir

from src.utils import safe_create_dir

# Version tag of the dataset; used as the sub-directory name when building
# the input path here and the output path in the save cell.
dataset_version = 'v5'

# Directory holding the long-form vegetation-index train/test CSV files.
input_dir = processed_data_dir / 'VI_datasets' / dataset_version

## Load VI Dataset (long-form)

In [10]:
def impute_nan_inf(df):
    """
    Impute bad values (nan and inf) in the numeric columns using
    the median of that feature on that time stamp for all farms.

    Parameters
    ----------
    df : pd.DataFrame
        Long-form frame containing a 'time' column that defines the groups.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is left untouched). Non-numeric columns are
        never imputed. A value stays NaN when every row of its time-stamp
        group is missing for that feature.

    TODO: This can be improved
    """

    # replace all nans variants with np.nan
    df = df.replace([np.nan, None, np.inf, -np.inf], np.nan)

    # Only numeric features have a median; taking the median over string
    # columns (e.g. 'time' or the label) raises TypeError on pandas >= 2.0.
    numeric_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != 'time'
    ]
    if not numeric_cols:
        return df

    # Per-row median of the row's time-stamp group, aligned to df's index,
    # computed in one vectorised pass instead of a Python loop over groups.
    medians = df.groupby('time')[numeric_cols].transform('median')
    df[numeric_cols] = df[numeric_cols].fillna(medians)

    return df

# Read each long-form split (first CSV column is the row index) and impute
# its nan/inf values straight away.
train_df = impute_nan_inf(pd.read_csv(input_dir/'train.csv', index_col=0))
test_df = impute_nan_inf(pd.read_csv(input_dir/'test.csv', index_col=0))

In [15]:
# Preview the first rows of the imputed long-form training frame.
train_df.head()

Unnamed: 0,farm_id,time,ndvi_mean,ndvi_median,cvi_mean,cvi_median,evi_mean,evi_median,arvi2_mean,arvi2_median,...,band_7_median,band_8_mean,band_8_median,band_8a_mean,band_8a_median,band_11_mean,band_11_median,band_12_mean,band_12_median,y
0,1,2017-01-01,0.671766,0.692653,3.001651,0.947159,2.474928,0.0,0.605966,0.630404,...,0.0,1774.6,0.0,1945.271429,0.0,986.121429,0.0,496.128571,0.0,Vineyard
1,1,2017-01-31,0.718396,0.729027,2.744553,1.052384,2.208423,0.0,0.660523,0.672961,...,0.0,1878.164286,0.0,1941.592857,0.0,992.671429,0.0,521.65,0.0,Vineyard
2,1,2017-02-10,0.746055,0.755768,1.97707,1.065654,2.510243,0.0,0.692884,0.704249,...,0.0,2199.185714,0.0,2474.121429,0.0,1106.557143,0.0,508.7,0.0,Vineyard
3,1,2017-03-12,0.668629,0.684211,2.180335,0.865489,3.435664,0.0,0.602296,0.620526,...,0.0,1573.342857,0.0,1719.757143,0.0,898.8,0.0,466.95,0.0,Vineyard
4,1,2017-03-22,0.659258,0.673052,2.783775,1.226074,4.197762,0.0,0.591332,0.60747,...,0.0,1549.492857,0.0,1729.335714,0.0,871.714286,0.0,433.728571,0.0,Vineyard


In [11]:
def long_to_wide(df):
    """
    Pivot a long-form frame (one row per farm and time stamp) into a
    wide-form frame with one row per farm.

    Parameters
    ----------
    df : pd.DataFrame
        Long-form frame with 'farm_id' and 'time' columns, one column per
        feature and, optionally, a label column 'y'.

    Returns
    -------
    pd.DataFrame
        Frame indexed by 'farm_id' with one column per (feature, time) pair
        named '<feature>_<time>'. When 'y' is present in the input, a single
        'y' column holding the first label seen for each farm is appended last.
    """
    has_labels = 'y' in df.columns

    # Pivot the features only; the label is constant per farm and must not be
    # replicated once per time stamp.
    features = df.drop(columns='y') if has_labels else df
    df_wide = pd.pivot(features, index='farm_id', columns='time')

    # Flatten the (feature, time) MultiIndex before attaching the label:
    # joining a flat Series onto two-level columns is rejected by current pandas.
    # str() lets non-string time values (e.g. Timestamps) be joined as well.
    df_wide.columns = ['_'.join(map(str, col_pair)) for col_pair in df_wide.columns]

    if has_labels:
        # First label encountered for each farm, indexed by farm_id.
        labels = df.drop_duplicates('farm_id').set_index('farm_id')['y']
        # Column assignment aligns on the farm_id index.
        df_wide['y'] = labels

    return df_wide
    

In [12]:
# Pivot both splits from long-form to wide-form, train first, then test.
train_df_wide, test_df_wide = map(long_to_wide, (train_df, test_df))

In [13]:
# Preview the first rows of the wide-form training frame.
train_df_wide.head()

Unnamed: 0_level_0,ndvi_mean_2017-01-01,ndvi_mean_2017-01-31,ndvi_mean_2017-02-10,ndvi_mean_2017-03-12,ndvi_mean_2017-03-22,ndvi_mean_2017-05-31,ndvi_mean_2017-06-20,ndvi_mean_2017-07-10,ndvi_mean_2017-07-15,ndvi_mean_2017-08-04,...,band_12_median_2017-02-10,band_12_median_2017-03-12,band_12_median_2017-03-22,band_12_median_2017-05-31,band_12_median_2017-06-20,band_12_median_2017-07-10,band_12_median_2017-07-15,band_12_median_2017-08-04,band_12_median_2017-08-19,y
farm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.671766,0.718396,0.746055,0.668629,0.659258,0.247449,0.234662,0.246357,0.278753,0.217553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vineyard
2,0.682275,0.702713,0.802193,0.457887,0.49786,0.712684,0.671382,0.625462,0.666805,0.380044,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pecan
3,0.528756,0.657075,0.668956,0.670401,0.643311,0.532119,0.361842,0.261191,0.274244,0.177425,...,876.0,696.5,703.5,770.5,994.5,1087.0,1138.5,940.5,1637.5,Vineyard
4,0.674225,0.746993,0.747719,0.723733,0.632834,0.376561,0.317186,0.271709,0.297141,0.16645,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vineyard
7,0.699023,0.786059,0.808145,0.676815,0.682761,0.544508,0.50878,0.478193,0.517747,0.391577,...,746.5,942.0,825.0,945.5,1165.5,1184.0,1199.5,1070.5,1532.0,Vineyard


## Save Dataset

In [14]:
from src.utils import safe_create_dir

# Destination directory for the wide-form CSVs of this dataset version.
output_dir = processed_data_dir / 'wide_datasets' / dataset_version
safe_create_dir(output_dir)

# Write each split next to the other, train first, then test.
for file_name, wide_frame in (('train.csv', train_df_wide), ('test.csv', test_df_wide)):
    wide_frame.to_csv(output_dir/file_name)