In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation notices from the libraries used below, so consider
# narrowing the filter if behaviour changes after an upgrade.
warnings.simplefilter('ignore')

# Create Wide-form Dataset


In [3]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from config import interim_data_dir, processed_data_dir

from src.utils import safe_create_dir

# Version tag of the dataset: it selects the input sub-directory here and is
# reused to build the output directory when the wide tables are saved.
dataset_version = 'v4.1'

# Directory from which train.csv and test.csv are read.
input_dir = processed_data_dir / 'VI_datasets' / dataset_version

## Load VI Dataset (longform)

In [4]:
def impute_nan_inf(df):
    """
    Impute bad values (NaN and +/-inf) using the per-timestamp median.

    Every numeric column is imputed: a missing or infinite entry is replaced
    by the median of that column over all rows that share the same ``time``
    value (i.e. over all farms at that timestamp). Non-numeric columns
    (for example ``time`` itself or a string label) are never imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form table containing a ``time`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns; the input is not modified.

    Notes
    -----
    If a column has no valid value at all for some timestamp, its median is
    NaN and those entries stay NaN.

    TODO: This can be improved
    """

    # Replace all NaN variants with np.nan. ``replace`` returns a new frame,
    # so the assignments below never touch the caller's data.
    df = df.replace([np.nan, None, np.inf, -np.inf], np.nan)

    # Only numeric columns have a meaningful median. Taking the median of the
    # string columns raises TypeError on recent pandas versions, so they are
    # excluded explicitly. The grouping key is excluded as well.
    numeric_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != 'time'
    ]
    if df.empty or not numeric_cols:
        return df

    # One row per input row, holding the median of that row's timestamp group.
    medians = df.groupby('time')[numeric_cols].transform('median')

    # Fill only the missing cells; valid values are left untouched.
    df[numeric_cols] = df[numeric_cols].fillna(medians)

    return df

# Read the long-form train/test tables (first CSV column is the row index)
# and impute their bad values straight away.
train_path = input_dir / 'train.csv'
test_path = input_dir / 'test.csv'

train_df = impute_nan_inf(pd.read_csv(train_path, index_col=0))
test_df = impute_nan_inf(pd.read_csv(test_path, index_col=0))

In [5]:
def long_to_wide(df):
    """
    Pivot a long-form table (one row per farm and timestamp) into wide form
    (one row per farm).

    Every feature column is spread into one column per timestamp, named
    ``<feature>_<time>``. If a label column ``y`` is present it is not
    pivoted; a single ``y`` column holding the first label seen for each
    farm is appended as the last column instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form table with ``farm_id`` and ``time`` columns, optionally a
        ``y`` label column; all remaining columns are treated as features.

    Returns
    -------
    pandas.DataFrame
        Wide table indexed by ``farm_id`` with flat (single-level) column
        names. The input frame is not modified.

    Raises
    ------
    ValueError
        Raised by ``pandas.pivot`` when a (farm_id, time) pair occurs more
        than once.
    """
    has_label = 'y' in df

    # Keep the label out of the pivot: it is constant per farm, so spreading
    # it over timestamps only creates columns that would be discarded again.
    features = df.drop(columns='y') if has_label else df

    wide = pd.pivot(features, index='farm_id', columns='time')

    # Flatten the (feature, time) MultiIndex into "<feature>_<time>" names.
    # str() keeps this working when ``time`` holds non-string values.
    wide.columns = ['_'.join(map(str, col_pair)) for col_pair in wide.columns]

    if has_label:
        # First label encountered for each farm, indexed by farm_id.
        labels = df.drop_duplicates(subset='farm_id').set_index('farm_id')['y']
        # Plain column assignment aligns on the farm_id index. It is done
        # after flattening because joining a single-level Series onto
        # MultiIndex columns is rejected by recent pandas versions.
        wide['y'] = labels

    return wide
    

In [6]:
# Reshape both splits from long form to one row per farm.
train_df_wide, test_df_wide = (
    long_to_wide(long_frame) for long_frame in (train_df, test_df)
)

In [7]:
# Preview the first rows of the wide training table to check the flattened
# "<feature>_<time>" column names and the trailing label column.
train_df_wide.head()

Unnamed: 0_level_0,ndvi_mean_2017-01-01,ndvi_mean_2017-01-31,ndvi_mean_2017-02-10,ndvi_mean_2017-03-12,ndvi_mean_2017-03-22,ndvi_mean_2017-05-31,ndvi_mean_2017-06-20,ndvi_mean_2017-07-10,ndvi_mean_2017-07-15,ndvi_mean_2017-08-04,...,band_12_std_2017-02-10,band_12_std_2017-03-12,band_12_std_2017-03-22,band_12_std_2017-05-31,band_12_std_2017-06-20,band_12_std_2017-07-10,band_12_std_2017-07-15,band_12_std_2017-08-04,band_12_std_2017-08-19,y
farm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.671766,0.718396,0.746055,0.668629,0.659258,0.247449,0.234662,0.246357,0.278753,0.217553,...,533.618506,490.728297,455.879587,815.750994,897.448315,830.92914,918.824975,716.475023,977.449066,Vineyard
2,0.682275,0.702713,0.802193,0.457887,0.49786,0.712684,0.671382,0.625462,0.666805,0.380044,...,464.616491,736.054274,499.812831,439.24906,522.862134,541.199592,554.839571,655.437723,842.016789,Pecan
3,0.528756,0.657075,0.668956,0.670401,0.643311,0.532119,0.361842,0.261191,0.274244,0.177425,...,458.616322,364.423058,369.769554,452.37406,590.37768,640.320051,676.874166,537.452323,866.403363,Vineyard
4,0.674225,0.746993,0.747719,0.723733,0.632834,0.376561,0.317186,0.271709,0.297141,0.16645,...,434.587467,342.331664,381.795574,615.638041,674.717005,692.681227,733.771024,744.607236,904.86208,Vineyard
7,0.699023,0.786059,0.808145,0.676815,0.682761,0.544508,0.50878,0.478193,0.517747,0.391577,...,416.870414,511.503689,450.955994,510.101591,633.729514,648.671586,670.512631,599.020332,791.042767,Vineyard


## Save Dataset

In [8]:
from src.utils import safe_create_dir

# Write both wide tables under a directory named after the dataset version.
output_dir = processed_data_dir / 'wide_datasets' / dataset_version
safe_create_dir(output_dir)

for wide_frame, file_name in (
    (train_df_wide, 'train.csv'),
    (test_df_wide, 'test.csv'),
):
    wide_frame.to_csv(output_dir / file_name)