In [1]:
# import packages for data manipulation
import pandas as pd
import numpy as np

# Plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import janitor

from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame, roll_time_series
from sklearn.ensemble import AdaBoostRegressor
from tsfresh.utilities.dataframe_functions import impute
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('tsfresh').setLevel(logging.ERROR)

In [2]:
df = pd.read_csv('../avocado.csv')

In [3]:
# Removing index column
df.drop('Unnamed: 0', axis=1, inplace=True)

# Removing records with TotalUS region, assuming it is nust the average of all other regions
df = df.loc[df.region!='TotalUS'].reset_index(drop=True)

# Making date to datetime and sorting chrinologically
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['region','Date'])
df = df.clean_names()

In [4]:
df_conventional = df.loc[df.type=='conventional']

In [5]:
def tsfresh_feature_extraction(region,avocado_type):
    temp = df.loc[(df.region==region)&(df.type==avocado_type)].reset_index(drop=True)
    df_shift, y = make_forecasting_frame(temp.averageprice, kind="price", max_timeshift=52, rolling_direction=1)
    X = extract_features(df_shift, column_id="id", column_sort="time", column_value="value", impute_function=impute,
                     show_warnings=False)
    X = X.loc[:, X.apply(pd.Series.nunique) != 1] 
    X["region"] = region
    X['date'] = temp.date
    X['type'] = avocado_type
    X = X.iloc[1:,]
    y = y.iloc[1: ]
    X = X.reset_index(drop=True)
    return X,y

In [6]:
X = []
y = []
for avocado_type in ['conventional','organic']:
    for region in list(set(df.region)):
        x_temp,y_temp = tsfresh_feature_extraction(region,avocado_type)
        X.append(x_temp)
        y.append(y_temp)

Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.19it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.54it/s]
Feature Extraction: 100%|██████████| 19/19 [00:04<00:00,  4.51it/s]
Feature Extraction: 100%|██████████| 19/19 [00:04<00:00,  4.24it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.58it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.43it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.73it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.58it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.59it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.52it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.70it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.69it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.46it/s]
Feature Extraction: 100%|██████████| 19/19 [00:05<00:00,  3.70it/s]
Feature Extraction: 100%|██████████| 19/19 [00:0

In [7]:
train = pd.concat(X)
train_y = pd.concat(y)

In [8]:
train.shape, train_y.shape, df.shape

((17699, 437), (17699,), (17911, 13))

In [9]:
train.head()

Unnamed: 0,date,region,type,value__abs_energy,value__absolute_sum_of_changes,"value__agg_autocorrelation__f_agg_""mean""__maxlag_40","value__agg_autocorrelation__f_agg_""median""__maxlag_40","value__agg_autocorrelation__f_agg_""var""__maxlag_40","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""",...,value__symmetry_looking__r_0.75,value__symmetry_looking__r_0.8,value__symmetry_looking__r_0.8500000000000001,value__symmetry_looking__r_0.9,value__symmetry_looking__r_0.9500000000000001,value__time_reversal_asymmetry_statistic__lag_1,value__time_reversal_asymmetry_statistic__lag_2,value__time_reversal_asymmetry_statistic__lag_3,value__value_count__value_1,value__variance
0,2015-01-18,NorthernNewEngland,conventional,2.2261,0.01,-1.0,-1.0,0.0,1.24381,0.415434,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.5e-05
1,2015-01-25,NorthernNewEngland,conventional,3.4805,0.07,-0.677326,-0.677326,0.282999,1.24381,0.415434,...,1.0,1.0,1.0,1.0,1.0,0.161014,0.0,0.0,0.0,0.000956
2,2015-02-01,NorthernNewEngland,conventional,4.6041,0.13,-0.206865,-0.392954,0.170706,1.24381,0.415434,...,1.0,1.0,1.0,1.0,1.0,0.080507,0.0,0.0,0.0,0.000769
3,2015-02-08,NorthernNewEngland,conventional,5.7277,0.13,-0.113932,-0.019531,0.176779,1.24381,0.415434,...,1.0,1.0,1.0,1.0,1.0,0.007455,0.023632,0.0,0.0,0.00064
4,2015-02-15,NorthernNewEngland,conventional,6.7886,0.16,-0.089118,-0.055882,0.209648,1.24381,0.415434,...,1.0,1.0,1.0,1.0,1.0,-0.011024,-0.021415,0.0,0.0,0.000756


In [10]:
train.isnull().sum()

date                                                  0
region                                                0
type                                                  0
value__abs_energy                                     0
value__absolute_sum_of_changes                        0
                                                   ... 
value__time_reversal_asymmetry_statistic__lag_1       0
value__time_reversal_asymmetry_statistic__lag_2       0
value__time_reversal_asymmetry_statistic__lag_3       0
value__value_count__value_1                        9182
value__variance                                       0
Length: 437, dtype: int64

In [11]:
train = train.reset_index(drop=True)

In [12]:
import pickle
pickle.dump(train,open('tsfresh_features.p','wb'))

In [13]:
na_df = pd.DataFrame(train.isnull().sum().reset_index())
na_df.columns = ['feature','num_NA']

In [14]:
drop_features = na_df.loc[na_df.num_NA>0].feature.tolist()

In [15]:
train.drop(drop_features,axis=1,inplace=True)

In [16]:
import pickle
pickle.dump(train,open('tsfresh_features.p','wb'))