In [2]:
import feather
import pandas as pd
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder

In [60]:
from outliers import remove_outliers
from feature_gen import add_trend_features, add_penultimate_week
from feature_gen import add_window_trend_features, add_last_features

from feature_gen import add_window_mean_overall_features
from feature_gen import add_window_mean_dow_features
from feature_gen import add_window_mean_weekend_features

In [4]:
# Shop metadata: read the table, then swap every categorical column for
# integer codes (a fresh LabelEncoder is fitted per column).
df_shop_info = feather.read_dataframe('data/df_shop_info.feather')

for col in ('city_name', 'category_1', 'category_2', 'category_3'):
    df_shop_info[col] = LabelEncoder().fit_transform(df_shop_info[col])

# Keep only the columns that are later used as grouping keys.
shop_key_columns = ['city_name', 'location_id', 'category_1', 'category_2', 'category_3']
df_shop_info = df_shop_info[shop_key_columns]

In [11]:
# Load the payments frame; later cells rely on its shop_id, day and
# pays_count columns.
df_pays = feather.read_dataframe('data/df_pays_na_test.feather')

In [15]:
# Expand the shop table to one row per df_pays row. shop_id - 1 is used as a
# *positional* index, so this presumes shop ids are 1-based and follow the row
# order of df_shop_info -- TODO confirm.
# NOTE: df_shop_info is overwritten here, so running this cell a second time
# indexes into the already-expanded frame and yields different rows.
row_positions = df_pays.shop_id - 1
df_shop_info = df_shop_info.iloc[row_positions].reset_index(drop=1)

In [17]:
# Glue the shop attributes onto the payments frame column-wise. concat aligns
# the two frames on their index labels, so both are expected to share the same
# index here (df_shop_info was just reset to 0..n-1).
# NOTE: df_pays is overwritten, so re-running this cell appends the shop
# columns a second time.
df_pays = pd.concat([df_pays, df_shop_info], axis=1)

In [51]:
def group_mean(group, df=None):
    """Return the mean ``pays_count`` per (``group``, ``day``) combination.

    Parameters
    ----------
    group : str
        Name of the column to group by together with ``'day'``.
    df : pandas.DataFrame, optional
        Frame holding ``group``, ``'day'`` and ``'pays_count'`` columns.
        Defaults to the notebook-level ``df_pays`` so existing
        ``group_mean(col)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (``group``, ``day``) pair with the columns ``group``,
        ``'day'`` and ``'pays_count_<group>_mean'``.
    """
    if df is None:
        df = df_pays

    mean = df.groupby([group, 'day'])['pays_count'].mean().reset_index()
    # rename() returns a new frame; the previous ``inplace=1`` is rejected by
    # pandas versions that require ``inplace`` to be a real bool.
    return mean.rename(columns={'pays_count': 'pays_count_%s_mean' % group})

In [52]:
# Start from the payments frame and attach, for every grouping column, the
# per-(group, day) mean of pays_count computed by group_mean().
df_res = df_pays

for c in ['city_name', 'location_id', 'category_1', 'category_2', 'category_3']:
    df_group_mean = group_mean(c)
    # Spell out the join keys: without ``on`` pandas joins on every column
    # name the two frames happen to share, which silently changes if either
    # side gains an overlapping column.
    df_res = df_res.merge(df_group_mean, how='left', on=[c, 'day'])

In [56]:
# The raw grouping columns are removed once their per-group means have been
# merged in. Reassign instead of ``inplace=1``: pandas versions that validate
# ``inplace`` as a strict bool raise ValueError for the integer 1.
df_res = df_res.drop(['city_name', 'location_id', 'category_1', 'category_2', 'category_3'], axis=1)

In [72]:
# Names of the group-level mean columns that serve as feature-generation
# targets (same pattern group_mean() uses for its output column).
targets = ['pays_count_%s_mean' % key
           for key in ('city_name', 'location_id', 'category_1',
                       'category_2', 'category_3')]

# Columns that get prefixed with the target name after each feature pass.
# The list is assembled family by family from its naming pattern.
new_cols = (
    ['penultimate', 'penultimate_null', 'pays_two_weeks_ago', 'second_in_biweek']
    # overall trend (+ coefficient) for the plain / dow / weekend variants
    + [pattern % scope
       for scope in ('', '_dow', '_weekend')
       for pattern in ('trend_overall%s', 'trend_overall%s_coeff')]
    # windowed trends: value and coefficient per window size
    + [pattern % (scope, window)
       for scope, windows in (('', (2, 3, 4, 5, 6, 12, 18)),
                              ('_dow', (3, 4, 5, 6, 12, 18)),
                              ('_weekend', (3, 4, 5, 6, 12, 18)))
       for window in windows
       for pattern in ('trend%s_%d', 'trend%s_coef_%d')]
    # windowed mean / std per span
    + [pattern % (prefix, span)
       for prefix, spans in (('', (1, 2, 3)),
                             ('dow_', (2, 3)),
                             ('weekend_', (2, 3)))
       for span in spans
       for pattern in ('%smean_%d', '%sstd_%d')]
    + ['prev_biweek_last_value', 'prev_biweek_p25', 'prev_biweek_p75',
       'prev_spread']
)

In [None]:
# Build the feature columns shop by shop, once per target series.
# The recorded run of this cell took ~30 hours for 2000 shops.
shops = sorted(df_pays.shop_id.unique())

dfs = []

for i in tqdm(shops):
    # Rows of the current shop, re-indexed from 0.
    df_shop = df_res[df_res.shop_id == i].reset_index(drop=True)

    for t in targets:
        # The helpers are called without using a return value, so they
        # presumably write their columns into df_shop in place -- confirm
        # against outliers.py / feature_gen.py.
        remove_outliers(df_shop, target=t)
        add_penultimate_week(df_shop, target=t)
        add_trend_features(df_shop, target=t)
        add_window_trend_features(df_shop, target=t)
        add_window_mean_overall_features(df_shop, past_biweeks_list=[1, 2, 3], target=t)
        add_window_mean_dow_features(df_shop, past_biweeks_list=[2, 3], target=t)
        add_window_mean_weekend_features(df_shop, past_biweeks_list=[2, 3], target=t)
        add_last_features(df_shop, target=t)

        # Prefix the freshly generated columns with the target name so the
        # next target's pass can reuse the unprefixed names.
        ren_dict = {c: '%s_%s' % (t, c) for c in new_cols}
        # Reassign instead of ``inplace=1``: pandas versions that validate
        # ``inplace`` as a strict bool raise ValueError for the integer 1.
        df_shop = df_shop.rename(columns=ren_dict)

    # The target series themselves are not part of the output.
    df_shop = df_shop.drop(targets, axis=1)

    dfs.append(df_shop)

100%|██████████| 2000/2000 [30:24:26<00:00, 44.50s/it]


In [None]:
# Placeholder cell: evaluates an empty string literal and has no effect.
''

In [82]:
# Cut the first 7 * 2 * 3 = 42 rows off every per-shop frame (presumably the
# three biweeks of history the window features need -- confirm), then stack
# the remainders into one frame with a fresh 0..n-1 index.
truncated_dfs = [shop_frame.iloc[7 * 2 * 3:] for shop_frame in dfs]

df_features = pd.concat(truncated_dfs).reset_index(drop=1)

In [None]:
# Persist the assembled feature table in feather format.
feather.write_dataframe(df_features, 'features/store_series.feather')