# Feature Engineering

In [15]:
# Load the Dataframe from Nootbook 01

%store -r df_model df_store

In [16]:
#Data handling

import pandas as pd
import numpy as np
import calendar
import datetime

### Feature Engineering with "df_store" Dataframe

Feature Engineering with "promointerval"

In [17]:
# splitting 'Promointerval' string into individual strings and get the month

prom_interval = df_store['promointerval'].str.split(',').apply(pd.Series)

In [18]:
prom_interval.columns = prom_interval.columns.map(lambda x: str(x) + '_prominterval')
df_store = df_store.join(prom_interval)

In [19]:
def monthToNum(value):
    if(value=='Sept'):
        value='Sep'
    return list(calendar.month_abbr).index(value)

#mapping month abbr to month number

df_store['0_prominterval'] = df_store['0_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['1_prominterval'] = df_store['1_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['2_prominterval'] = df_store['2_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['3_prominterval'] = df_store['3_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)

Feature Engineering with "promo" 

In [20]:
promo = []
for index, value in df_store[['promo2sinceweek', 'promo2sinceyear']].iterrows():
    try:
        year, week = int(value['promo2sinceyear']), int(value['promo2sinceweek'])
        date = pd.to_datetime("{}-{}-01".format(year, week), format='%Y%W')
        promo.append(date)
    except:
        promo.append(np.nan)
promo = pd.to_datetime(pd.Series(promo))
promo.shape

(1115,)

In [21]:
df_store['promosince'] = promo #converted int to datetime
df_store['promosince'] = df_store.promosince.dt.strftime('%Y%m%d')

Feature Engineering with "competition" 

In [22]:
competition_open = []
for index, value in df_store[['competitionopensincemonth', 'competitionopensinceyear']].iterrows():
    try:
        year, month = int(value['competitionopensinceyear']), int(value['competitionopensincemonth'])
        date = pd.to_datetime("{}-{}-01".format(year, month), format='%Y-%m')
        competition_open.append(date)
    except:
        competition_open.append(np.nan)
competition_open = pd.Series(competition_open)
competition_open.shape

(1115,)

In [23]:
df_store['competitionopen'] = competition_open #converted int to datetime
df_store['competitionopen'] = df_store['competitionopen'].dt.strftime('%Y%m%d')

This concludes the Feature Engineering from df_store.
The newly created features are put into store_features.

In [24]:
store_features = ['store', 'storetype', 'assortment', 'competitiondistance', 'competitionopen', 
                  'promosince', '0_prominterval']

In [25]:
features_x = ['store', 'timestamp', 'dayofweek', 'open', 'promo', 'schoolholiday', 'stateholiday']
features_y = ['saleslog']

In [26]:
df_model = pd.merge(df_model, df_store[store_features], how='left', on=['store'])

In [27]:
# put new features into feature-list

features_x = list(set(features_x + store_features))

for feature in features_x:
    df_model[feature] = df_model[feature].fillna(-999) #out of range value for model

In [28]:
df_model['dateint'] = df_model.timestamp.dt.strftime('%Y%m%d').map(int) #mapping to Int
df_model['competitionopen'] = df_model.competitionopen.map(int)
df_model['promosince'] = df_model.promosince.map(int)

### Feature Engineering from "df_train" Dataframe

### Promo feature engineering


In [29]:
df_model['promotomorrow'] = df_model.promo.shift(-1)
df_model['promoyesterday'] = df_model.promo.shift(1)

In [30]:
promo_features = ['promotomorrow', 'promoyesterday']

features_x = list(set(features_x + promo_features))

In [31]:
# Sales and Customer FE

store_data_sales = df_model.groupby([df_model['store']])['sales'].sum()
store_data_customers = df_model.groupby([df_model['store']])['customers'].sum()
store_data_open = df_model.groupby([df_model['store']])['open'].count()

store_data_sales_per_day = store_data_sales / store_data_open
store_data_customers_per_day = store_data_customers / store_data_open
store_data_sales_per_customer_per_day = store_data_sales_per_day / store_data_customers_per_day

df_sales_cust = pd.merge(df_store, store_data_sales_per_day.reset_index(name='salesperday'), how='left', on=['store'])
df_sales_cust = pd.merge(df_sales_cust, store_data_customers_per_day.reset_index(name='customersperday'), how='left', on=['store'])
df_sales_cust = pd.merge(df_sales_cust, store_data_sales_per_customer_per_day.reset_index(name='salespercustomersperday'), how='left', on=['store'])

In [32]:
store_features = ['store', 'salesperday', 'customersperday', 'salespercustomersperday']

features_x = list(set(features_x + store_features))
df_model = pd.merge(df_model, df_sales_cust[store_features], how='left', on=['store'])

Feature Engineering with "holiday"

In [33]:
holidays_next_week=[]
holidays_next_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index + datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=15)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_next_week.append(school_holidays+state_holidays)
    holidays_next_week_index.append(index)
    
holidays_next_week = pd.Series(holidays_next_week)
holidays_next_week.shape

(990,)

In [34]:
holidays_this_week=[]
index_list = []
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index 
    end_range = index + datetime.timedelta(days=7)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_this_week.append(school_holidays+state_holidays)
    index_list.append(index)
    
holidays_this_week = pd.Series(holidays_this_week)
holidays_this_week.shape

(990,)

In [35]:
holidays_last_week=[]
holidays_last_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index - datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=1)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_last_week.append(school_holidays+state_holidays)
    holidays_last_week_index.append(index)
    
holidays_last_week = pd.Series(holidays_next_week)
holidays_last_week.shape

(990,)

In [36]:
temp_df = pd.DataFrame({'holidaysnextweek':holidays_next_week, 'timestamp': holidays_next_week_index})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [37]:
temp_df = pd.DataFrame({'holidaysthisweek':holidays_this_week, 'timestamp': index_list})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [38]:
temp_df = pd.DataFrame({'holidayslastweek':holidays_last_week, 'timestamp': holidays_last_week_index})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [39]:
holidays_features = ['holidaysnextweek', 'holidaysthisweek', 'holidayslastweek']

features_x = list(set(features_x + holidays_features))

In [40]:
print(df_model.shape)
df_model.head()

(885426, 30)


Unnamed: 0,store,dayofweek,sales,customers,open,promo,stateholiday,schoolholiday,timestamp,year,...,0_prominterval,dateint,promotomorrow,promoyesterday,salesperday,customersperday,salespercustomersperday,holidaysnextweek,holidaysthisweek,holidayslastweek
0,1,5,5263.0,555.0,1,1,0,1,2015-07-31,2015,...,-999.0,20150731,1.0,,4483.539204,531.390832,8.437366,4860,5481,4860
1,2,5,6064.0,625.0,1,1,0,1,2015-07-31,2015,...,1.0,20150731,1.0,1.0,4953.90051,583.998724,8.482725,4860,5481,4860
2,3,5,8314.0,821.0,1,1,0,1,2015-07-31,2015,...,1.0,20150731,1.0,1.0,6539.614268,706.541717,9.255808,4860,5481,4860
3,4,5,13995.0,1498.0,1,1,0,1,2015-07-31,2015,...,-999.0,20150731,1.0,1.0,9638.401786,1321.752551,7.292138,4860,5481,4860
4,5,5,4822.0,559.0,1,1,0,1,2015-07-31,2015,...,-999.0,20150731,1.0,1.0,4676.274711,537.34018,8.702634,4860,5481,4860


In [41]:
print(df_model.columns)
print(features_x)

Index(['store', 'dayofweek', 'sales', 'customers', 'open', 'promo',
       'stateholiday', 'schoolholiday', 'timestamp', 'year', 'month', 'day',
       'dayofyear', 'is_train', 'id', 'storetype', 'assortment',
       'competitiondistance', 'competitionopen', 'promosince',
       '0_prominterval', 'dateint', 'promotomorrow', 'promoyesterday',
       'salesperday', 'customersperday', 'salespercustomersperday',
       'holidaysnextweek', 'holidaysthisweek', 'holidayslastweek'],
      dtype='object')
['promosince', 'promotomorrow', 'store', 'holidayslastweek', 'storetype', 'dayofweek', 'timestamp', 'promo', 'schoolholiday', 'customersperday', 'salespercustomersperday', 'assortment', 'promoyesterday', 'salesperday', '0_prominterval', 'open', 'competitiondistance', 'stateholiday', 'competitionopen', 'holidaysthisweek', 'holidaysnextweek']


In [42]:
features_x = ['open', 'store', 'storetype', 'holidayslastweek', '0_prominterval', 'stateholiday', 'assortment', 'dateint', 'holidaysthisweek', 'holidaysnextweek', 'promo', 'promosince', 'dayofweek', 'competitionopen', 'schoolholiday', 'competitiondistance']

In [43]:
%store  features_x features_y df_model

Stored 'features_x' (list)
Stored 'features_y' (list)
Stored 'df_model' (DataFrame)


In [44]:
features_x

['open',
 'store',
 'storetype',
 'holidayslastweek',
 '0_prominterval',
 'stateholiday',
 'assortment',
 'dateint',
 'holidaysthisweek',
 'holidaysnextweek',
 'promo',
 'promosince',
 'dayofweek',
 'competitionopen',
 'schoolholiday',
 'competitiondistance']

In [45]:
df_model.columns


Index(['store', 'dayofweek', 'sales', 'customers', 'open', 'promo',
       'stateholiday', 'schoolholiday', 'timestamp', 'year', 'month', 'day',
       'dayofyear', 'is_train', 'id', 'storetype', 'assortment',
       'competitiondistance', 'competitionopen', 'promosince',
       '0_prominterval', 'dateint', 'promotomorrow', 'promoyesterday',
       'salesperday', 'customersperday', 'salespercustomersperday',
       'holidaysnextweek', 'holidaysthisweek', 'holidayslastweek'],
      dtype='object')