# Feature Engineering

In [26]:
# Load the Dataframe from Nootbook 01

%store -r df_model df_store

In [27]:
#Data handling

import pandas as pd
import numpy as np
import calendar
import datetime

### Feature Engineering with "df_store" Dataframe

Feature Engineering with "promointerval"

In [28]:
# splitting 'Promointerval' string into individual strings and get the month

prom_interval = df_store['promointerval'].str.split(',').apply(pd.Series)

In [29]:
prom_interval.columns = prom_interval.columns.map(lambda x: str(x) + '_prominterval')
df_store = df_store.join(prom_interval)

In [30]:
def monthToNum(value):
    if(value=='Sept'):
        value='Sep'
    return list(calendar.month_abbr).index(value)

#mapping month abbr to month number

df_store['0_prominterval'] = df_store['0_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['1_prominterval'] = df_store['1_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['2_prominterval'] = df_store['2_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['3_prominterval'] = df_store['3_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)

Feature Engineering with "promo" 

In [31]:
promo = []
for index, value in df_store[['promo2sinceweek', 'promo2sinceyear']].iterrows():
    try:
        year, week = int(value['promo2sinceyear']), int(value['promo2sinceweek'])
        date = pd.to_datetime("{}-{}-01".format(year, week), format='%Y%W')
        promo.append(date)
    except:
        promo.append(np.nan)
promo = pd.to_datetime(pd.Series(promo))
promo.shape

(1115,)

In [32]:
df_store['promosince'] = promo #converted int to datetime
df_store['promosince'] = df_store.promosince.dt.strftime('%Y%m%d')

Feature Engineering with "competition" 

In [33]:
competition_open = []
for index, value in df_store[['competitionopensincemonth', 'competitionopensinceyear']].iterrows():
    try:
        year, month = int(value['competitionopensinceyear']), int(value['competitionopensincemonth'])
        date = pd.to_datetime("{}-{}-01".format(year, month), format='%Y-%m')
        competition_open.append(date)
    except:
        competition_open.append(np.nan)
competition_open = pd.Series(competition_open)
competition_open.shape

(1115,)

In [34]:
df_store['competitionopen'] = competition_open #converted int to datetime
df_store['competitionopen'] = df_store['competitionopen'].dt.strftime('%Y%m%d')

This concludes the Feature Engineering from df_store.
The newly created features are put into store_features.

In [35]:
store_features = ['store', 'storetype', 'assortment', 'competitiondistance', 'competitionopen', 
                  'promosince', '0_prominterval']

In [36]:
features_x = ['store', 'timestamp', 'dayofweek', 'open', 'promo', 'schoolholiday', 'stateholiday']
features_y = ['saleslog']

In [37]:
df_model = pd.merge(df_model, df_store[store_features], how='left', on=['store'])

In [38]:
# put new features into feature-list
features_x = list(set(features_x + store_features))

for feature in features_x:
    df_model[feature] = df_model[feature].fillna(-999) #out of range value for model

In [39]:
df_model['dateint'] = df_model.timestamp.dt.strftime('%Y%m%d').map(int) #mapping to Int
df_model['competitionopen'] = df_model.competitionopen.map(int)
df_model['promosince'] = df_model.promosince.map(int)

### Feature Engineering from "df_train" Dataframe

Feature Engineering with "holiday"

In [40]:
holidays_next_week=[]
holidays_next_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index + datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=15)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_next_week.append(school_holidays+state_holidays)
    holidays_next_week_index.append(index)
    
holidays_next_week = pd.Series(holidays_next_week)
holidays_next_week.shape

(990,)

In [41]:
holidays_this_week=[]
index_list = []
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index 
    end_range = index + datetime.timedelta(days=7)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_this_week.append(school_holidays+state_holidays)
    index_list.append(index)
    
holidays_this_week = pd.Series(holidays_this_week)
holidays_this_week.shape

(990,)

In [42]:
holidays_last_week=[]
holidays_last_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index - datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=1)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_last_week.append(school_holidays+state_holidays)
    holidays_last_week_index.append(index)
    
holidays_last_week = pd.Series(holidays_next_week)
holidays_last_week.shape

(990,)

In [43]:
temp_df = pd.DataFrame({'holidaysnextweek':holidays_next_week, 'timestamp': holidays_next_week_index})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [44]:
temp_df = pd.DataFrame({'holidaysthisweek':holidays_this_week, 'timestamp': index_list})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [45]:
temp_df = pd.DataFrame({'holidayslastweek':holidays_last_week, 'timestamp': holidays_last_week_index})
df_model_afe = pd.merge(df_model, temp_df, on=['timestamp'])

In [46]:
holidays_features = ['holidaysnextweek', 'holidaysthisweek', 'holidayslastweek']

features_x = list(set(features_x + holidays_features))

In [47]:
print(df_model_afe.shape)
df_model.head()

(885426, 25)


Unnamed: 0,store,dayofweek,sales,customers,open,promo,stateholiday,schoolholiday,timestamp,year,...,id,storetype,assortment,competitiondistance,competitionopen,promosince,0_prominterval,dateint,holidaysnextweek,holidaysthisweek
0,1,5,5263.0,555.0,1,1,0,1,2015-07-31,2015,...,,2,0,1270.0,20080901,-999,-999.0,20150731,4860,5481
1,2,5,6064.0,625.0,1,1,0,1,2015-07-31,2015,...,,0,0,570.0,20071101,-999,1.0,20150731,4860,5481
2,3,5,8314.0,821.0,1,1,0,1,2015-07-31,2015,...,,0,0,14130.0,20061201,-999,1.0,20150731,4860,5481
3,4,5,13995.0,1498.0,1,1,0,1,2015-07-31,2015,...,,2,2,620.0,20090901,-999,-999.0,20150731,4860,5481
4,5,5,4822.0,559.0,1,1,0,1,2015-07-31,2015,...,,0,0,29910.0,20150401,-999,-999.0,20150731,4860,5481


In [48]:
print(df_model_afe.columns)
print(features_x)

Index(['store', 'dayofweek', 'sales', 'customers', 'open', 'promo',
       'stateholiday', 'schoolholiday', 'timestamp', 'year', 'month', 'day',
       'dayofyear', 'is_train', 'id', 'storetype', 'assortment',
       'competitiondistance', 'competitionopen', 'promosince',
       '0_prominterval', 'dateint', 'holidaysnextweek', 'holidaysthisweek',
       'holidayslastweek'],
      dtype='object')
['stateholiday', 'competitionopen', 'holidaysnextweek', 'holidaysthisweek', '0_prominterval', 'timestamp', 'storetype', 'dayofweek', 'assortment', 'open', 'store', 'schoolholiday', 'promo', 'promosince', 'competitiondistance', 'holidayslastweek']


In [49]:
features_x = ['open', 'store', 'storetype', 'holidayslastweek', '0_prominterval', 'stateholiday', 'assortment', 'dateint', 'holidaysthisweek', 'holidaysnextweek', 'promo', 'promosince', 'dayofweek', 'competitionopen', 'schoolholiday', 'competitiondistance']

In [50]:
%store df_model_afe features_x

Stored 'df_model_afe' (DataFrame)
Stored 'features_x' (list)
