# Feature Engineering

In [1]:
# Load the Dataframe from Notebook 01

%store -r df df_store

In [2]:
#Data handling

import pandas as pd
import numpy as np
import calendar
import datetime
from sklearn.preprocessing import LabelEncoder

In [3]:
df_model = df.copy()

### Handling Categorical Data

In [4]:
# Encoding categorical features

df_model['stateholiday_cat'] = LabelEncoder().fit_transform(df_model['stateholiday']) 
df_store['storetype_cat'] = LabelEncoder().fit_transform(df_store['storetype'])
df_store['assortment_cat'] = LabelEncoder().fit_transform(df_store['assortment'])

In [5]:
df_store.drop(['storetype', 'assortment'], axis = 1, inplace=True)
df_store = df_store.rename(columns={'storetype_cat': 'storetype', 'assortment_cat':'assortment'})

In [6]:
df_model.drop(['stateholiday'], axis = 1, inplace=True)
df_model = df_model.rename(columns={'stateholiday_cat': 'stateholiday'})

#### Feature engineering with "df_store" dataframe features

 Splitting 'Promointerval' string into individual strings and get each month value.


In [7]:
prom_interval = df_store['promointerval'].str.split(',').apply(pd.Series)

In [8]:
prom_interval.columns = prom_interval.columns.map(lambda x: str(x) + '_prominterval')
df_store = df_store.join(prom_interval)

In [9]:
def monthToNum(value):
    if(value=='Sept'):
        value='Sep'
    return list(calendar.month_abbr).index(value)

# Mapping month abbrevation to month number

df_store['0_prominterval'] = df_store['0_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['1_prominterval'] = df_store['1_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['2_prominterval'] = df_store['2_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['3_prominterval'] = df_store['3_prominterval'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)

Combine the beginning of the Promo2 week and year values into a single date.

In [10]:
promo = []
for index, value in df_store[['promo2sinceweek', 'promo2sinceyear']].iterrows():
    try:
        year, week = int(value['promo2sinceyear']), int(value['promo2sinceweek'])
        date = pd.to_datetime("{}-{}-01".format(year, week), format='%Y%W')
        promo.append(date)
    except:
        promo.append(np.nan)
promo = pd.to_datetime(pd.Series(promo))
promo.shape

(1115,)

In [11]:
df_store['promosince'] = promo # Convert integer to datetime
df_store['promosince'] = df_store.promosince.dt.strftime('%Y%m%d')

Combine the competition-open week and year values into a single date.

In [12]:
competition_open = []
for index, value in df_store[['competitionopensincemonth', 'competitionopensinceyear']].iterrows():
    try:
        year, month = int(value['competitionopensinceyear']), int(value['competitionopensincemonth'])
        date = pd.to_datetime("{}-{}-01".format(year, month), format='%Y-%m')
        competition_open.append(date)
    except:
        competition_open.append(np.nan)
competition_open = pd.Series(competition_open)
competition_open.shape

(1115,)

In [13]:
df_store['competitionopen'] = competition_open # Convert integer to datetime
df_store['competitionopen'] = df_store['competitionopen'].dt.strftime('%Y%m%d')

This concludes the feature engineering for 'df_store'.

The newly created features are put into 'store_features' and merged with 'df_model'.

In [14]:
store_features = ['store', 'storetype', 'assortment', 'competitiondistance', 'competitionopen', 
                  'promosince', '0_prominterval']

In [15]:
features_x = ['store', 'timestamp', 'dayofweek', 'open', 'promo', 'schoolholiday', 'stateholiday']

# Seperate target-feature from dataframe
features_y = ['saleslog']

In [16]:
df_model = pd.merge(df_model, df_store[store_features], how='left', on=['store'])
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885426 entries, 0 to 885425
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   store                885426 non-null  int64         
 1   dayofweek            885426 non-null  int64         
 2   sales                844338 non-null  float64       
 3   customers            844338 non-null  float64       
 4   open                 885426 non-null  int64         
 5   promo                885426 non-null  int64         
 6   schoolholiday        885426 non-null  int64         
 7   timestamp            885426 non-null  datetime64[ns]
 8   year                 885426 non-null  int64         
 9   month                885426 non-null  int64         
 10  day                  885426 non-null  int64         
 11  dayofyear            885426 non-null  int64         
 12  is_train             885426 non-null  int64         
 13  id            

In [18]:
# Put new features into feature-list

features_x = list(set(features_x + store_features))

for feature in features_x:
    df_model[feature] = df_model[feature].fillna(-999) # Out of range value for model

In [19]:
# Mapping datetime into integer datatype
df_model['dateint'] = df_model.timestamp.dt.strftime('%Y%m%d').map(int)
df_model['competitionopen'] = df_model.competitionopen.map(int)
df_model['promosince'] = df_model.promosince.map(int)

In [20]:
features_x.remove('timestamp')
features_x.append('dateint')

#### Feature engineering with "df_train" dataframe feature

Create new columns for whether or not a promo was run the day before or after.

In [21]:
df_model['promotomorrow'] = df_model.promo.shift(-1)
df_model['promoyesterday'] = df_model.promo.shift(1)

In [22]:
promo_features = ['promotomorrow', 'promoyesterday']

features_x = list(set(features_x + promo_features))

Another feature to be looked at is the difference in sales and customer number between each store. This information could have an effect on the predicted sales number.

Therefore sales per day, customers per day and the number of sales per customers per day are calculated.

In [23]:
store_data_sales = df_model.groupby([df_model['store']])['sales'].sum()
store_data_customers = df_model.groupby([df_model['store']])['customers'].sum()
store_data_open = df_model.groupby([df_model['store']])['open'].count()

store_data_sales_per_day = store_data_sales / store_data_open
store_data_customers_per_day = store_data_customers / store_data_open
store_data_sales_per_customer_per_day = store_data_sales_per_day / store_data_customers_per_day

df_sales_cust = pd.merge(df_store, store_data_sales_per_day.reset_index(name='salesperday'), how='left', on=['store'])
df_sales_cust = pd.merge(df_sales_cust, store_data_customers_per_day.reset_index(name='customersperday'), how='left', on=['store'])
df_sales_cust = pd.merge(df_sales_cust, store_data_sales_per_customer_per_day.reset_index(name='salespercustomersperday'), how='left', on=['store'])

In [24]:
store_features = ['store', 'salesperday', 'customersperday', 'salespercustomersperday']

features_x = list(set(features_x + store_features))
df_model = pd.merge(df_model, df_sales_cust[store_features], how='left', on=['store'])

Customers could have different shopping needs depending on the holiday situation. Shopping for drug-store items prior or after a holiday might result in different customer behaviour and therefore different sales number.

An indicator for whether the next, current or previous week is marked as a holiday, is created.

In [25]:
holidays_next_week=[]
holidays_next_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index + datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=15)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_next_week.append(school_holidays+state_holidays)
    holidays_next_week_index.append(index)
    
holidays_next_week = pd.Series(holidays_next_week)
holidays_next_week.shape

(990,)

In [26]:
holidays_this_week=[]
index_list = []
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index 
    end_range = index + datetime.timedelta(days=7)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_this_week.append(school_holidays+state_holidays)
    index_list.append(index)
    
holidays_this_week = pd.Series(holidays_this_week)
holidays_this_week.shape

(990,)

In [27]:
holidays_last_week=[]
holidays_last_week_index=[]
for index, value in df_model.groupby(df_model['timestamp']).sum().iterrows():
    start_range = index - datetime.timedelta(days=7)
    end_range = index + datetime.timedelta(days=1)
    school_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).schoolholiday)
    state_holidays = sum((df_model.groupby(df_model['timestamp']).sum()[start_range:end_range]).stateholiday)
    holidays_last_week.append(school_holidays+state_holidays)
    holidays_last_week_index.append(index)
    
holidays_last_week = pd.Series(holidays_next_week)
holidays_last_week.shape

(990,)

In [28]:
# Temporary dataframe as helper to merge

temp_df = pd.DataFrame({'holidaysnextweek':holidays_next_week, 'timestamp': holidays_next_week_index})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [29]:
temp_df = pd.DataFrame({'holidaysthisweek':holidays_this_week, 'timestamp': index_list})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [30]:
temp_df = pd.DataFrame({'holidayslastweek':holidays_last_week, 'timestamp': holidays_last_week_index})
df_model = pd.merge(df_model, temp_df, on=['timestamp'])

In [31]:
holidays_features = ['holidaysnextweek', 'holidaysthisweek', 'holidayslastweek']

features_x = list(set(features_x + holidays_features))

In [32]:
print(df_model.shape)
df_model.info()

(885426, 30)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 885426 entries, 0 to 885425
Data columns (total 30 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   store                    885426 non-null  int64         
 1   dayofweek                885426 non-null  int64         
 2   sales                    844338 non-null  float64       
 3   customers                844338 non-null  float64       
 4   open                     885426 non-null  int64         
 5   promo                    885426 non-null  int64         
 6   schoolholiday            885426 non-null  int64         
 7   timestamp                885426 non-null  datetime64[ns]
 8   year                     885426 non-null  int64         
 9   month                    885426 non-null  int64         
 10  day                      885426 non-null  int64         
 11  dayofyear                885426 non-null  int64         
 12  is_

In [33]:
print(df_model.columns)
print(features_x)


Index(['store', 'dayofweek', 'sales', 'customers', 'open', 'promo',
       'schoolholiday', 'timestamp', 'year', 'month', 'day', 'dayofyear',
       'is_train', 'id', 'stateholiday', 'storetype', 'assortment',
       'competitiondistance', 'competitionopen', 'promosince',
       '0_prominterval', 'dateint', 'promotomorrow', 'promoyesterday',
       'salesperday', 'customersperday', 'salespercustomersperday',
       'holidaysnextweek', 'holidaysthisweek', 'holidayslastweek'],
      dtype='object')
['holidaysthisweek', 'salespercustomersperday', 'promoyesterday', 'store', 'stateholiday', 'holidayslastweek', 'competitiondistance', 'dateint', '0_prominterval', 'schoolholiday', 'promo', 'dayofweek', 'salesperday', 'competitionopen', 'customersperday', 'storetype', 'promosince', 'holidaysnextweek', 'open', 'promotomorrow', 'assortment']


In [34]:
%store  features_x features_y df_model

Stored 'features_x' (list)
Stored 'features_y' (list)
Stored 'df_model' (DataFrame)
