In [476]:
import numpy as np
import pandas as pd

In [477]:
# Import the data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
stores = pd.read_csv('../data/stores.csv')
transactions = pd.read_csv('../data/transactions.csv')
oil = pd.read_csv('../data/oil.csv')
holidays = pd.read_csv('../data/holidays_events.csv')

# Dictionary for all datasets
datasets = {'train':train, 'test':test, 'stores':stores, 'transactions':transactions, 'oil':oil, 'holidays':holidays}

# Cleaning

### Standarizing dates

We convert any dates in the datasets to pandas Timestamps.

In [478]:
for df in iter(datasets.values()):
    if 'date' in df.columns:
        df['date'] = pd.to_datetime( df['date'] )

### Training Set

We drop the `id` column from the training set. The preliminary analysis proved this is just a redundant row indexer.

In [479]:
train = train.drop('id',axis=1)

### Oil Dataset

The preliminary analysis shows 43 missing daily oil price values and missing weekend oil price values. 

We add the weekend days first as more null values, then interpolate 

Also, the oil price for the very first day (2013-01-01) is missing. We can manually add that here using the oil prices from 2012-12-31 and 2013-01-02 from (https://fred.stlouisfed.org/data/DCOILWTICO). We separately verified that is safe to do, since these oil prices match the ones in the oil dataset.

In [480]:
# Manually add oil price for the first day using average of 2012-12-31 and 2013-01-02 oil prices
oil.iloc[0,1] = 92.485                                  

In [481]:
# Create DataFrame with all dates in desired range, including weekends
dates = pd.DataFrame(pd.date_range(start='1/1/2013', end='8/31/2017',freq='D'), columns=['date'])
# Merge with oil data set, so that weekend dates are added to oil with null values
oil = dates.merge(oil,how='left', on='date')
# Interpolate all missing values in oil (all but possibly one of the gaps are of size 1,2, or 3)
oil['dcoilwtico'] = oil['dcoilwtico'].interpolate()

### Holiday Dataset

Rename the `type` column so that it won't conflict with store type.

In [482]:
holidays = holidays.rename({'type':'type_hol'},axis='columns')

From the preliminary analysis, two transferred holidays need their description updated so that all tranfer holidays have consistent formatting.

In [483]:
holidays.loc[304,'description'] = 'Traslado Fundacion de Cuenca'
holidays.loc[329, 'description'] = 'Traslado Fundacion de Ibarra'

There were also a couple mislabelled Additional Holidays, and one that should be deleted for redundancy.

In [484]:
holidays.loc[182,'type'] = 'Additional'
holidays.loc[322,'type'] = 'Holiday'
holidays = holidays.drop(264, axis=0)

We separate the holiday `locale` variable into three columns (local, regional, national).

In [485]:
holidays = pd.get_dummies(holidays, columns=['locale'],prefix='Hol')
#We can adjust weights here or later
holidays['Hol_Local']=holidays['Hol_Local']*1
holidays['Hol_National']=holidays['Hol_National']*1
holidays['Hol_Regional']=holidays['Hol_Regional']*1

In [486]:
# Looking at local holidays
hol_loc=holidays[holidays['Hol_Local']==1]
hol_loc=hol_loc.rename(columns={'locale_name':'city'})
hol_loc=hol_loc[['date', 'type_hol', 'city', 'description', 'transferred',
       'Hol_Local']]
hol_loc.sample()

Unnamed: 0,date,type_hol,city,description,transferred,Hol_Local
81,2013-12-05,Additional,Quito,Fundacion de Quito-1,False,1


In [487]:
# Looking at Regional holidays
hol_reg=holidays[holidays['Hol_Regional']==1]
hol_reg=hol_reg.rename(columns={'locale_name':'state'})
hol_reg=hol_reg[['date', 'type_hol', 'state', 'description', 'transferred',
       'Hol_Regional']]
hol_reg.sample()

Unnamed: 0,date,type_hol,state,description,transferred,Hol_Regional
24,2012-11-07,Holiday,Santa Elena,Provincializacion Santa Elena,False,1


In [488]:
# Looking at National holidays
hol_nat=holidays[holidays['Hol_National']==1]
hol_nat=hol_nat[['date', 'type_hol', 'description', 'transferred',
       'Hol_National']]
hol_nat.sample()

Unnamed: 0,date,type_hol,description,transferred,Hol_National
113,2014-06-25,Event,Mundial de futbol Brasil: Ecuador-Francia,False,1


In [489]:
# Create a map of  National holidays
holiday_nat_map = dict(zip(hol_nat['date'], hol_nat['Hol_National']))
holiday_nat_type_map = dict(zip(hol_nat['date'], hol_nat['type_hol']))
holiday_nat_name_map = dict(zip(hol_nat['date'], hol_nat['description']))
holiday_nat_transf_map = dict(zip(hol_nat['date'], hol_nat['transferred']))

# Now mapping regional holidays 
holiday_reg_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['Hol_Regional']))
holiday_reg_type_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['type_hol']))
holiday_reg_name_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['description']))
holiday_reg_transf_map = dict(zip(hol_reg['date'], hol_reg['transferred']))

# Create a map of Local holidays
holiday_loc_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['Hol_Local']))
holiday_loc_type_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['type_hol']))
holiday_loc_name_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['description']))
holiday_loc_transf_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['transferred']))

# Merging

We first inner join the store and transaction data along the `store_nbr` column.

This guarantees no duplicates or new null values.

In [490]:
# Merge stores with transactions on date and store_nbr
X = stores.merge(transactions, how='inner', on='store_nbr')
X = X.sort_values(by=['date','store_nbr'],axis=0).reset_index(drop=True)
X = X[['date','store_nbr','type','cluster','city','state','transactions']]

Before merging more data, we break down the `date` column into year, month, week, day, and day of week.

In [491]:
X = X.assign(**{'year': pd.Series( [X.date[i].year for i in X.index]), 
            'month': pd.Series( [X.date[i].month for i in X.index]), 
            'week_number': pd.Series( [X.date[i].week for i in X.index]), 
            'day':pd.Series( [X.date[i].day for i in X.index]), 
            'day_of_week': pd.Series( [X.date[i].dayofweek for i in X.index]) })
X = X[['date','year', 'month', 'week_number', 'day', 'day_of_week','store_nbr','type','cluster','city','state','transactions']]

Next, we inner join oil prices along the `date` column.

Since there is one oil price per date and we filled null values in oil, this won't give duplicates or new null values.

In [492]:
X = X.merge(oil, how='left', on='date')
X = X.sort_values(by=['date','store_nbr'],axis=0).reset_index(drop=True)

Here we join the holiday data using the mappings defined earlier.

In [493]:
# Add empty tranferred and holiday type columns
X['transferred'] = np.nan
X['type_hol'] = np.nan

#Use mappings to fill in the values for national holidays
X['hol_Nat'] = X['date'].map(holiday_nat_map)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
X['hol_Nat_name'] = X['date'].map(holiday_nat_name_map)
X.loc[X['type_hol'].isna(), 'type_hol'] = X['date'].map(holiday_nat_type_map)

# Assign regional holidays based on mapping 
X['hol_Reg'] = X.apply(lambda row: holiday_reg_map.get((row['date'], row['state'])), axis=1)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_reg_transf_map)
X['hol_Reg_name'] = X.apply(lambda row: holiday_reg_name_map.get((row['date'], row['state'])), axis=1)
X.loc[X['type_hol'].isna(), 'type_hol'] = X.loc[X['type_hol'].isna()].apply(lambda row: holiday_reg_type_map.get((row['date'], row['state'])), axis=1)

# Assign local holidays based on mapping 
X['hol_Loc'] = X.apply(lambda row: holiday_loc_map.get((row['date'], row['city'])), axis=1)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)

X['transferred'] = X.apply(
    lambda row: holiday_loc_map.get((row['date'], row['city'])) if pd.isna(row['transferred']) else row['transferred'], axis=1
)
X.loc[X['type_hol'].isna(), 'type_hol']= X.loc[X['type_hol'].isna()].apply(lambda row: holiday_loc_type_map.get((row['date'], row['city'])), axis=1)
X['hol_loc_name'] = X.apply(lambda row: holiday_loc_name_map.get((row['date'], row['city'])), axis=1)

# Just fillna
X[['hol_Nat','hol_Reg','hol_Loc']]=X[['hol_Nat','hol_Reg','hol_Loc']].fillna(0)

  X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
  X.loc[X['type_hol'].isna(), 'type_hol'] = X['date'].map(holiday_nat_type_map)


Finally, we inner join our training data along both `date` and `store_nbr`.

In [494]:
X = X.merge(train, how='inner', on=['date','store_nbr'])

Reorder columns and create boolean columns for each holiday type.

In [495]:
#Reorder columns
X = X[['date','year', 'month','week_number', 'day',  'day_of_week', 
    'store_nbr','type', 'cluster', 'city', 'state','transactions', 'dcoilwtico',
    'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name', 'type_hol','transferred',
    'family', 'onpromotion', 'sales']] 

#Separating types of the holidays 
X = pd.get_dummies(X,columns=['type_hol'], prefix='type_hol')

In [501]:
X.sample(5)

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,transferred,family,onpromotion,sales,type_hol_Additional,type_hol_Bridge,type_hol_Event,type_hol_Holiday,type_hol_Transfer,type_hol_Work Day
713156,2014-04-12,2014,4,15,12,5,3,D,8,Quito,...,,PET SUPPLIES,0,0.0,False,False,False,False,False,False
2749738,2017-08-12,2017,8,32,12,5,54,C,3,El Carmen,...,,GROCERY II,0,8.0,False,False,False,False,False,False
346791,2013-08-16,2013,8,33,16,4,45,A,11,Quito,...,,PLAYERS AND ELECTRONICS,0,0.0,False,False,False,False,False,False
1722603,2015-12-28,2015,12,53,28,0,21,B,6,Santo Domingo,...,,BEVERAGES,1,1764.0,False,False,False,False,False,False
833524,2014-06-29,2014,6,26,29,6,35,C,3,Playas,...,False,EGGS,1,146.0,False,False,True,False,False,False


In [15]:
X.to_csv("../data/merged_train.csv", index = False)