# Summary

### Preprocessing and Cleaning
1. Import DataFrames (train, test, stores, transactions, oil, holidays)
2. Handle Dates
    - Convert dates in all DataFrames to pandas Timestamps
    - Define add_date_components function (splits up dates into year, month, week number, day, and day of week)
    - Create date DataFrames for the training and testing days
3. Basic Data Cleaning
    - Train/Test: drop redundant id column
    - Oil: Rename oil price ('dcoilwtico') to 'oil'
    - Oil: Add missing oil prices and use linear interpolation for weekends and other missing days
    - Holidays: Rename holiday 'type' column to 'hol_type'
    - Fix some holiday errors manually
4. Handle holiday data
    - One-hot encode 'locale' (region holiday affects) into 'Hol_Local', 'Hol_National', 'Hol_Regional'
    - Split DataFrame into 'hol_loc', 'hol_reg','hol_nat', and 'hol_event' based on locale (or events)
    - Create dictionaries for local/regional/national/event holidays with date, location, description, and transfer status information
### Merging training set I (X)
- Merge stores, transactions, oil, train
- Merge holiday data using mapping dictionaries
### Merging training set II (X_alt)
- Merge oil, training dates, train, stores, transactions
- Merge holiday data using mapping dictionaries
### Merge testing set (y)
- Merge oil, testing dates, test, stores
- Merge holiday data using mapping dictionaries
### Final summary stats and export
- Summary stats for X, X_alt, y
- Export to csv 

# Preprocessing and Cleaning

In [2]:
import numpy as np
import pandas as pd

#-------------------------------------------------------------------------------------------

# 1. Import the data
# Read the six raw CSV files in one pass; the names on the left are bound
# in the same order as the paths listed below.
train, test, stores, transactions, oil, holidays = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/stores.csv',
        '../data/transactions.csv',
        '../data/oil.csv',
        '../data/holidays_events.csv',
    )
)

#-------------------------------------------------------------------------------------------

# 2. Handle Dates

# Parse the `date` column of every loaded frame into pandas Timestamps
datasets = {
    'train': train,
    'test': test,
    'stores': stores,
    'transactions': transactions,
    'oil': oil,
    'holidays': holidays,
}
for df in datasets.values():
    # Frames without a `date` column are left untouched
    if 'date' not in df.columns:
        continue
    df['date'] = pd.to_datetime(df['date'])
        
# Make function that takes dates and splits it up into year, month, day, etc.
def add_date_components(df):
    """Add calendar columns derived from ``df['date']`` and return ``df``.

    The frame is modified in place: the columns ``year``, ``month``,
    ``week_number`` (ISO week), ``day`` and ``day_of_week`` (Monday=0) are
    written onto ``df``, and the very same object is returned.
    """
    accessor = df['date'].dt
    components = {
        'year': accessor.year,
        'month': accessor.month,
        'week_number': accessor.isocalendar().week,
        'day': accessor.day,
        'day_of_week': accessor.dayofweek,
    }
    # Assign in a fixed order so new columns are appended in that order
    for column, values in components.items():
        df[column] = values
    return df

# Build one-row-per-day calendars for the training and testing windows and
# enrich each with the year/month/week/day columns
train_dates = add_date_components(
    pd.DataFrame(pd.date_range(start='1/1/2013', end='8/15/2017', freq='D'), columns=['date'])
)
test_dates = add_date_components(
    pd.DataFrame(pd.date_range(start='8/16/2017', end='8/31/2017', freq='D'), columns=['date'])
)

#-------------------------------------------------------------------------------------------

# 3. Perform some basic cleaning (on train, test, oil, holidays)
        
# Remove the `id` column from both the train and test frames
train = train.drop(columns='id')
test = test.drop(columns='id')

# Give the oil price column the shorter name `oil`
oil = oil.rename(columns={'dcoilwtico': 'oil'})
# Hand-set the price in the first row (second column); per the original
# author's note this is the average of the 2012-12-31 and 2013-01-02 prices
oil.iloc[0, 1] = 92.485
# Full daily calendar for the combined train + test period, weekends included
dates = pd.DataFrame(pd.date_range(start='1/1/2013', end='8/31/2017', freq='D'), columns=['date'])
# Left-join onto the calendar so days absent from the oil data appear with a null price
oil = pd.merge(dates, oil, how='left', on='date')
# Fill every missing price by linear interpolation
oil = oil.assign(oil=oil['oil'].interpolate(method='linear'))

# Rename the holiday type column to `hol_type`
holidays = holidays.rename(columns={'type': 'hol_type'})
# Fix some individual issues manually (rows are addressed by index label)
holidays.loc[304, 'description'] = 'Traslado Fundacion de Cuenca'
holidays.loc[329, 'description'] = 'Traslado Fundacion de Ibarra'
# The column is named `hol_type` after the rename above. Assigning to 'type'
# here would silently create a new, mostly-NaN `type` column and leave the
# real holiday type of these two rows uncorrected.
holidays.loc[182, 'hol_type'] = 'Additional'
holidays.loc[322, 'hol_type'] = 'Holiday'
# Remove the row labelled 264
holidays = holidays.drop(264, axis=0)


In [3]:
# 4. Break down the holiday data

# One-hot encode `locale`; with the `Hol` prefix this yields the flag columns
# Hol_Local, Hol_National and Hol_Regional (weights can be adjusted here or later)
holidays = pd.get_dummies(holidays, columns=['locale'], prefix='Hol')
# Multiply by 1 so the indicator columns hold numeric 0/1 values
for flag in ('Hol_Local', 'Hol_National', 'Hol_Regional'):
    holidays[flag] = holidays[flag] * 1

# Row selectors shared by the splits below
is_national = holidays['Hol_National'] == 1
is_event = holidays['hol_type'] == 'Event'
national_cols = ['date', 'hol_type', 'description', 'transferred', 'Hol_National']

# Local holidays: `locale_name` holds the city
hol_loc = holidays.loc[holidays['Hol_Local'] == 1].rename(columns={'locale_name': 'city'})
hol_loc = hol_loc[['date', 'hol_type', 'city', 'description', 'transferred', 'Hol_Local']]

# Regional holidays: `locale_name` holds the state
hol_reg = holidays.loc[holidays['Hol_Regional'] == 1].rename(columns={'locale_name': 'state'})
hol_reg = hol_reg[['date', 'hol_type', 'state', 'description', 'transferred', 'Hol_Regional']]

# National holidays, excluding rows whose type is 'Event'
hol_nat = holidays.loc[is_national & ~is_event, national_cols]

# National rows whose type is 'Event'
hol_event = holidays.loc[is_national & is_event, national_cols]

# Create dictionaries containing holiday data.
# Local and regional lookups are keyed by (date, place) tuples, with the place
# name stripped of surrounding whitespace; national and event lookups are
# keyed by date alone. When a key occurs more than once, the last row wins.

# local holidays -- key: (date, city)
hol_loc_keys = list(zip(hol_loc['date'], hol_loc['city'].str.strip()))
holiday_loc_map = dict(zip(hol_loc_keys, hol_loc['Hol_Local']))
holiday_loc_type_map = dict(zip(hol_loc_keys, hol_loc['hol_type']))
holiday_loc_name_map = dict(zip(hol_loc_keys, hol_loc['description']))
holiday_loc_transf_map = dict(zip(hol_loc_keys, hol_loc['transferred']))

# regional holidays -- key: (date, state)
hol_reg_keys = list(zip(hol_reg['date'], hol_reg['state'].str.strip()))
holiday_reg_map = dict(zip(hol_reg_keys, hol_reg['Hol_Regional']))
holiday_reg_type_map = dict(zip(hol_reg_keys, hol_reg['hol_type']))
holiday_reg_name_map = dict(zip(hol_reg_keys, hol_reg['description']))
# Keyed by (date, state) like the other regional maps: it is looked up with
# (date, state) tuples, which a date-only key could never match.
holiday_reg_transf_map = dict(zip(hol_reg_keys, hol_reg['transferred']))

# national holidays -- key: date
holiday_nat_map = dict(zip(hol_nat['date'], hol_nat['Hol_National']))
holiday_nat_type_map = dict(zip(hol_nat['date'], hol_nat['hol_type']))
holiday_nat_name_map = dict(zip(hol_nat['date'], hol_nat['description']))
holiday_nat_transf_map = dict(zip(hol_nat['date'], hol_nat['transferred']))

# events -- key: date
holiday_event_map = dict(zip(hol_event['date'], hol_event['Hol_National']))
holiday_event_type_map = dict(zip(hol_event['date'], hol_event['hol_type']))
holiday_event_name_map = dict(zip(hol_event['date'], hol_event['description']))
holiday_event_transf_map = dict(zip(hol_event['date'], hol_event['transferred']))


# Merging training set I

In [4]:
# 5. Merging training set I

# Attach store attributes to every transactions record (inner join on the
# store number), ordered by date and then store
X = (
    stores.merge(transactions, how='inner', on='store_nbr')
    .sort_values(by=['date', 'store_nbr'], axis=0)
    .reset_index(drop=True)
)

# Derive the calendar columns and keep the columns in a fixed order
X = add_date_components(X)
X = X[[
    'date', 'year', 'month', 'week_number', 'day', 'day_of_week',
    'store_nbr', 'type', 'cluster', 'city', 'state', 'transactions',
]]

# Bring in the daily oil price, then restore the date/store ordering
X = (
    X.merge(oil, how='left', on='date')
    .sort_values(by=['date', 'store_nbr'], axis=0)
    .reset_index(drop=True)
)

# Attach the train columns, matching on both `date` and `store_nbr`
X = X.merge(train, how='left', on=['date', 'store_nbr'])

In [5]:
# Attach holiday information to X using the lookup dictionaries built earlier.
# `transferred` and `hol_type` are filled step by step with fillna, which only
# touches rows that are still missing, so the first source to supply a value
# wins: national, then event, then regional, then local.

# Initialize empty columns
X['transferred'] = np.nan
X['hol_type'] = np.nan

# Map national holiday data (excluding national events); lookups are by date
X['hol_Nat'] = X['date'].map(holiday_nat_map).fillna(0)
X['hol_Nat_name'] = X['date'].map(holiday_nat_name_map)
X['transferred'] = X['transferred'].fillna(X['date'].map(holiday_nat_transf_map))
X['hol_type'] = X['hol_type'].fillna(X['date'].map(holiday_nat_type_map))

# Map event data; lookups are by date
X['event'] = X['date'].map(holiday_event_map).fillna(0)
X['hol_event_name'] = X['date'].map(holiday_event_name_map)
X['transferred'] = X['transferred'].fillna(X['date'].map(holiday_event_transf_map))
X['hol_type'] = X['hol_type'].fillna(X['date'].map(holiday_event_type_map))

# Map regional holidays; lookups use (date, state) tuples.
# NOTE(review): regional_keys gets a fresh default index, so the assignments
# below presumably rely on X also having a default 0..n-1 index -- confirm.
regional_keys = pd.Series(zip(X['date'], X['state']))
X['hol_Reg'] = regional_keys.map(holiday_reg_map).fillna(0)
X['hol_Reg_name'] = regional_keys.map(holiday_reg_name_map)
X['transferred'] = X['transferred'].fillna(regional_keys.map(holiday_reg_transf_map))
X['hol_type'] = X['hol_type'].fillna(regional_keys.map(holiday_reg_type_map))

# Map local holidays; lookups use (date, city) tuples
local_keys = pd.Series(zip(X['date'], X['city']))
X['hol_Loc'] = local_keys.map(holiday_loc_map).fillna(0)
X['hol_loc_name'] = local_keys.map(holiday_loc_name_map)
X['transferred'] = X['transferred'].fillna(local_keys.map(holiday_loc_transf_map))
X['hol_type'] = X['hol_type'].fillna(local_keys.map(holiday_loc_type_map))

# Fill missing values for holiday indicators and `transferred`
# (rows that matched no holiday get 0 / False)
X[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']] = X[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']].fillna(0)
X['transferred'] = X['transferred'].fillna(False)

# Separate the holiday types: one-hot encode `hol_type` into `hol_type_*` columns
X = pd.get_dummies(X,columns=['hol_type'], prefix='hol_type')
# Reorder columns; every `hol_type_*` column named here must exist after get_dummies
X = X[['date', 'year', 'month', 'week_number', 'day', 'day_of_week', 
       'store_nbr','type', 'cluster', 'city', 'state','transactions','oil',
       'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name',
       'event','hol_event_name','transferred','hol_type_Additional','hol_type_Bridge',
       'hol_type_Event','hol_type_Holiday', 'hol_type_Transfer', 'hol_type_Work Day',
       'family', 'onpromotion', 'sales']] 

  X['transferred'] = X['transferred'].fillna(False)


# Merging training set II

In [6]:
# 6. Merging training set II

# Daily oil price joined with the calendar columns for the training days
X_alt = oil.merge(train_dates, how='inner', on='date')
# Start from every train row and attach, in turn: the daily features (by date),
# the store attributes (by store), and the transactions (by date and store)
X_alt = (
    train.merge(X_alt, how='left', on='date')
    .merge(stores, how='left', on='store_nbr')
    .merge(transactions, how='left', on=['date', 'store_nbr'])
)

In [7]:
# Attach holiday information to X_alt using the lookup dictionaries built earlier.
# `transferred` and `hol_type` are filled step by step with fillna, which only
# touches rows that are still missing, so the first source to supply a value
# wins: national, then event, then regional, then local.

# Initialize empty columns
X_alt['transferred'] = np.nan
X_alt['hol_type'] = np.nan

# Map national holiday data (excluding national events); lookups are by date
X_alt['hol_Nat'] = X_alt['date'].map(holiday_nat_map).fillna(0)
X_alt['hol_Nat_name'] = X_alt['date'].map(holiday_nat_name_map)
X_alt['transferred'] = X_alt['transferred'].fillna(X_alt['date'].map(holiday_nat_transf_map))
X_alt['hol_type'] = X_alt['hol_type'].fillna(X_alt['date'].map(holiday_nat_type_map))

# Map event data; lookups are by date
X_alt['event'] = X_alt['date'].map(holiday_event_map).fillna(0)
X_alt['hol_event_name'] = X_alt['date'].map(holiday_event_name_map)
X_alt['transferred'] = X_alt['transferred'].fillna(X_alt['date'].map(holiday_event_transf_map))
X_alt['hol_type'] = X_alt['hol_type'].fillna(X_alt['date'].map(holiday_event_type_map))

# Map regional holidays; lookups use (date, state) tuples.
# NOTE(review): regional_keys gets a fresh default index, so the assignments
# below presumably rely on X_alt also having a default 0..n-1 index -- confirm.
regional_keys = pd.Series(zip(X_alt['date'], X_alt['state']))
X_alt['hol_Reg'] = regional_keys.map(holiday_reg_map).fillna(0)
X_alt['hol_Reg_name'] = regional_keys.map(holiday_reg_name_map)
X_alt['transferred'] = X_alt['transferred'].fillna(regional_keys.map(holiday_reg_transf_map))
X_alt['hol_type'] = X_alt['hol_type'].fillna(regional_keys.map(holiday_reg_type_map))

# Map local holidays; lookups use (date, city) tuples
local_keys = pd.Series(zip(X_alt['date'], X_alt['city']))
X_alt['hol_Loc'] = local_keys.map(holiday_loc_map).fillna(0)
X_alt['hol_loc_name'] = local_keys.map(holiday_loc_name_map)
X_alt['transferred'] = X_alt['transferred'].fillna(local_keys.map(holiday_loc_transf_map))
X_alt['hol_type'] = X_alt['hol_type'].fillna(local_keys.map(holiday_loc_type_map))

# Fill missing values for holiday indicators and `transferred`
# (rows that matched no holiday get 0 / False)
X_alt[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']] = X_alt[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']].fillna(0)
X_alt['transferred'] = X_alt['transferred'].fillna(False)

# Separate the holiday types: one-hot encode `hol_type` into `hol_type_*` columns
X_alt = pd.get_dummies(X_alt,columns=['hol_type'], prefix='hol_type')
# Reorder columns; every `hol_type_*` column named here must exist after get_dummies
X_alt = X_alt[['date', 'year', 'month', 'week_number', 'day', 'day_of_week', 
       'store_nbr','type', 'cluster', 'city', 'state','transactions','oil',
       'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name',
       'event','hol_event_name','transferred','hol_type_Additional','hol_type_Bridge',
       'hol_type_Event','hol_type_Holiday', 'hol_type_Transfer', 'hol_type_Work Day',
       'family', 'onpromotion', 'sales']] 


  X_alt['transferred'] = X_alt['transferred'].fillna(False)


# Merging testing set

In [8]:
# 7. Merging testing set

# Daily oil price joined with the calendar columns for the testing days
y = oil.merge(test_dates, how='inner', on='date')
# Start from every test row and attach the daily features (by date),
# followed by the store attributes (by store)
y = (
    test.merge(y, how='left', on='date')
    .merge(stores, how='left', on='store_nbr')
)

In [9]:
# Attach holiday information to y using the lookup dictionaries built earlier.
# `transferred` and `hol_type` are filled step by step with fillna, which only
# touches rows that are still missing, so the first source to supply a value
# wins: national, then event, then regional, then local.

# Initialize empty columns
y['transferred'] = np.nan
y['hol_type'] = np.nan

# Map national holiday data (excluding national events); lookups are by date
y['hol_Nat'] = y['date'].map(holiday_nat_map).fillna(0)
y['hol_Nat_name'] = y['date'].map(holiday_nat_name_map)
y['transferred'] = y['transferred'].fillna(y['date'].map(holiday_nat_transf_map))
y['hol_type'] = y['hol_type'].fillna(y['date'].map(holiday_nat_type_map))

# Map event data; lookups are by date
y['event'] = y['date'].map(holiday_event_map).fillna(0)
y['hol_event_name'] = y['date'].map(holiday_event_name_map)
y['transferred'] = y['transferred'].fillna(y['date'].map(holiday_event_transf_map))
y['hol_type'] = y['hol_type'].fillna(y['date'].map(holiday_event_type_map))

# Map regional holidays; lookups use (date, state) tuples.
# NOTE(review): regional_keys gets a fresh default index, so the assignments
# below presumably rely on y also having a default 0..n-1 index -- confirm.
regional_keys = pd.Series(zip(y['date'], y['state']))
y['hol_Reg'] = regional_keys.map(holiday_reg_map).fillna(0)
y['hol_Reg_name'] = regional_keys.map(holiday_reg_name_map)
y['transferred'] = y['transferred'].fillna(regional_keys.map(holiday_reg_transf_map))
y['hol_type'] = y['hol_type'].fillna(regional_keys.map(holiday_reg_type_map))

# Map local holidays; lookups use (date, city) tuples
local_keys = pd.Series(zip(y['date'], y['city']))
y['hol_Loc'] = local_keys.map(holiday_loc_map).fillna(0)
y['hol_loc_name'] = local_keys.map(holiday_loc_name_map)
y['transferred'] = y['transferred'].fillna(local_keys.map(holiday_loc_transf_map))
y['hol_type'] = y['hol_type'].fillna(local_keys.map(holiday_loc_type_map))

# Fill missing values for holiday indicators and `transferred`
# (rows that matched no holiday get 0 / False)
y[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']] = y[['hol_Nat', 'hol_Reg', 'hol_Loc', 'event']].fillna(0)
y['transferred'] = y['transferred'].fillna(False)

# Separate the holiday types: one-hot encode `hol_type` into `hol_type_*` columns
y = pd.get_dummies(y,columns=['hol_type'], prefix='hol_type')
# Reorder columns. Unlike the training frames, only `hol_type_Holiday` is kept
# here (and there are no `transactions` or `sales` columns); the selection
# requires that get_dummies actually produced `hol_type_Holiday`.
y = y[['date', 'year', 'month', 'week_number', 'day', 'day_of_week', 
       'store_nbr','type', 'cluster', 'city', 'state','oil',
       'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name',
       'event','hol_event_name','transferred','hol_type_Holiday',
       'family', 'onpromotion']] 


  y['transferred'] = y['transferred'].fillna(y['date'].map(holiday_nat_transf_map))
  y['hol_type'] = y['hol_type'].fillna(y['date'].map(holiday_nat_type_map))
  y['transferred'] = y['transferred'].fillna(y['date'].map(holiday_event_transf_map))
  y['hol_type'] = y['hol_type'].fillna(y['date'].map(holiday_event_type_map))
  y['transferred'] = y['transferred'].fillna(regional_keys.map(holiday_reg_transf_map))
  y['hol_type'] = y['hol_type'].fillna(regional_keys.map(holiday_reg_type_map))
  y['transferred'] = y['transferred'].fillna(False)


In [10]:
# Display the merged test frame (the notebook renders the cell's last expression)
y

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,hol_Reg,hol_Reg_name,hol_Loc,hol_loc_name,event,hol_event_name,transferred,hol_type_Holiday,family,onpromotion
0,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,0.0,,0.0,,0.0,,False,False,AUTOMOTIVE,0
1,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,0.0,,0.0,,0.0,,False,False,BABY CARE,0
2,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,0.0,,0.0,,0.0,,False,False,BEAUTY,2
3,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,0.0,,0.0,,0.0,,False,False,BEVERAGES,20
4,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,0.0,,0.0,,0.0,,False,False,BOOKS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,0.0,,0.0,,0.0,,False,False,POULTRY,1
28508,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,0.0,,0.0,,0.0,,False,False,PREPARED FOODS,0
28509,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,0.0,,0.0,,0.0,,False,False,PRODUCE,1
28510,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,0.0,,0.0,,0.0,,False,False,SCHOOL AND OFFICE SUPPLIES,9


# Exporting to csv

In [11]:
# 8. Final summary stats and export

# Make the project-local `utility` module (which provides the summary
# function used to examine the data frames) importable
import os
import sys

# Directory the notebook is running from, and the directory one level above it
current_folder = os.getcwd()
parent_folder = os.path.abspath(os.path.join(current_folder, ".."))
# Extend the module search path with the parent directory
sys.path.append(parent_folder)

# With the parent directory on sys.path, utility.py can be imported as a module
import utility


In [12]:
# Show the summary report (data shape plus per-column statistics) for X
utility.summary(X)

data shape: (2755104, 31)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,datetime64[ns],0,0.0,1682,2013-01-01 00:00:00,2017-08-15 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00
year,int32,0,0.0,5,2013.0,2017.0,2013,2013,2013
month,int32,0,0.0,12,1.0,12.0,1,1,1
week_number,UInt32,0,0.0,53,1.0,53.0,1,1,1
day,int32,0,0.0,31,1.0,31.0,1,1,1
day_of_week,int32,0,0.0,7,0.0,6.0,1,1,1
store_nbr,int64,0,0.0,54,1.0,54.0,25,25,25
type,object,0,0.0,5,,,D,D,D
cluster,int64,0,0.0,17,1.0,17.0,1,1,1
city,object,0,0.0,22,,,Salinas,Salinas,Salinas






In [13]:
# Show the summary report (data shape plus per-column statistics) for X_alt
utility.summary(X_alt)

data shape: (3000888, 31)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,datetime64[ns],0,0.0,1684,2013-01-01 00:00:00,2017-08-15 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00
year,int32,0,0.0,5,2013.0,2017.0,2013,2013,2013
month,int32,0,0.0,12,1.0,12.0,1,1,1
week_number,UInt32,0,0.0,53,1.0,53.0,1,1,1
day,int32,0,0.0,31,1.0,31.0,1,1,1
day_of_week,int32,0,0.0,7,0.0,6.0,1,1,1
store_nbr,int64,0,0.0,54,1.0,54.0,1,1,1
type,object,0,0.0,5,,,D,D,D
cluster,int64,0,0.0,17,1.0,17.0,13,13,13
city,object,0,0.0,22,,,Quito,Quito,Quito






In [14]:
# Show the summary report (data shape plus per-column statistics) for y
utility.summary(y)

data shape: (28512, 24)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,datetime64[ns],0,0.0,16,2017-08-16 00:00:00,2017-08-31 00:00:00,2017-08-16 00:00:00,2017-08-16 00:00:00,2017-08-16 00:00:00
year,int32,0,0.0,1,2017.0,2017.0,2017,2017,2017
month,int32,0,0.0,1,8.0,8.0,8,8,8
week_number,UInt32,0,0.0,3,33.0,35.0,33,33,33
day,int32,0,0.0,16,16.0,31.0,16,16,16
day_of_week,int32,0,0.0,7,0.0,6.0,2,2,2
store_nbr,int64,0,0.0,54,1.0,54.0,1,1,1
type,object,0,0.0,5,,,D,D,D
cluster,int64,0,0.0,17,1.0,17.0,13,13,13
city,object,0,0.0,22,,,Quito,Quito,Quito






In [15]:
# Write the three merged frames to CSV, leaving out the index column
for merged_frame, csv_path in (
    (X, "../data/merged_train.csv"),
    (X_alt, "../data/merged_train_alt.csv"),
    (y, "../data/merged_test.csv"),
):
    merged_frame.to_csv(csv_path, index=False)