In [3]:
import numpy as np
import pandas as pd

In [4]:
# Import the data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
stores = pd.read_csv('../data/stores.csv')
transactions = pd.read_csv('../data/transactions.csv')
oil = pd.read_csv('../data/oil.csv')
holidays = pd.read_csv('../data/holidays_events.csv')

# Dictionary for all datasets
datasets = {'train':train, 'test':test, 'stores':stores, 'transactions':transactions, 'oil':oil, 'holidays':holidays}

# Cleaning

### Standarizing dates

We convert any dates in the datasets to pandas Timestamps.

In [5]:
for df in iter(datasets.values()):
    if 'date' in df.columns:
        df['date'] = pd.to_datetime( df['date'] )

### Training/Testing Set

We drop the `id` column from the training/testing set. The preliminary analysis proved this is just a redundant row indexer.

In [6]:
train = train.drop('id',axis=1)
test = test.drop('id',axis=1)

### Oil Dataset

We change `dcoilwtico` to just `oil` for simplicity.

In [7]:
oil = oil.rename({'dcoilwtico': 'oil'}, axis=1)

The preliminary analysis shows 43 missing daily oil price values and missing weekend oil price values. 

We add the weekend days first as more null values, then interpolate 

Also, the oil price for the very first day (2013-01-01) is missing. We can manually add that here using the oil prices from 2012-12-31 and 2013-01-02 from (https://fred.stlouisfed.org/data/DCOILWTICO). We separately verified that is safe to do, since these oil prices match the ones in the oil dataset.

In [8]:
# Manually add oil price for the first day using average of 2012-12-31 and 2013-01-02 oil prices
oil.iloc[0,1] = 92.485                                  

In [9]:
# Create DataFrame with all dates in desired range, including weekends
dates = pd.DataFrame(pd.date_range(start='1/1/2013', end='8/31/2017',freq='D'), columns=['date'])
# Merge with oil data set, so that weekend dates are added to oil with null values
oil = dates.merge(oil,how='left', on='date')
# Interpolate all missing values in oil (all but possibly one of the gaps are of size 1,2, or 3)
oil['oil'] = oil['oil'].interpolate()

### Holiday Dataset

Rename the `type` column so that it won't conflict with store type.

In [10]:
holidays = holidays.rename({'type':'hol_type'},axis='columns')

From the preliminary analysis, two transferred holidays need their description updated so that all tranfer holidays have consistent formatting.

In [11]:
holidays.loc[304,'description'] = 'Traslado Fundacion de Cuenca'
holidays.loc[329, 'description'] = 'Traslado Fundacion de Ibarra'

There were also a couple mislabelled Additional Holidays, and one that should be deleted for redundancy.

In [12]:
holidays.loc[182,'type'] = 'Additional'
holidays.loc[322,'type'] = 'Holiday'
holidays = holidays.drop(264, axis=0)

We separate the holiday `locale` variable into three columns (local, regional, national).

In [13]:
holidays = pd.get_dummies(holidays, columns=['locale'],prefix='Hol')
#We can adjust weights here or later
holidays['Hol_Local']=holidays['Hol_Local']*1
holidays['Hol_National']=holidays['Hol_National']*1
holidays['Hol_Regional']=holidays['Hol_Regional']*1

In [14]:
# Looking at local holidays
hol_loc=holidays[holidays['Hol_Local']==1]
hol_loc=hol_loc.rename(columns={'locale_name':'city'})
hol_loc=hol_loc[['date', 'hol_type', 'city', 'description', 'transferred','Hol_Local']]
hol_loc.sample()

Unnamed: 0,date,hol_type,city,description,transferred,Hol_Local
249,2016-05-12,Holiday,Puyo,Cantonizacion del Puyo,False,1


In [15]:
# Looking at Regional holidays
hol_reg=holidays[holidays['Hol_Regional']==1]
hol_reg=hol_reg.rename(columns={'locale_name':'state'})
hol_reg=hol_reg[['date', 'hol_type', 'state', 'description', 'transferred', 'Hol_Regional']]
hol_reg.sample()

Unnamed: 0,date,hol_type,state,description,transferred,Hol_Regional
140,2014-11-07,Holiday,Santa Elena,Provincializacion Santa Elena,False,1


In [16]:
# Looking at National holidays
hol_nat=holidays[holidays['Hol_National']==1]
hol_nat=hol_nat[['date', 'hol_type', 'description', 'transferred', 'Hol_National']]
hol_nat.sample()

Unnamed: 0,date,hol_type,description,transferred,Hol_National
108,2014-06-20,Event,Mundial de futbol Brasil: Ecuador-Honduras,False,1


In [17]:
# Create a map of national holidays
holiday_nat_map = dict(zip(hol_nat['date'], hol_nat['Hol_National']))
holiday_nat_type_map = dict(zip(hol_nat['date'], hol_nat['hol_type']))
holiday_nat_name_map = dict(zip(hol_nat['date'], hol_nat['description']))
holiday_nat_transf_map = dict(zip(hol_nat['date'], hol_nat['transferred']))

# Create a map of regional holidays
holiday_reg_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['Hol_Regional']))
holiday_reg_type_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['hol_type']))
holiday_reg_name_map = dict(zip(zip(hol_reg['date'], hol_reg['state'].str.strip()), hol_reg['description']))
holiday_reg_transf_map = dict(zip(hol_reg['date'], hol_reg['transferred']))

# Create a map of local holidays
holiday_loc_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['Hol_Local']))
holiday_loc_type_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['hol_type']))
holiday_loc_name_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['description']))
holiday_loc_transf_map = dict(zip(zip(hol_loc['date'], hol_loc['city'].str.strip()), hol_loc['transferred']))

In [19]:
duplicates1 = holidays.loc[holidays['locale'] == 'National'].groupby('date').filter(lambda x: len(x) > 1)
duplicates1

Unnamed: 0,date,type,locale,locale_name,description,transferred
35,2012-12-24,Bridge,National,Ecuador,Puente Navidad,False
36,2012-12-24,Additional,National,Ecuador,Navidad-1,False
39,2012-12-31,Bridge,National,Ecuador,Puente Primer dia del ano,False
40,2012-12-31,Additional,National,Ecuador,Primer dia del ano-1,False
156,2014-12-26,Bridge,National,Ecuador,Puente Navidad,False
157,2014-12-26,Additional,National,Ecuador,Navidad+1,False
235,2016-05-01,Holiday,National,Ecuador,Dia del Trabajo,False
236,2016-05-01,Event,National,Ecuador,Terremoto Manabi+15,False
242,2016-05-07,Additional,National,Ecuador,Dia de la Madre-1,False
243,2016-05-07,Event,National,Ecuador,Terremoto Manabi+21,False


In [20]:
duplicates2 = holidays.loc[holidays['locale'] == 'Local'].groupby('date').filter(lambda x: len(x) > 1)
duplicates2

Unnamed: 0,date,type,locale,locale_name,description,transferred
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
59,2013-06-25,Holiday,Local,Machala,Fundacion de Machala,False
60,2013-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
61,2013-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
62,2013-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
110,2014-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
111,2014-06-25,Holiday,Local,Machala,Fundacion de Machala,False


In [21]:
duplicates3 = holidays.loc[holidays['locale'] == 'Regional'].groupby('date').filter(lambda x: len(x) > 1)
duplicates3

Unnamed: 0,date,type,locale,locale_name,description,transferred


# Merging

We first inner join the store and transaction data along the `store_nbr` column.

This guarantees no duplicates or new null values.

In [16]:
# Merge stores with transactions on date and store_nbr
X = stores.merge(transactions, how='inner', on='store_nbr')
X = X.sort_values(by=['date','store_nbr'],axis=0).reset_index(drop=True)
X = X[['date','store_nbr','type','cluster','city','state','transactions']]

Before merging more data, we break down the `date` column into year, month, w|eek, day, and day of week.

In [17]:
X = X.assign(**{'year': pd.Series( [X.date[i].year for i in X.index]), 
            'month': pd.Series( [X.date[i].month for i in X.index]), 
            'week_number': pd.Series( [X.date[i].week for i in X.index]), 
            'day':pd.Series( [X.date[i].day for i in X.index]), 
            'day_of_week': pd.Series( [X.date[i].dayofweek for i in X.index]) })
X = X[['date','year', 'month', 'week_number', 'day', 'day_of_week','store_nbr','type','cluster','city', 'state', 'transactions']]

Next, we  join oil prices along the `date` column.

Since there is one oil price per date and we filled null values in oil, this won't give duplicates or new null values.

In [18]:
X = X.merge(oil, how='left', on='date')
X = X.sort_values(by=['date','store_nbr'],axis=0).reset_index(drop=True)

Finally, we inner join our training data along both `date` and `store_nbr`.

In [19]:
X = X.merge(train, how='left', on=['date','store_nbr'])

Here we join the holiday data using the mappings defined earlier.

In [20]:
# Add empty tranferred and holiday type columns
X['transferred'] = np.nan
X['hol_type'] = np.nan

#Use mappings to fill in the values for national holidays
X['hol_Nat'] = X['date'].map(holiday_nat_map)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
X['hol_Nat_name'] = X['date'].map(holiday_nat_name_map)
X.loc[X['hol_type'].isna(), 'hol_type'] = X['date'].map(holiday_nat_type_map)

# Assign regional holidays based on mapping 
X['hol_Reg'] = X.apply(lambda row: holiday_reg_map.get((row['date'], row['state'])), axis=1)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_reg_transf_map)
X['hol_Reg_name'] = X.apply(lambda row: holiday_reg_name_map.get((row['date'], row['state'])), axis=1)
X.loc[X['hol_type'].isna(), 'hol_type'] = X.loc[X['hol_type'].isna()].apply(lambda row: holiday_reg_type_map.get((row['date'], row['state'])), axis=1)

# Assign local holidays based on mapping 
X['hol_Loc'] = X.apply(lambda row: holiday_loc_map.get((row['date'], row['city'])), axis=1)
X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)

X['transferred'] = X.apply(
    lambda row: holiday_loc_map.get((row['date'], row['city'])) if pd.isna(row['transferred']) else row['transferred'], axis=1
)
X.loc[X['hol_type'].isna(), 'hol_type']= X.loc[X['hol_type'].isna()].apply(lambda row: holiday_loc_type_map.get((row['date'], row['city'])), axis=1)
X['hol_loc_name'] = X.apply(lambda row: holiday_loc_name_map.get((row['date'], row['city'])), axis=1)

# Just fillna
X[['hol_Nat','hol_Reg','hol_Loc']]=X[['hol_Nat','hol_Reg','hol_Loc']].fillna(0)


  X.loc[X['transferred'].isna(), 'transferred'] = X.loc[X['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
  X.loc[X['hol_type'].isna(), 'hol_type'] = X['date'].map(holiday_nat_type_map)


Reorder columns and create boolean columns for each holiday type.

In [21]:
#Separating types of the holidays 
X = pd.get_dummies(X,columns=['hol_type'], prefix='hol_type')

In [22]:
#Reorder columns
X = X[['date', 'year', 'month', 'week_number', 'day', 'day_of_week',
       'store_nbr', 'type', 'cluster', 'city', 'state', 'transactions', 'oil',
       'hol_Nat', 'hol_Nat_name', 'hol_Reg', 'hol_Reg_name', 'hol_Loc','hol_loc_name', 
       'transferred','hol_type_Additional', 'hol_type_Bridge', 'hol_type_Event',
       'hol_type_Holiday', 'hol_type_Transfer', 'hol_type_Work Day',
       'family', 'onpromotion', 'sales']] 

# Merging (Alternative)

This approach to merging starts with the training set, and ensures no rows of the training data are lost. However, as a result of NaN values and missing dates among the other data sets, this might contain some NaN values.

Start with DataFrames containing all dates in the desired time frames.

In [23]:
# DataFrame with all the days
train_dates = pd.DataFrame(pd.date_range(start='1/1/2013', end='8/15/2017',freq='D'), columns=['date'])
test_dates = pd.DataFrame(pd.date_range(start='8/16/2017', end='8/31/2017',freq='D'), columns=['date'])

# Breaking down dates into year, month, day, etc.
train_dates = train_dates.assign(**{'year': pd.Series( [train_dates.date[i].year for i in train_dates.index]), 
            'month': pd.Series( [train_dates.date[i].month for i in train_dates.index]), 
            'week_number': pd.Series( [train_dates.date[i].week for i in train_dates.index]), 
            'day':pd.Series( [train_dates.date[i].day for i in train_dates.index]), 
            'day_of_week': pd.Series( [train_dates.date[i].dayofweek for i in train_dates.index]) })
test_dates = test_dates.assign(**{'year': pd.Series( [test_dates.date[i].year for i in test_dates.index]), 
            'month': pd.Series( [test_dates.date[i].month for i in test_dates.index]), 
            'week_number': pd.Series( [test_dates.date[i].week for i in test_dates.index]), 
            'day':pd.Series( [test_dates.date[i].day for i in test_dates.index]), 
            'day_of_week': pd.Series( [test_dates.date[i].dayofweek for i in test_dates.index]) })

# Reorder columns
train_dates = train_dates[['date','year', 'month', 'week_number', 'day', 'day_of_week']]
test_dates = test_dates[['date','year', 'month', 'week_number', 'day', 'day_of_week']]


Add oil price for each day.

In [24]:
# Add oil price for each day
X_alt = oil.merge(train_dates, how='inner', on='date')
y_alt = oil.merge(test_dates, how='inner', on='date')

Left join training set data (left) with date/oil (right) along the `date` column.

This ensures all training data is kept, while the date DataFrame drops December 25 2013, 2014, 2015, 2016, which are missing from train.csv.

In [25]:
X_alt = train.merge(X_alt, how='left', on='date')           # 3,000,888 rows
y_alt = test.merge(y_alt, how='left', on='date')            # 28,512 rows

Left join merged data (left) with stores (right) along the `store_nbr` column.

In [26]:
X_alt = X_alt.merge(stores, how='left', on='store_nbr')     # 3,000,888 rows
y_alt = y_alt.merge(stores, how='left', on='store_nbr')     # 28,512 rows

Left join merged date (left) with transactions along `date` then `store_nbr`.

Transactions is additionally missing data for Jan 1 and Jan 3 of 2016.

In [27]:
# Note: the testing set has no transaction data, so we can't add that here
X_alt = X_alt.merge(transactions, how='left', on=['date','store_nbr'])

Make a copy of the X_alt

In [28]:
# Make a copy
X_new = X_alt

Finally we add the holiday data using the mappings we constructed.

In [29]:
# Add empty transferred and holiday type columns
X_alt['transferred'] = np.nan
X_alt['hol_type'] = np.nan

#Use mappings to fill in the values for national holidays
X_alt['hol_Nat'] = X_alt['date'].map(holiday_nat_map)
X_alt.loc[X_alt['transferred'].isna(), 'transferred'] = X_alt.loc[X_alt['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
X_alt['hol_Nat_name'] = X_alt['date'].map(holiday_nat_name_map)
X_alt.loc[X_alt['hol_type'].isna(), 'hol_type'] = X_alt['date'].map(holiday_nat_type_map)

# Assign regional holidays based on mapping 
X_alt['hol_Reg'] = X_alt.apply(lambda row: holiday_reg_map.get((row['date'], row['state'])), axis=1)
X_alt.loc[X_alt['transferred'].isna(), 'transferred'] = X_alt.loc[X_alt['transferred'].isna(), 'date'].map(holiday_reg_transf_map)
X_alt['hol_Reg_name'] = X_alt.apply(lambda row: holiday_reg_name_map.get((row['date'], row['state'])), axis=1)
X_alt.loc[X_alt['hol_type'].isna(), 'hol_type'] = X_alt.loc[X_alt['hol_type'].isna()].apply(lambda row: holiday_reg_type_map.get((row['date'], row['state'])), axis=1)

# Assign local holidays based on mapping 
X_alt['hol_Loc'] = X_alt.apply(lambda row: holiday_loc_map.get((row['date'], row['city'])), axis=1)
X_alt.loc[X_alt['transferred'].isna(), 'transferred'] = X_alt.loc[X_alt['transferred'].isna(), 'date'].map(holiday_nat_transf_map)

X_alt['transferred'] = X_alt.apply(
    lambda row: holiday_loc_map.get((row['date'], row['city'])) if pd.isna(row['transferred']) else row['transferred'], axis=1
)
X_alt.loc[X_alt['hol_type'].isna(), 'hol_type']= X_alt.loc[X_alt['hol_type'].isna()].apply(lambda row: holiday_loc_type_map.get((row['date'], row['city'])), axis=1)
X_alt['hol_loc_name'] = X_alt.apply(lambda row: holiday_loc_name_map.get((row['date'], row['city'])), axis=1)

# Just fillna
X_alt[['hol_Nat','hol_Reg','hol_Loc']]=X_alt[['hol_Nat','hol_Reg','hol_Loc']].fillna(0)

  X_alt.loc[X_alt['transferred'].isna(), 'transferred'] = X_alt.loc[X_alt['transferred'].isna(), 'date'].map(holiday_nat_transf_map)
  X_alt.loc[X_alt['hol_type'].isna(), 'hol_type'] = X_alt['date'].map(holiday_nat_type_map)


In [30]:
#Separating types of the holidays 
X_alt = pd.get_dummies(X_alt,columns=['hol_type'], prefix='hol_type')

In [31]:
# Reorder columns
X_alt = X_alt[['date', 'year', 'month', 'week_number', 'day', 'day_of_week', 
       'store_nbr','type', 'cluster', 'city', 'state','transactions','oil',
       'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name',
       'transferred','hol_type_Additional','hol_type_Bridge', 'hol_type_Event',
       'hol_type_Holiday', 'hol_type_Transfer', 'hol_type_Work Day',
       'family', 'onpromotion', 'sales']] 

# Creating csv

In [32]:
X.sample(5) 

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,transferred,hol_type_Additional,hol_type_Bridge,hol_type_Event,hol_type_Holiday,hol_type_Transfer,hol_type_Work Day,family,onpromotion,sales
442595,2013-10-17,2013,10,42,17,3,33,C,3,Quevedo,...,,False,False,False,False,False,False,SEAFOOD,0,3.0
880610,2014-07-29,2014,7,31,29,1,47,A,14,Quito,...,,False,False,False,False,False,False,BREAD/BAKERY,1,750.664
2664371,2017-06-26,2017,6,26,26,0,5,D,4,Santo Domingo,...,,False,False,False,False,False,False,HOME APPLIANCES,0,0.0
2501506,2017-03-26,2017,3,12,26,6,13,C,15,Latacunga,...,,False,False,False,False,False,False,CLEANING,22,597.0
1867116,2016-03-23,2016,3,12,23,2,5,D,4,Santo Domingo,...,,False,False,False,False,False,False,DELI,0,193.844


In [33]:
X_alt.sample(5)

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,transferred,hol_type_Additional,hol_type_Bridge,hol_type_Event,hol_type_Holiday,hol_type_Transfer,hol_type_Work Day,family,onpromotion,sales
2609456,2017-01-08,2017,1,1,8,6,26,D,10,Guayaquil,...,,False,False,False,False,False,False,HARDWARE,0,1.0
630643,2013-12-20,2013,12,51,20,4,53,D,13,Manta,...,,False,False,False,False,False,False,GROCERY II,0,0.0
105558,2013-03-01,2013,3,9,1,4,20,B,6,Quito,...,,False,False,False,False,False,False,MEATS,0,0.0
902169,2014-05-23,2014,5,21,23,4,22,C,7,Puyo,...,,False,False,False,False,False,False,HOME AND KITCHEN I,0,0.0
63978,2013-02-05,2013,2,6,5,1,53,D,13,Manta,...,,False,False,False,False,False,False,MEATS,0,0.0


In [34]:
#X.to_csv("../data/merged_train.csv", index = False)
#X_alt.to_csv("../data/merged_train_alt.csv", index = False)

In [35]:
print(X_alt['transactions'].isnull().sum())
    # X_alt has missing values in transactions, and in the three holiday name columns
    # the holiday day columns correspond to days with no holidays at all
    # note that:
        # number of fewer rows in merged_train == 245, 784 == number of missing transaction values in merged_train_alt 
    # i.e. merged_train has fewer rows, but has all values by skipping days without transaction data
    # while merged_train_alt has all the original rows (3,000,888) but retains the NaN transaction values

245784


## Fix the transfer issue and event label
Starting from X_alt

In [36]:
holidays.transferred.unique()

array([False,  True])

In [None]:
# FIx the map first 
# First the national holidays: 

# Looking at National holidays
hol_nat_new=holidays[ (holidays['Hol_National']==1) & (holidays['hol_type'] != 'Event')]

hol_nat_new=hol_nat_new[['date', 'hol_type', 'description', 'transferred', 'Hol_National']]
hol_nat_new.sample()

# Create a map of national holidays
holiday_nat_map_new = dict(zip(hol_nat_new['date'], hol_nat_new['Hol_National']))
holiday_nat_type_map_new = dict(zip(hol_nat_new['date'], hol_nat_new['hol_type']))
holiday_nat_name_map_new = dict(zip(hol_nat_new['date'], hol_nat_new['description']))
holiday_nat_transf_map_new = dict(zip(hol_nat_new['date'], hol_nat_new['transferred']))

In [38]:
# Then create the df for events

hol_event_new = holidays[ (holidays['Hol_National']==1) & (holidays['hol_type'] == 'Event')]
hol_event_new = hol_event_new[['date', 'hol_type', 'description', 'transferred', 'Hol_National']]

holiday_event_map_new = dict(zip(hol_event_new['date'], hol_event_new['Hol_National']))
holiday_event_type_map_new = dict(zip(hol_event_new['date'], hol_event_new['hol_type']))
holiday_event_name_map_new = dict(zip(hol_event_new['date'], hol_event_new['description']))
holiday_event_transf_map_new = dict(zip(hol_event_new['date'], hol_event_new['transferred']))


In [39]:
# Add empty transferred and holiday type columns
X_new['transferred'] = np.nan
X_new['hol_type'] = np.nan

#Use mappings to fill in the values for national holidays
# Exclude national events. There're overlaping
X_new['hol_Nat'] = X_new['date'].map(holiday_nat_map_new)
X_new.loc[X_new['transferred'].isna(), 'transferred'] = X_new.loc[X_new['transferred'].isna(), 'date'].map(holiday_nat_transf_map_new)
X_new['hol_Nat_name'] = X_new['date'].map(holiday_nat_name_map_new)
X_new.loc[X_new['hol_type'].isna(), 'hol_type'] = X_new['date'].map(holiday_nat_type_map_new)

# Add the event: 
X_new['event'] = X_new['date'].map(holiday_event_map_new)
X_new.loc[X_new['transferred'].isna(), 'transferred'] = X_new.loc[X_new['transferred'].isna(), 'date'].map(holiday_event_transf_map_new)
X_new['hol_event_name'] = X_new['date'].map(holiday_event_name_map_new)
X_new.loc[X_new['hol_type'].isna(), 'hol_type'] = X_new['date'].map(holiday_event_type_map_new)

# Assign regional holidays based on mapping 
X_new['hol_Reg'] = X_new.apply(lambda row: holiday_reg_map.get((row['date'], row['state'])), axis=1)
X_new.loc[X_new['transferred'].isna(), 'transferred'] = X_new.loc[X_new['transferred'].isna(), 'date'].map(holiday_reg_transf_map)
X_new['hol_Reg_name'] = X_new.apply(lambda row: holiday_reg_name_map.get((row['date'], row['state'])), axis=1)
X_new.loc[X_new['hol_type'].isna(), 'hol_type'] = X_new.loc[X_new['hol_type'].isna()].apply(lambda row: holiday_reg_type_map.get((row['date'], row['state'])), axis=1)

# Assign local holidays based on mapping 
X_new['hol_Loc'] = X_new.apply(lambda row: holiday_loc_map.get((row['date'], row['city'])), axis=1)
X_new.loc[X_new['transferred'].isna(), 'transferred'] = X_new.loc[X_new['transferred'].isna(), 'date'].map(holiday_loc_transf_map)
'''
X_new['transferred'] = X_new.apply(
    lambda row: holiday_loc_map.get((row['date'], row['city']) ) if pd.isna(row['transferred']) else row['transferred'], axis=1
)
'''
X_new.loc[X_new['hol_type'].isna(), 'hol_type']= X_new.loc[X_new['hol_type'].isna()].apply(lambda row: holiday_loc_type_map.get((row['date'], row['city'])), axis=1)
X_new['hol_loc_name'] = X_new.apply(lambda row: holiday_loc_name_map.get((row['date'], row['city'])), axis=1)

# Just fillna
X_new[['hol_Nat','hol_Reg','hol_Loc','event']]=X_new[['hol_Nat','hol_Reg','hol_Loc','event']].fillna(0)

X_new['transferred'] = X_new['transferred'].fillna(False)


  X_new.loc[X_new['transferred'].isna(), 'transferred'] = X_new.loc[X_new['transferred'].isna(), 'date'].map(holiday_nat_transf_map_new)
  X_new.loc[X_new['hol_type'].isna(), 'hol_type'] = X_new['date'].map(holiday_nat_type_map_new)
  X_new['transferred'] = X_new['transferred'].fillna(False)


In [40]:
#Separating types of the holidays 
X_new = pd.get_dummies(X_new,columns=['hol_type'], prefix='hol_type')

In [41]:
X_new.transferred.unique()

array([False,  True])

In [42]:
X_new.columns

Index(['date', 'store_nbr', 'family', 'sales', 'onpromotion', 'oil', 'year',
       'month', 'week_number', 'day', 'day_of_week', 'city', 'state', 'type',
       'cluster', 'transactions', 'transferred', 'hol_Nat', 'hol_Nat_name',
       'hol_Reg', 'hol_Reg_name', 'hol_Loc', 'hol_loc_name', 'event',
       'hol_event_name', 'hol_type_Additional', 'hol_type_Bridge',
       'hol_type_Event', 'hol_type_Holiday', 'hol_type_Transfer',
       'hol_type_Work Day'],
      dtype='object')

In [43]:
# Reorder columns
X_new = X_new[['date', 'year', 'month', 'week_number', 'day', 'day_of_week', 
       'store_nbr','type', 'cluster', 'city', 'state','transactions','oil',
       'hol_Nat','hol_Nat_name',  'hol_Reg','hol_Reg_name','hol_Loc','hol_loc_name',
       'event','hol_event_name','transferred','hol_type_Additional','hol_type_Bridge',
       'hol_type_Event','hol_type_Holiday', 'hol_type_Transfer', 'hol_type_Work Day',
       'family', 'onpromotion', 'sales']] 

In [44]:
# Import the summary function to exam the data frame 

import sys
import os

# Get the current folder path
current_folder = os.getcwd()
# Add the parent directory to sys.path
parent_folder = os.path.abspath(os.path.join(current_folder, ".."))
sys.path.append(parent_folder)

# Import the utility module
import utility  # Now you can use utility.py as a module import utility

In [45]:
utility.summary(X_new)

data shape: (3000888, 31)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,datetime64[ns],0,0.0,1684,2013-01-01 00:00:00,2017-08-15 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00,2013-01-01 00:00:00
year,int64,0,0.0,5,2013.0,2017.0,2013,2013,2013
month,int64,0,0.0,12,1.0,12.0,1,1,1
week_number,int64,0,0.0,53,1.0,53.0,1,1,1
day,int64,0,0.0,31,1.0,31.0,1,1,1
day_of_week,int64,0,0.0,7,0.0,6.0,1,1,1
store_nbr,int64,0,0.0,54,1.0,54.0,1,1,1
type,object,0,0.0,5,,,D,D,D
cluster,int64,0,0.0,17,1.0,17.0,13,13,13
city,object,0,0.0,22,,,Quito,Quito,Quito






In [47]:
X_new.sample(5)

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,transferred,hol_type_Additional,hol_type_Bridge,hol_type_Event,hol_type_Holiday,hol_type_Transfer,hol_type_Work Day,family,onpromotion,sales
1101151,2014-09-11,2014,9,37,11,3,6,D,13,Quito,...,False,False,False,False,False,False,False,CLEANING,1,792.0
358974,2013-07-21,2013,7,29,21,6,31,B,10,Babahoyo,...,False,False,False,False,False,False,False,AUTOMOTIVE,0,6.0
351289,2013-07-17,2013,7,29,17,2,16,C,3,Santo Domingo,...,False,False,False,False,False,False,False,BOOKS,0,0.0
1381689,2015-02-17,2015,2,8,17,1,27,D,1,Daule,...,False,False,False,False,True,False,False,GROCERY I,6,4789.0
920842,2014-06-02,2014,6,23,2,0,46,A,14,Quito,...,False,False,False,False,False,False,False,EGGS,0,224.0


In [48]:
X_new.to_csv("../data/merged_train_alt_new.csv")