In [1]:
import pandas as pd
from collections import Counter

In [2]:
holidays_events_df = pd.read_csv('data/holidays_events.csv')
holidays_events_df

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False


In [3]:
# Check for NaN we may need to address

null_holiday_df = holidays_events_df.isnull()
null_holiday_df.describe()

Unnamed: 0,date,type,locale,locale_name,description,transferred
count,350,350,350,350,350,350
unique,1,1,1,1,1,1
top,False,False,False,False,False,False
freq,350,350,350,350,350,350


In [4]:
holidays_events_df.describe(include='all')

Unnamed: 0,date,type,locale,locale_name,description,transferred
count,350,350,350,350,350,350
unique,312,6,3,24,103,2
top,2014-06-25,Holiday,National,Ecuador,Carnaval,False
freq,4,221,174,174,10,338


In [5]:
# There are 350 description entries, and 174 entries occur only once.  This detail seems to have very little 
# correlation, and if 1-hot encoded may blow up our data so large that it will make our neural network slow down.
# Initially we will omit these to keep our neural network fast.

Counter(holidays_events_df['description'].value_counts())

Counter({1: 46, 2: 6, 3: 3, 4: 1, 5: 9, 6: 35, 7: 2, 10: 1})

In [6]:
holidays_events_df['description'].value_counts()

Carnaval                                           10
Fundacion de Ibarra                                 7
Fundacion de Cuenca                                 7
Cantonizacion de Libertad                           6
Cantonizacion de Latacunga                          6
Navidad+1                                           6
Fundacion de Santo Domingo                          6
Primer Grito de Independencia                       6
Cantonizacion del Puyo                              6
Navidad-2                                           6
Cantonizacion de Quevedo                            6
Provincializacion de Cotopaxi                       6
Fundacion de Loja                                   6
Fundacion de Machala                                6
Fundacion de Quito                                  6
Dia de Difuntos                                     6
Navidad-3                                           6
Independencia de Latacunga                          6
Cantonizacion de Salinas    

In [7]:
# Let's look at the impact locale names have on 

Counter(holidays_events_df['description'].value_counts())

Counter({1: 46, 2: 6, 3: 3, 4: 1, 5: 9, 6: 35, 7: 2, 10: 1})

In [8]:
# Though a majority of holidays occur in Ecuador, no locale_name occurs only once, so let's keep this to infuence
# our network.

holidays_events_df['locale_name'].value_counts()

Ecuador                           174
Quito                              13
Ambato                             12
Riobamba                           12
Latacunga                          12
Guaranda                           12
Guayaquil                          11
Ibarra                              7
Cuenca                              7
Machala                             6
Imbabura                            6
Santo Domingo de los Tsachilas      6
Quevedo                             6
Manta                               6
Cayambe                             6
Loja                                6
Puyo                                6
El Carmen                           6
Santa Elena                         6
Esmeraldas                          6
Libertad                            6
Santo Domingo                       6
Cotopaxi                            6
Salinas                             6
Name: locale_name, dtype: int64

In [9]:
holidays_events_df.drop(['description'], axis=1, inplace=True)
holidays_events_df.head()

Unnamed: 0,date,type,locale,locale_name,transferred
0,2012-03-02,Holiday,Local,Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,False


In [10]:
# 1 hot encode features to lightly influence output, where 
# larger integers may have a more significant impact on results

type_df = pd.get_dummies(holidays_events_df['type'], prefix='type_')
locale_df = pd.get_dummies(holidays_events_df['locale'], prefix='locale_')
locale_name_df = pd.get_dummies(holidays_events_df['locale_name'], prefix='locale_name_')
transferred_df = pd.get_dummies(holidays_events_df['transferred'], drop_first=True)
transferred_df.rename(columns={True:'transferred'}, inplace=True)

In [11]:
holidays_events_df = holidays_events_df.merge(type_df, left_index=True, right_index=True)
holidays_events_df.drop('type', axis=1, inplace=True)
holidays_events_df = holidays_events_df.merge(locale_df, left_index=True, right_index=True)
holidays_events_df.drop('locale', axis=1, inplace=True)
holidays_events_df = holidays_events_df.merge(locale_name_df, left_index=True, right_index=True)
holidays_events_df.drop('locale_name', axis=1, inplace=True)
holidays_events_df.drop('transferred', axis=1, inplace=True)
holidays_events_df = holidays_events_df.merge(transferred_df, left_index=True, right_index=True)

holidays_events_df

Unnamed: 0,date,type__Additional,type__Bridge,type__Event,type__Holiday,type__Transfer,type__Work Day,locale__Local,locale__National,locale__Regional,...,locale_name__Manta,locale_name__Puyo,locale_name__Quevedo,locale_name__Quito,locale_name__Riobamba,locale_name__Salinas,locale_name__Santa Elena,locale_name__Santo Domingo,locale_name__Santo Domingo de los Tsachilas,transferred
0,2012-03-02,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2012-04-01,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2012-04-12,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2012-04-14,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012-04-21,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,2012-05-12,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
6,2012-06-23,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2012-06-25,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,2012-06-25,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2012-06-25,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# clear memory
del type_df, locale_df, locale_name_df, transferred_df

In [13]:
# Check for duplicate holiday dates, but keep the first one because that is how many we want when we merge duplicate
# dates together
holiday_duplicates = holidays_events_df.duplicated('date', keep='first')
holiday_duplicates

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8       True
9       True
10     False
11      True
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
320    False
321    False
322    False
323    False
324    False
325    False
326    False
327    False
328    False
329    False
330    False
331    False
332    False
333    False
334    False
335    False
336    False
337    False
338    False
339    False
340    False
341    False
342     True
343    False
344    False
345     True
346    False
347    False
348    False
349    False
Length: 350, dtype: bool

In [14]:
Counter(holiday_duplicates)

Counter({False: 312, True: 38})

In [15]:
# if dates are the same, go through the columns, replace '0' with 1/max(), 
holidays_events_df = holidays_events_df.groupby('date').agg(lambda x: x.max()).reset_index()
holidays_events_df

Unnamed: 0,date,type__Additional,type__Bridge,type__Event,type__Holiday,type__Transfer,type__Work Day,locale__Local,locale__National,locale__Regional,...,locale_name__Manta,locale_name__Puyo,locale_name__Quevedo,locale_name__Quito,locale_name__Riobamba,locale_name__Salinas,locale_name__Santa Elena,locale_name__Santo Domingo,locale_name__Santo Domingo de los Tsachilas,transferred
0,2012-03-02,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2012-04-01,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2012-04-12,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2012-04-14,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012-04-21,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,2012-05-12,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
6,2012-06-23,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2012-06-25,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8,2012-07-03,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,2012-07-23,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
'''
holiday_events_df = holidays_events_df.groupby('date').apply(lambda x: x.\
                                        fillna(method='ffill').\
                                        fillna(method='bfill').\
                                        drop_duplicates()).\
                                        reset_index(drop=True).\
                                        set_index('date')

holiday_events_df.reset_index(inplace=True)
'''

"\nholiday_events_df = holidays_events_df.groupby('date').apply(lambda x: x.                                        fillna(method='ffill').                                        fillna(method='bfill').                                        drop_duplicates()).                                        reset_index(drop=True).                                        set_index('date')\n\nholiday_events_df.reset_index(inplace=True)\n"

In [17]:
holiday_duplicates = holidays_events_df.duplicated('date', keep=False)
Counter(holiday_duplicates)

Counter({False: 312})

In [18]:
train_dates = pd.read_csv('modified_data/train_dates.csv')
test_dates = pd.read_csv('modified_data/test_dates.csv')

In [19]:
train_holidays = train_dates.merge(holidays_events_df, on='date', how='left')
train_holidays.fillna(0, inplace=True)
train_holidays

Unnamed: 0,date,month,day,year,type__Additional,type__Bridge,type__Event,type__Holiday,type__Transfer,type__Work Day,...,locale_name__Manta,locale_name__Puyo,locale_name__Quevedo,locale_name__Quito,locale_name__Riobamba,locale_name__Salinas,locale_name__Santa Elena,locale_name__Santo Domingo,locale_name__Santo Domingo de los Tsachilas,transferred
0,2013-01-01,1,1,2013,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-01-02,1,2,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-01-03,1,3,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-01-04,1,4,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-01-05,1,5,2013,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2013-01-06,1,6,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2013-01-07,1,7,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2013-01-08,1,8,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2013-01-09,1,9,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2013-01-10,1,10,2013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
test_holidays = test_dates.merge(holidays_events_df, on='date', how='left')
test_holidays.fillna(0, inplace=True)
test_holidays

Unnamed: 0,date,month,day,year,type__Additional,type__Bridge,type__Event,type__Holiday,type__Transfer,type__Work Day,...,locale_name__Manta,locale_name__Puyo,locale_name__Quevedo,locale_name__Quito,locale_name__Riobamba,locale_name__Salinas,locale_name__Santa Elena,locale_name__Santo Domingo,locale_name__Santo Domingo de los Tsachilas,transferred
0,2017-08-16,8,16,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-08-17,8,17,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-08-18,8,18,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-08-19,8,19,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-08-20,8,20,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2017-08-21,8,21,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2017-08-22,8,22,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2017-08-23,8,23,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2017-08-24,8,24,2017,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2017-08-25,8,25,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
train_holidays.to_csv('modified_data/train_holidays.csv', index=False)
test_holidays.to_csv('modified_data/test_holidays.csv', index=False)