In [1]:
import calendar

import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('./data/train.csv', parse_dates=['date'])
test = pd.read_csv('./data/test.csv', parse_dates=['date'])
stores = pd.read_csv('./data/stores.csv')
transactions = pd.read_csv('./data/transactions.csv', parse_dates=['date'])
oil = pd.read_csv('./data/oil.csv', parse_dates=['date'])
holidays0 = pd.read_csv('./data/holidays_events.csv', parse_dates=['date'])

In [3]:
data = pd.concat([train, test])

In [4]:
test_border_date = test.date.unique()[0]

In [5]:
data.shape

(3029400, 6)

In [6]:
result0 = data.merge(
    stores,
    on='store_nbr',
)

In [7]:
result0.shape

(3029400, 10)

In [8]:
result1 = result0.merge(
    transactions,
    on=['date', 'store_nbr'],
    how='left',
)
result1.shape

(3029400, 11)

In [9]:
all_dates = sorted(result1.date.unique())

In [10]:
oil2_data = []
prev_item = None
for oil_date in all_dates:
    oil_item = oil[oil.date == oil_date]
    if oil_item.shape[0] and not np.isnan(oil_item.iloc[0].dcoilwtico):
        oil2_data.append({
            'date': oil_date,
            'oil_price': oil_item.iloc[0].dcoilwtico,
        })
        prev_item = oil_item
    elif prev_item is None:
        x = oil.iloc[1]
        oil2_data.append({
            'date': oil_date,
            'oil_price': x.dcoilwtico,
        })
    else:
        oil2_data.append({
            'date': oil_date,
            'oil_price': prev_item.iloc[0].dcoilwtico,
        })
oil2 = pd.DataFrame(oil2_data)

In [11]:
oil2.head()

Unnamed: 0,date,oil_price
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-05,93.12


In [12]:
oil2.tail()

Unnamed: 0,date,oil_price
1695,2017-08-27,47.65
1696,2017-08-28,46.4
1697,2017-08-29,46.46
1698,2017-08-30,45.96
1699,2017-08-31,47.26


In [13]:
result2 = result1.merge(
    oil2,
    on='date',
    how='left',
)
result2.shape

(3029400, 12)

In [14]:
holidays = holidays0[(holidays0.transferred == False) & (holidays0.type != 'Work Day')].copy()
holidays.shape

(333, 6)

In [15]:
work_days = holidays0[holidays0.type == 'Work Day'].groupby('date').first()
work_days.rename({'description': 'work_day_name'}, axis=1, inplace=True)
work_days.shape

(5, 5)

In [16]:
events = holidays[holidays.type == 'Event']
earthquake = events[events.description.str.startswith('Terremoto Manabi')].groupby('date').first()
other_events = events[events.description.str.startswith('Terremoto Manabi') == False].groupby('date').first()

earthquake.rename({'description': 'earthquake'}, axis=1, inplace=True)
other_events.rename({'description': 'event_name'}, axis=1, inplace=True)

print(earthquake.shape)
print(other_events.shape)

(31, 5)
(25, 5)


In [17]:
holiday_events = holidays[holidays.type != 'Event']
holiday_events.shape

(277, 6)

In [18]:
national_events = holiday_events[holiday_events.locale == 'National'].groupby('date').first()
national_events.rename({'description': 'national_holiday'}, axis=1, inplace=True)
national_events.shape

(102, 5)

In [19]:
regional_events = holiday_events[holiday_events.locale == 'Regional'].groupby('date').first()
regional_events.rename({
    'description': 'regional_holiday',
    'locale_name': 'state',
}, axis=1, inplace=True)
regional_events.shape

(24, 5)

In [20]:
local_events = holiday_events[holiday_events.locale == 'Local'].groupby('date').first()
local_events.rename({
    'description': 'local_holiday',
    'locale_name': 'city',
}, axis=1, inplace=True)
local_events.shape

(134, 5)

In [21]:
result3 = result2.merge(
    work_days[['work_day_name']],
    on='date',
    how='left',
).merge(
    earthquake[['earthquake']],
    on='date',
    how='left',
).merge(
    other_events[['event_name']],
    on='date',
    how='left',
).merge(
    national_events[['national_holiday']],
    on='date',
    how='left'
).merge(
    regional_events[['regional_holiday', 'state']],
    on=['date', 'state'],
    how='left',
).merge(
    local_events[['local_holiday', 'city']],
    on=['date', 'city'],
    how='left',
)
result3.shape

(3029400, 18)

In [22]:
result3.earthquake.fillna('', inplace=True)
result3.event_name.fillna('', inplace=True)
result3.national_holiday.fillna('', inplace=True)
result3.regional_holiday.fillna('', inplace=True)
result3.local_holiday.fillna('', inplace=True)
result3.work_day_name.fillna('', inplace=True)

In [23]:
result3.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions,oil_price,work_day_name,earthquake,event_name,national_holiday,regional_holiday,local_holiday
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,93.14,,,,Primer dia del ano,,
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,93.14,,,,Primer dia del ano,,
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,93.14,,,,Primer dia del ano,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,93.14,,,,Primer dia del ano,,
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,93.14,,,,Primer dia del ano,,


In [24]:
result3['year'] = result3.date.apply(lambda x: x.year)
result3['month'] = result3.date.apply(lambda x: x.month)
result3['day'] = result3.date.apply(lambda x: x.day)
result3['day_of_week'] = result3.date.apply(lambda x: x.day_of_week)

In [25]:
result3['eom'] = result3.date.map(lambda x: calendar.monthrange(x.year, x.month)[1])
result3['payday'] = (result3.day == 15) | (result3.day == result3.eom) 
result3.drop('eom', axis=1, inplace=True)

In [26]:
train_result = result3[result3.date < test_border_date].copy()

In [27]:
test_result = result3[result3.date >= test_border_date].copy()
test_result.drop('sales', axis=1, inplace=True)

In [28]:
train_result.to_csv('data/train_merged.csv', index=False)

In [29]:
test_result.to_csv('data/test_merged.csv', index=False)