In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load every raw CSV once, keyed by dataset name
datasets = {
    'train': pd.read_csv('../data/train.csv'),
    'test': pd.read_csv('../data/test.csv'),
    'stores': pd.read_csv('../data/stores.csv'),
    'transactions': pd.read_csv('../data/transactions.csv'),
    'oil': pd.read_csv('../data/oil.csv'),
    'holidays': pd.read_csv('../data/holidays_events.csv'),
}

# Convenience names: each one is the very same DataFrame object as its dictionary entry
train = datasets['train']
test = datasets['test']
stores = datasets['stores']
transactions = datasets['transactions']
oil = datasets['oil']
holidays = datasets['holidays']

# Cleaning

### Standardizing dates

We convert any dates in the datasets to pandas Timestamps.

In [3]:
# Parse the 'date' column of every dataset that has one into pandas Timestamps.
# The frames are modified in place, so the module-level names see the change too.
for frame in datasets.values():
    if 'date' not in frame.columns:
        continue
    frame['date'] = pd.to_datetime(frame['date'])

### Training Set

We drop the `id` column from the training set. The preliminary analysis proved this is just a redundant row indexer.

In [4]:
# Drop the redundant `id` row indexer.
# errors='ignore' keeps this cell idempotent: re-running it once `id` is already gone
# is a no-op instead of a KeyError.
# Note: `drop` returns a new frame, so datasets['train'] still refers to the frame with `id`.
train = train.drop(columns='id', errors='ignore')

### Oil Dataset

The preliminary analysis shows 43 missing daily oil price values and missing weekend oil price values. 

We first add the weekend days as rows with null prices, then fill every gap by linear interpolation.

Also, the oil price for the very first day (2013-01-01) is missing. We add it manually here as the average of the oil prices from 2012-12-31 and 2013-01-02, taken from (https://fred.stlouisfed.org/data/DCOILWTICO). We separately verified that this is safe to do, since these oil prices match the ones in the oil dataset.

In [5]:
# Manually add the oil price for the first day (2013-01-01), using the average of the
# 2012-12-31 and 2013-01-02 oil prices.
# The cell is addressed by date and column label rather than by position, so the value
# cannot land on the wrong row/column if the file's ordering ever differs.
first_day = pd.to_datetime(oil['date']) == pd.Timestamp('2013-01-01')
assert first_day.sum() == 1, "expected exactly one 2013-01-01 row in the oil dataset"
oil.loc[first_day, 'dcoilwtico'] = 92.485

In [6]:
# Daily calendar spanning the whole desired range, weekends included
calendar = pd.DataFrame({'date': pd.date_range(start='1/1/2013', end='8/31/2017', freq='D')})

# Left-merging the calendar with oil adds the dates absent from oil (e.g. weekends)
# as rows with a null price
oil = calendar.merge(oil, how='left', on='date')

# Fill every null price by interpolation
# (all but possibly one of the gaps are of size 1, 2, or 3)
oil = oil.assign(dcoilwtico=oil['dcoilwtico'].interpolate())

### Holiday Dataset

From the preliminary analysis, two transferred holidays need their description updated so that all transfer holidays have consistent formatting.

In [7]:
# Index label -> corrected description for the two inconsistently formatted transfer holidays
for row_label, fixed_description in {
    304: 'Traslado Fundacion de Cuenca',
    329: 'Traslado Fundacion de Ibarra',
}.items():
    holidays.loc[row_label, 'description'] = fixed_description

There were also a couple of mislabelled Additional Holidays, and one that should be deleted for redundancy.

In [8]:
# Show the row with index label 264 (it is dropped in the next cell)
holidays.loc[264, :]

date                2016-07-24 00:00:00
type                         Additional
locale                            Local
locale_name                   Guayaquil
description    Fundacion de Guayaquil-1
transferred                       False
Name: 264, dtype: object

In [9]:
# Correct the two mislabelled holiday types (rows are addressed by index label)
holidays.loc[182, 'type'] = 'Additional'
holidays.loc[322, 'type'] = 'Holiday'

# Remove the redundant row 264.
# errors='ignore' keeps this cell idempotent: re-running it after the row is already
# gone is a no-op instead of a KeyError.
holidays = holidays.drop(index=264, errors='ignore')

# Merging

We first inner join the store and transaction data along the `store_nbr` column.

This guarantees no duplicates or new null values.

In [10]:
# Attach the store attributes to every transactions row (the join key is store_nbr only),
# order the rows by date then store, and keep the columns in a fixed order
X = (
    stores.merge(transactions, how='inner', on='store_nbr')
    .sort_values(by=['date', 'store_nbr'], axis=0)
    .reset_index(drop=True)
    .loc[:, ['date', 'store_nbr', 'type', 'cluster', 'city', 'state', 'transactions']]
)

Before merging more data, we break down the `date` column into year, month, week, day, and day of week.

In [11]:
# Break `date` down into calendar parts using the vectorised `.dt` accessor.
# This replaces five Python-level loops that looked up X.date[i] row by row, and the new
# columns are aligned on X's own index instead of relying on X having a default RangeIndex.
# Each part is cast to int64 to match the dtype the list-based version produced;
# isocalendar().week is the ISO week number, the same value as Timestamp.week.
X = X.assign(
    year=X['date'].dt.year.astype('int64'),
    month=X['date'].dt.month.astype('int64'),
    week_number=X['date'].dt.isocalendar().week.astype('int64'),
    day=X['date'].dt.day.astype('int64'),
    day_of_week=X['date'].dt.dayofweek.astype('int64'),
)

# Put the date parts right after `date`, ahead of the store and transaction columns
X = X[['date','year', 'month', 'week_number', 'day', 'day_of_week','store_nbr','type','cluster','city','state','transactions']]

Next, we left join oil prices along the `date` column, which keeps every row of `X`.

Since there is one oil price per date and we filled null values in oil, this won't give duplicates or new null values.

In [12]:
# Add the oil price for each row's date (left join keeps every row of X),
# then restore the date/store ordering and a clean 0..n-1 index
X = (
    X.merge(oil, how='left', on='date')
    .sort_values(by=['date', 'store_nbr'], axis=0)
    .reset_index(drop=True)
)

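Here we join the holiday data. **TODO:** this step is not implemented yet, so the merged dataset saved below contains no holiday features.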

In [13]:
# TODO: merge the cleaned `holidays` data into X -- join strategy still to be determined

Finally, we inner join our training data along both `date` and `store_nbr`.

In [14]:
# Keep only the (date, store_nbr) pairs present in both X and train, adding train's columns
X = pd.merge(X, train, how='inner', on=['date', 'store_nbr'])

In [15]:
# Persist the merged training table; the row index is not written to the file
X.to_csv(path_or_buf="../data/merged_train.csv", index=False)