In [1]:
# Show the kernel's current working directory; the relative "input/..." paths
# read further down are resolved against it.
pwd

u'/Users/jfdarre/Documents/NYCDS/Project4/KaggleProject'

## Data munging starts: 
### Importing packages and data

In [95]:
import datetime
import numpy as np
from numpy import double
import pandas as pd

In [96]:
# Read the Rossmann-provided train and test sets from the local input/ folder.
print("Loading data start...")
train, test = pd.read_csv("input/train.csv"), pd.read_csv("input/test.csv")
print("Complete!")

Loading data start...
Complete!   : )


In [132]:
# Read the external (non-competition) tables: store metadata, the store -> state
# mapping, and the economic indicators.
print("Loading data start...")
store, states, eco_data = (
    pd.read_csv("input/store.csv"),
    pd.read_csv("input/store_states.csv"),
    pd.read_csv("input/economic_data.csv"),
)
print("Complete!")

Loading data start...
Complete!


## Data munging part I:
### Clean up, merges and dates features

In [97]:
# 1: Impute Open = 1 wherever it is missing in the test data
#    (per the original note, this concerns store 622).
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the test['Open'] selection: an in-place fill on a column selection is not
# guaranteed to reach the parent frame (pandas copy-on-write).
test['Open'] = test['Open'].fillna(1)
print('Step 1 Complete.')

Step 1 Complete.


In [98]:
# 2: Stack train and test into one frame so every feature is built once for both.
#    The 'type' column records which set each row came from.
train['type'] = 'train'
test['type'] = 'test'
# pd.concat keeps each frame's own row labels (ignore_index is not set), so the
# combined index contains repeated labels.
all_data = pd.concat([train, test])
print('Step 2 Complete')

Step 2 Complete


In [99]:
# 3: Split the 'YYYY-MM-DD' Date string into integer year / month / day columns.
#    Date must still be a string at this point (step 4 converts it to datetime).
# Split every date once and pick the pieces by position, instead of re-splitting
# the same string in three separate apply() passes.
date_parts = all_data['Date'].str.split('-')
for part_position, part_name in enumerate(['year', 'month', 'day']):
    all_data[part_name] = date_parts.str[part_position].astype(int)
print('Step 3 Complete.')

Step 3 Complete.


In [100]:
# 4: Derive calendar features from Date.
#    Date is converted from string to datetime here, so the string splitting in
#    step 3 has to run before this cell.
all_data['Date']             = pd.to_datetime(all_data['Date'])
all_data['day_of_year']      = all_data['Date'].dt.dayofyear
all_data['quarter']          = all_data['Date'].dt.quarter
# The boolean calendar flags are stored as 0/1 integers.
all_data['is_month_start']   = all_data['Date'].dt.is_month_start.astype(int)
all_data['is_month_end']     = all_data['Date'].dt.is_month_end.astype(int)
all_data['is_quarter_start'] = all_data['Date'].dt.is_quarter_start.astype(int)
all_data['is_quarter_end']   = all_data['Date'].dt.is_quarter_end.astype(int)
# weeknum is computed as ceil(day_of_year / 7), i.e. consecutive 7-day buckets
# counted from January 1st, and is stored as a float.
all_data['weeknum']          = np.ceil(all_data['day_of_year']/7.)
print('Step 4 Complete.')

Step 4 Complete.


In [101]:
# 5: Standardize the StateHoliday column: entries equal to the integer 0 are
#    replaced by the string '0' so the column uses a single representation.
# .loc replaces the deprecated .ix indexer (boolean row mask + column label).
all_data.loc[all_data['StateHoliday'] == 0, 'StateHoliday'] = '0'
print('Step 5 Complete.')

Step 5 Complete.


In [102]:
# 6: Set 'Open' to 0 on every row whose Sales are exactly 0.
#    Rows with missing Sales do not satisfy `== 0` and are left untouched.
# .loc replaces the deprecated .ix indexer (boolean row mask + column label).
all_data.loc[all_data['Sales'] == 0, 'Open'] = 0
print('Step 6 Complete.')

Step 6 Complete.


In [103]:
# 7: Flag the first day of each promo run (PromoFirstDate = 1, otherwise 0).
# Rows are ordered by store and then date so that shift(1) refers to the
# previous row of the same store. sort_values(by=...) replaces the removed
# DataFrame.sort(columns=...).
all_data = all_data.sort_values(by=['Store', 'Date'], ascending=True, na_position='last')
all_data['PromoFirstDate'] = 0
# A row starts a promo run when the previous row belongs to the same store and
# Promo switches from 0 to 1. A promo already running on a store's first row is
# therefore not flagged.
same_store_as_previous = all_data['Store'] == all_data['Store'].shift(1)
promo_just_started = (all_data['Promo'] == 1) & (all_data['Promo'].shift(1) == 0)
# .loc replaces the deprecated .ix indexer.
all_data.loc[same_store_as_previous & promo_just_started, 'PromoFirstDate'] = 1
print('Step 7 Complete.')

Step 7 Complete.


In [104]:
# 8: Attach the columns of the `states` table to every row, matching on 'Store'.
#    This is an inner join: rows whose Store has no entry in `states` are dropped.
all_data = pd.merge(all_data, states, on = 'Store', how = 'inner')
print('Step 8 Complete.')

Step 8 Complete.


## Data munging part II: 
### Adding some useful functions:

In [105]:
# f1: Creating a function to summarize our data
def rstr(df):
    """Print a quick structural summary of a DataFrame.

    Writes to standard output, in order:
      1. the frame's shape,
      2. the distinct values found in each column,
      3. for each column, whether it contains any missing value,
    with a line of 72 '=' characters between the sections.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize. It is not modified.

    Returns
    -------
    None
    """
    separator = '=' * 72
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(df.shape)
    print(separator)
    # Wrapping the unique values in a list keeps each column's values together
    # as a single entry of the result.
    print(df.apply(lambda x: [x.unique()]))
    print(separator)
    print(pd.isnull(df).any())
print('function 1: rstr added')

function 1: rstr added


In [106]:
# f2: Creating a function to shift columns and making sure to distinguish between stores
def shift_col(df, col_name, n, group_col='Store'):
    """Add a shifted copy of a column without leaking values across groups.

    A new column named ``col_name + str(n)`` (e.g. 'Sales2', 'Sales-1') is
    added to ``df`` in place. It holds ``df[col_name]`` shifted by ``n`` rows;
    every row whose ``group_col`` value differs from the ``group_col`` value
    ``n`` rows away is set to NaN, so a value is never carried over from a
    different group.

    The comparison is purely positional: rows of one group are expected to be
    contiguous and already in the desired order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; must contain ``col_name`` and ``group_col``.
    col_name : str
        Column to shift.
    n : int
        Number of rows to shift by. Positive values look backwards (previous
        rows), negative values look forwards (following rows).
    group_col : str, optional
        Column identifying the groups that must not be mixed. Defaults to
        'Store'.

    Returns
    -------
    None
    """
    shifted_name = col_name + str(n)
    df[shifted_name] = df[col_name].shift(n)
    # Rows at a group boundary would otherwise receive another group's value.
    crosses_group = df[group_col] != df[group_col].shift(n)
    # .loc replaces the deprecated .ix indexer (boolean row mask + column label).
    df.loc[crosses_group, shifted_name] = float('NaN')
print('function 2: shif_col added')

function 2: shif_col added


In [111]:
# f2.1: Small worked example of shift_col: two stores with five sales each,
#       shifted two rows back and one row forward.
temp = pd.DataFrame({
    'Sales': list(range(1, 6)) * 2,
    'Store': [1] * 5 + [2] * 5,
})
for demo_offset in (2, -1):
    shift_col(temp, 'Sales', demo_offset)
temp

Unnamed: 0,Sales,Store,Sales2,Sales-1
0,1,1,,2.0
1,2,1,,3.0
2,3,1,1.0,4.0
3,4,1,2.0,5.0
4,5,1,3.0,
5,1,2,,2.0
6,2,2,,3.0
7,3,2,1.0,4.0
8,4,2,2.0,5.0
9,5,2,3.0,


## Data munging part III: 
### Adding Schedule and historic Sales

In [113]:
# 9: Add each store's Sales from 1 to 21 rows earlier (columns Sales1 ... Sales21).
#    shift_col works on row positions, so "rows earlier" only means "days earlier"
#    if there is one row per store per day in date order -- TODO confirm.
for lag in range(1, 22):
    shift_col(all_data, 'Sales', lag)
print('Step 9 Complete.')

Step 9 Complete.


In [114]:
# 10: Add each store's Open flag from 1 to 21 rows earlier (columns Open1 ... Open21).
for lag in range(1, 22):
    shift_col(all_data, 'Open', lag)
print('Step 10 Complete.')

Step 10 Complete.


In [115]:
# 11: Add each store's Sales from 1 to 21 rows later (columns Sales-1 ... Sales-21).
# NOTE(review): the original comment here read "next open", but the code shifts
# 'Sales', not 'Open'. Future sales used as features would leak the target, so
# 'Open' may have been intended. Behaviour is left unchanged because later
# cells may rely on the Sales-<n> column names -- confirm before changing.
for lead in range(1, 22):
    shift_col(all_data, 'Sales', -lead)
print('Step 11 Complete.')

Step 11 Complete.


In [146]:
# 12: Load the per-state weather files into one DataFrame called `weather`,
#     tagging every row with the state label used in all_data['State'].
weather_frames = []
for state in all_data['State'].unique():
    # The combined 'HB,NI' label is read from NI.csv. Keeping the file stem in
    # its own variable leaves the state label untouched, so a plain 'NI' label
    # would not be renamed to 'HB,NI'.
    file_stem = 'NI' if state == 'HB,NI' else state
    state_weather = pd.read_csv("data_for_features/Data_Weather/" + str(file_stem) + ".csv", sep = ";")
    state_weather['State'] = state
    weather_frames.append(state_weather)

# Concatenate once after the loop; concatenating inside the loop re-copies the
# accumulated rows on every iteration.
weather = pd.concat(weather_frames) if weather_frames else pd.DataFrame()
print('Step 12 Complete.')

In [149]:
# 12.1: Sanity check for step 12: show the weather rows labelled 'HB,NI',
#       the one state whose label differs from its file name.
weather.loc[weather['State'] == 'HB,NI']

Unnamed: 0,Date,Max_TemperatureC,Mean_TemperatureC,Min_TemperatureC,Dew_PointC,MeanDew_PointC,Min_DewpointC,Max_Humidity,Mean_Humidity,Min_Humidity,...,Mean_VisibilityKm,Min_VisibilitykM,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,Max_Gust_SpeedKm_h,Precipitationmm,CloudCover,Events,WindDirDegrees,State
0,2013-01-01,7,6,4,6,4,2,93,85,63,...,13,8,34,24,48,4.06,7,Rain,232,"HB,NI"
1,2013-01-02,7,5,3,5,3,2,93,83,71,...,11,8,32,23,40,1.02,6,Rain,266,"HB,NI"
2,2013-01-03,10,8,6,9,7,3,100,93,73,...,8,3,34,24,50,0.76,6,Rain,268,"HB,NI"
3,2013-01-04,9,8,7,9,8,6,100,93,87,...,7,3,35,24,52,0.25,7,Rain,279,"HB,NI"
4,2013-01-05,8,7,7,8,7,6,100,96,87,...,5,1,35,26,42,2.03,8,Rain,291,"HB,NI"
5,2013-01-06,8,7,6,8,7,5,100,95,87,...,7,3,26,16,37,1.02,7,Rain,286,"HB,NI"
6,2013-01-07,8,7,5,7,6,4,100,94,87,...,10,6,26,13,,0.00,7,Rain,273,"HB,NI"
7,2013-01-08,8,7,6,7,6,5,100,94,80,...,5,3,23,14,,2.03,7,Rain,271,"HB,NI"
8,2013-01-09,6,6,5,7,6,4,100,96,90,...,7,2,26,16,,4.06,7,Rain,268,"HB,NI"
9,2013-01-10,6,3,0,4,2,-4,100,89,75,...,11,6,26,16,,0.00,6,Rain-Snow,297,"HB,NI"
