In [1]:
# Run dependencies
from datetime import datetime, timedelta
import os
import pandas as pd
import requests
%run config.ipynb

In [2]:
## *******************************************EXTRACTION*******************************************
## (1) WALMART 2010-2012 SALES DATASET

# Read all walmart csv
features = pd.read_csv('Dataset/Walmart/features.csv')
stores = pd.read_csv('Dataset/Walmart/stores.csv')
train = pd.read_csv('Dataset/Walmart/train.csv')

In [3]:
## (2) API HOLIDAY

# Set holidays, country and years we want dates
holiday = ['Christmas Day','Christmas Eve','Independence Day',"New Year's Day", "Thanksgiving Day"]
country = 'US'
year = [2010,2011, 2012]

# Loops through API and get dates
holiday_name = []
holiday_date =[]
category = []
holiday_year = []

for number in year:
    # Set url base
    url = f'https://calendarific.com/api/v2/holidays?&api_key={api_key}&country={country}&year={number}'
    response = requests.get(url).json()
    
    #Loop through holiday
    for name in holiday:
        # Loop through API
        for i in range(len(response['response']['holidays'])):
            if name == response['response']['holidays'][i]['name']:
                holiday_date.append(response['response']['holidays'][i]['date']['iso'])
                category.append(name)
                holiday_year.append(number)
                break
                
# Create dataframes of lists created
holiday_df = pd.DataFrame({'Holiday': category,
                        'Year': holiday_year,
                        'Date': holiday_date})

# Display preview
holiday_df.head()

Unnamed: 0,Holiday,Year,Date
0,Christmas Day,2010,2010-12-25
1,Christmas Eve,2010,2010-12-24
2,Independence Day,2010,2010-07-04
3,New Year's Day,2010,2010-01-01
4,Thanksgiving Day,2010,2010-11-25


In [4]:
## (3) STOCK 1972-2020 DATASET

# Run stock csv
stock = pd.read_csv("Dataset/stock.csv")

# Display preview
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1972-08-25,0.063477,0.064697,0.063477,0.064453,0.023123,2508800
1,1972-08-28,0.064453,0.064941,0.064209,0.064209,0.023035,972800
2,1972-08-29,0.063965,0.063965,0.063477,0.063477,0.022772,1945600
3,1972-08-30,0.063477,0.063477,0.062988,0.063477,0.022772,409600
4,1972-08-31,0.062988,0.062988,0.0625,0.0625,0.022422,870400


In [5]:
## *******************************************TRANSFORMATION*******************************************
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY

# Merge stores and feature on Store
features_stores = features.merge(stores, how='inner', on='Store')

# Convert date column from string to date type
features_stores.Date = pd.to_datetime(features_stores.Date)
train.Date = pd.to_datetime(train.Date)

# Add column week and year
features_stores['Week'] = features_stores.Date.dt.week 
features_stores['Year'] = features_stores.Date.dt.year

# Merge datasets
walmart_data = train.merge(features_stores, 
                           how='inner',
                           on=['Store','Date','IsHoliday']).sort_values(by=['Store',
                                                                            'Dept',
                                                                            'Date']).reset_index(drop=True)

# Drop MarkDown columns and IsHoliday
walmart_data = walmart_data.drop(["IsHoliday", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], axis=1)

# Drop NaN values in CPI and Unemployment
walmart_data = walmart_data.dropna()

# Change Temprature column name to Temperature F
walmart_data=walmart_data.rename(columns={"Temperature": "Temperature F"})

# Convert temprature from Fahrenheit to Celsius
temp_celcius=[]
for temperature in walmart_data['Temperature F']:
    celcius = (temperature - 32) * (5.0/9.0)
    temp_celcius.append(celcius)
walmart_data["Temperature C"] = temp_celcius

# Rounding Column values to required decimal points
walmart_data = walmart_data.round({'Fuel_Price': 2, 'Temperature C': 0, 'CPI': 4 })

# Convert Date to datetime and subtract 6 days to get Start of Week
walmart_data['Start of Week'] = pd.to_datetime(walmart_data.Date) - timedelta(days=6)

# Sort dataframe and reset index for walmart
walmart_data = walmart_data.sort_values(by =['Date']).reset_index(drop = True)

In [6]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY

# Sort dataframe and reset index for walmart
holiday_df = holiday_df.sort_values(by =['Date']).reset_index(drop = True)

# Convert holiday date
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])

holiday_name = []
k = 0
# Label data with Holiday Name or No Holiday
for i in range(len(walmart_data)):
    
    if (holiday_df['Date'][k] <= walmart_data['Date'][i]) & (holiday_df['Date'][k] >= walmart_data['Start of Week'][i]):
        holiday_name.append(holiday_df['Holiday'][k])
    
    elif (holiday_df['Date'][k] > walmart_data['Date'][i]):
        holiday_name.append("No Holiday")
    
    elif (walmart_data['Date'][i] > holiday_df['Date'][k]):
        holiday_name.append("No Holiday")
        k +=1

# Set new column as list created:
walmart_data['Holiday Name'] = holiday_name

# Convert Christmas Day and Eve to be Christmas
walmart_data['Holiday Name'] = walmart_data['Holiday Name'].replace({'Christmas Day': 'Christmas',
                                                                    'Christmas Eve': 'Christmas'})

# Display preview of dataframe:
walmart_data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature F,Fuel_Price,CPI,Unemployment,Type,Size,Week,Year,Temperature C,Start of Week,Holiday Name
0,1,1,2010-02-05,24924.5,42.31,2.57,211.0964,8.106,A,151315,5,2010,6.0,2010-01-30,No Holiday
1,29,5,2010-02-05,15552.08,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
2,29,6,2010-02-05,3200.22,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
3,29,7,2010-02-05,10820.05,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
4,29,8,2010-02-05,20055.64,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday


In [7]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 1 SUMMARY WALMART STORES SALES OVER 2010-2012

# Aggregate over Store/Start of Week, Include Other Columns
header = ['Store','Start of Week', 'Date','Fuel_Price','CPI','Unemployment','Type', 'Size','Week','Year','Temperature C','Holiday Name']
walmart = pd.DataFrame(walmart_data.groupby(header)['Weekly_Sales'].sum())

# Reset Index and add ID Column
walmart = walmart.reset_index()
walmart = walmart.reset_index(drop = False)

# Rename Columns
walmart.columns = ['ID','Store','Start_of_Week','Week_Date','Fuel_Price','CPI','Unemployment','Type','Size','Week','Year',
                  'Temperature_C','Holiday_Name','Weekly_Sales']

# Display preview
walmart.head()

Unnamed: 0,ID,Store,Start_of_Week,Week_Date,Fuel_Price,CPI,Unemployment,Type,Size,Week,Year,Temperature_C,Holiday_Name,Weekly_Sales
0,0,1,2010-01-30,2010-02-05,2.57,211.0964,8.106,A,151315,5,2010,6.0,No Holiday,1643690.9
1,1,1,2010-02-06,2010-02-12,2.55,211.2422,8.106,A,151315,6,2010,4.0,No Holiday,1641957.44
2,2,1,2010-02-13,2010-02-19,2.51,211.2891,8.106,A,151315,7,2010,4.0,No Holiday,1611968.17
3,3,1,2010-02-20,2010-02-26,2.56,211.3196,8.106,A,151315,8,2010,8.0,No Holiday,1409727.59
4,4,1,2010-02-27,2010-03-05,2.62,211.3501,8.106,A,151315,9,2010,8.0,No Holiday,1554806.68


In [8]:
## (2) API HOLIDAY
## TABLE 2 HOLIDAY DATE LIST 2010-2012

# Display preview
holiday_df.head()

Unnamed: 0,Holiday,Year,Date
0,New Year's Day,2010,2010-01-01
1,Independence Day,2010,2010-07-04
2,Thanksgiving Day,2010,2010-11-25
3,Christmas Eve,2010,2010-12-24
4,Christmas Day,2010,2010-12-25


In [9]:
## (3) STOCK 1972-2020 DATASET
## TABLE 3 ALL 2010-2012 STOCK DAILY DATASET

# Convert Date Columns to datetime
stock['Date'] = pd.to_datetime(stock['Date'])
stock['Year'] = stock['Date'].dt.year

# Set first date and last date on sales dateframe
first_date = walmart['Start_of_Week'][0]
last_date = walmart['Week_Date'].iloc[-1]

# Filter stock date with sales data range
stock = stock.loc[(stock['Date'] >= first_date) & (stock['Date'] <=last_date)]
stock = stock.reset_index(drop = True)

# Sort by Date
stock = stock.sort_values(by = 'Date')
walmart = walmart.sort_values(by = 'Week_Date')

# Get list of unique walmart weekly dates
walmart_date = walmart.Week_Date.unique()

# Classify Date under weekly date
wk_date = []
w = 0

for s in range(len(stock)):
    if stock['Date'][s] <= walmart_date[w]:
        wk_date.append(walmart_date[w])
    else:
        wk_date.append(walmart_date[w+1])
        w = w + 1

# Add Week Date and rename columns
stock['Week_Date'] = wk_date
stock.columns = ['Date','Open','High','Low','Close','Adj_Close','Volume','Year','Week_Date']

# Display preview
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj_Close,Volume,Year,Week_Date
0,2010-02-01,53.619999,53.779999,53.310001,53.48,40.977615,11019500,2010,2010-02-05
1,2010-02-02,53.59,53.720001,53.330002,53.490002,40.985275,11387900,2010,2010-02-05
2,2010-02-03,53.73,54.5,53.639999,54.27,41.582928,17988900,2010,2010-02-05
3,2010-02-04,53.880001,54.299999,52.959999,52.970001,40.58683,21029900,2010,2010-02-05
4,2010-02-05,52.77,53.529999,52.759998,53.450001,40.95462,15545800,2010,2010-02-05


In [10]:
## (3) STOCK 1972-2020 DATASET
## TABLE 4 WEEKLY 2010-2012 STOCK DATASET

# Average weekly stock price and sum of stock sold per week
stock_wk = pd.DataFrame(stock.groupby('Week_Date').agg({'Open': 'mean',
                                'High': 'mean',
                                'Low': 'mean',
                                'Close': 'mean',
                                'Adj_Close': 'mean',
                                'Volume': 'sum'
                               }))

# Reset index and rename columns
stock_wk = stock_wk.reset_index()
stock_wk.columns = ['Week_Date','Avg_Open','Avg_High','Avg_Low','Avg_Close','Avg_Adj_Close','Total_Volume']

# Display preview
stock_wk.head()

Unnamed: 0,Week_Date,Avg_Open,Avg_High,Avg_Low,Avg_Close,Avg_Adj_Close,Total_Volume
0,2010-02-05,53.518,53.966,53.2,53.532001,41.017454,76972000
1,2010-02-12,53.22,53.444,52.853999,53.080001,40.671121,59357300
2,2010-02-19,53.33,53.762501,53.132501,53.645001,41.104042,75853300
3,2010-02-26,53.738,54.094,53.404,53.918,41.313212,66322400
4,2010-03-05,53.842,54.006,53.482001,53.85,41.26111,60428500


In [11]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 5 SUMMARY WEEKLY SALES OF WALMART HOLIDAY 2010-2012

# Group by Week
walmart_wk = pd.DataFrame(walmart.groupby(['Start_of_Week','Week_Date']).agg({'Fuel_Price': 'mean',
                                        'CPI': 'mean',
                                        'Unemployment': 'mean',
                                        'Week': 'min',
                                        'Year': 'min',
                                        'Temperature_C': 'mean',
                                        'Holiday_Name': 'min',
                                        'Weekly_Sales': 'sum'
                                            }))

# Reset index and rename columns
walmart_wk = walmart_wk.reset_index()
walmart_wk.columns = ['Start_of_Week','Week_Date','Avg_Fuel_Price','Avg_CPI','Avg_Unemployment','Week','Year','Avg_Temperature_C',
                      'Holiday_Name','Total_Weekly_Sales']

# Merge

# Display preview
walmart_wk.head()

Unnamed: 0,Start_of_Week,Week_Date,Avg_Fuel_Price,Avg_CPI,Avg_Unemployment,Week,Year,Avg_Temperature_C,Holiday_Name,Total_Weekly_Sales
0,2010-01-30,2010-02-05,2.716667,167.730904,8.619311,5,2010,1.2,No Holiday,49750740.5
1,2010-02-06,2010-02-12,2.694222,167.825616,8.619311,6,2010,1.288889,No Holiday,48336677.63
2,2010-02-13,2010-02-19,2.671111,167.871696,8.619311,7,2010,3.177778,No Holiday,48276993.78
3,2010-02-20,2010-02-26,2.682667,167.909662,8.619311,8,2010,3.955556,No Holiday,43968571.13
4,2010-02-27,2010-03-05,2.730667,167.947642,8.619311,9,2010,6.111111,No Holiday,46871470.3


In [12]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 6 WALMART AGGREGATED HOLIDAY SALES DATASET 2010-2012

# Group by Holiday Name and Year
walmart_holiday = pd.DataFrame(walmart.groupby(['Holiday_Name','Year']).agg({'Fuel_Price': 'mean',
                                        'CPI': 'mean',
                                        'Unemployment': 'mean',
                                        'Temperature_C': 'mean',
                                        'Weekly_Sales': 'mean'
                                            }))

# Reset index and rename column
walmart_holiday = walmart_holiday.reset_index()
walmart_holiday.columns = ['Holiday_Name','Year','Avg_Fuel_Price','Avg_CPI','Avg_Unemployment','Avg_Temperature_C',
                           'Avg_Weekly_Sales']

# Change format of average weekly sales
walmart_holiday['Avg_Weekly_Sales'] = [int(sales) for sales in walmart_holiday['Avg_Weekly_Sales']]

# Sort and reset index
walmart_holiday = walmart_holiday.sort_values(by='Year')
walmart_holiday = walmart_holiday.reset_index(drop = True)

# Display preview
walmart_holiday.head()

Unnamed: 0,Holiday_Name,Year,Avg_Fuel_Price,Avg_CPI,Avg_Unemployment,Avg_Temperature_C,Avg_Weekly_Sales
0,Christmas,2010,3.036111,168.495574,8.475289,3.955556,1348486
1,Independence Day,2010,2.783333,167.948456,8.428578,26.088889,1064433
2,No Holiday,2010,2.812862,168.05668,8.494913,16.162039,1036750
3,Thanksgiving Day,2010,2.919333,168.54162,8.475289,8.888889,1462688
4,Christmas,2011,3.272,173.991936,7.804222,3.244444,1023165


In [13]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY/STOCK
## TABLE 7 MERGE WALMART/HOLIDAY/STOCK ON WEEK

# Merge walmart and stock weekly date and sort
walmart_stk = walmart.merge(stock_wk, how = "inner", on = "Week_Date")

# Sort by stores and reset
walmart_stk = walmart_stk.sort_values(by=['Store', 'Week_Date'])
walmart_stk = walmart_stk.reset_index(drop=True)


# Display preview
walmart_stk.head()

Unnamed: 0,ID,Store,Start_of_Week,Week_Date,Fuel_Price,CPI,Unemployment,Type,Size,Week,Year,Temperature_C,Holiday_Name,Weekly_Sales,Avg_Open,Avg_High,Avg_Low,Avg_Close,Avg_Adj_Close,Total_Volume
0,0,1,2010-01-30,2010-02-05,2.57,211.0964,8.106,A,151315,5,2010,6.0,No Holiday,1643690.9,53.518,53.966,53.2,53.532001,41.017454,76972000
1,1,1,2010-02-06,2010-02-12,2.55,211.2422,8.106,A,151315,6,2010,4.0,No Holiday,1641957.44,53.22,53.444,52.853999,53.080001,40.671121,59357300
2,2,1,2010-02-13,2010-02-19,2.51,211.2891,8.106,A,151315,7,2010,4.0,No Holiday,1611968.17,53.33,53.762501,53.132501,53.645001,41.104042,75853300
3,3,1,2010-02-20,2010-02-26,2.56,211.3196,8.106,A,151315,8,2010,8.0,No Holiday,1409727.59,53.738,54.094,53.404,53.918,41.313212,66322400
4,4,1,2010-02-27,2010-03-05,2.62,211.3501,8.106,A,151315,9,2010,8.0,No Holiday,1554806.68,53.842,54.006,53.482001,53.85,41.26111,60428500


In [14]:
# *******************************************LOADING*******************************************
