In [None]:
# Run dependencies
from datetime import datetime, timedelta
import os
import pandas as pd
import requests
%run config.ipynb

# Extraction

In [None]:
## *******************************************EXTRACTION*******************************************
## (1) WALMART 2010-2012 SALES DATASET

# Load the three Walmart CSV files (features, stores, train) from Dataset/Walmart
# into separate DataFrames; they are merged later in the transformation step.
features = pd.read_csv('Dataset/Walmart/features.csv')
stores = pd.read_csv('Dataset/Walmart/stores.csv')
train = pd.read_csv('Dataset/Walmart/train.csv')

In [None]:
## (2) API HOLIDAY

# Holidays, country and years we want dates for
holiday = ['Christmas Day','Christmas Eve','Independence Day',"New Year's Day", "Thanksgiving Day"]
country = 'US'
year = [2010,2011, 2012]

# Parallel lists that become the columns of holiday_df
holiday_date = []
category = []
holiday_year = []

for number in year:
    # One request per year; api_key is expected to come from config.ipynb
    url = f'https://calendarific.com/api/v2/holidays?&api_key={api_key}&country={country}&year={number}'

    # A timeout keeps a stalled connection from hanging the notebook forever, and
    # raise_for_status() turns an HTTP error into a clear exception instead of a
    # confusing KeyError when the payload is indexed below.
    http_response = requests.get(url, timeout=30)
    http_response.raise_for_status()
    response = http_response.json()

    # Index the payload by holiday name, keeping only the FIRST entry per name
    # (the API may list the same holiday more than once).
    first_entry_by_name = {}
    for entry in response['response']['holidays']:
        first_entry_by_name.setdefault(entry['name'], entry)

    # Record the ISO date of each requested holiday that the API returned
    for name in holiday:
        entry = first_entry_by_name.get(name)
        if entry is None:
            continue  # holiday not listed for this year; skip it
        holiday_date.append(entry['date']['iso'])
        category.append(name)
        holiday_year.append(number)

# Create dataframe from the collected lists
holiday_df = pd.DataFrame({'Holiday': category,
                        'Year': holiday_year,
                        'Date': holiday_date})

# Display preview
holiday_df.head()

In [None]:
## (3) STOCK 1972-2020 DATASET

# Load the stock CSV into a DataFrame (filtered to the sales date range later on)
stock = pd.read_csv("Dataset/stock.csv")

# Display preview
stock.head()

In [None]:
## (4) WALMART SALES DATA (5 YEARS) ZIP FILE SALES_AUG & PRICES CSV

# Load the sales_aug CSV from the Resources folder into a DataFrame
sales_aug= pd.read_csv('Resources/sales_aug.csv')

# Display preview
sales_aug.head()

In [None]:
## (4) WALMART SALES DATA (5 YEARS) ZIP FILE SALES_AUG & PRICES CSV CONT.

# Load the prices CSV from the Resources folder into a DataFrame
prices= pd.read_csv('Resources/prices.csv')

# Display preview
prices.head()

# Transformation

In [None]:
## *******************************************TRANSFORMATION*******************************************
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY

# Merge stores and feature on Store
features_stores = features.merge(stores, how='inner', on='Store')

# Convert date columns from string to datetime so both frames share a typed merge key
features_stores['Date'] = pd.to_datetime(features_stores['Date'])
train['Date'] = pd.to_datetime(train['Date'])

# Add week-of-year and year columns.
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO week numbers. It comes back as a
# nullable UInt32, so cast to int64 to keep the dtype the old accessor produced.
# NOTE(review): the cast assumes features.csv has no missing/unparseable dates.
features_stores['Week'] = features_stores['Date'].dt.isocalendar().week.astype('int64')
features_stores['Year'] = features_stores['Date'].dt.year

# Merge sales with store features, ordered by store, department and date
walmart_data = (
    train.merge(features_stores, how='inner', on=['Store', 'Date', 'IsHoliday'])
    .sort_values(by=['Store', 'Dept', 'Date'])
    .reset_index(drop=True)
)

# Drop MarkDown columns and IsHoliday
walmart_data = walmart_data.drop(["IsHoliday", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], axis=1)

# Drop every row that still contains a NaN in any remaining column
# (intended for the CPI and Unemployment gaps)
walmart_data = walmart_data.dropna()

# Change Temperature column name to Temperature F
walmart_data = walmart_data.rename(columns={"Temperature": "Temperature F"})

# Convert temperature from Fahrenheit to Celsius (vectorised over the whole column)
walmart_data["Temperature C"] = (walmart_data["Temperature F"] - 32) * (5.0 / 9.0)

# Round column values to the required decimal points
walmart_data = walmart_data.round({'Fuel_Price': 2, 'Temperature C': 0, 'CPI': 4 })

# Date is the last day of the sales week; subtract 6 days to get Start of Week
walmart_data['Start of Week'] = walmart_data['Date'] - timedelta(days=6)

# Sort dataframe by date and reset index for walmart
walmart_data = walmart_data.sort_values(by=['Date']).reset_index(drop=True)

In [None]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY

# Convert holiday dates, then order them chronologically with a clean 0..n-1 index
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])
holiday_df = holiday_df.sort_values(by=['Date']).reset_index(drop=True)

# Label every sales row with the holiday that falls inside its week
# [Start of Week, Date] (both ends inclusive), or "No Holiday".
#
# The previous single-cursor loop moved to the next holiday while processing a
# data row and labelled that row "No Holiday" unconditionally, so the first
# row(s) of a week lost their label whenever the next holiday fell in that same
# week (e.g. the Christmas Day and New Year's Day weeks). Each row is now
# resolved independently: find the earliest holiday on/after the week start and
# accept it only if it is not later than the week end.
holiday_dates = holiday_df['Date'].to_numpy()
holiday_labels = holiday_df['Holiday'].to_numpy()
week_start = walmart_data['Start of Week'].to_numpy()
week_end = walmart_data['Date'].to_numpy()

if len(holiday_dates) == 0:
    # No holidays available at all: nothing can match
    holiday_name = pd.Series('No Holiday', index=walmart_data.index)
else:
    # Position of the first holiday >= week start. It equals len(holiday_dates)
    # when every holiday is earlier, so clip to the last valid position; that
    # candidate then fails the range check below.
    candidate = holiday_dates.searchsorted(week_start, side='left').clip(max=len(holiday_dates) - 1)
    in_week = (holiday_dates[candidate] >= week_start) & (holiday_dates[candidate] <= week_end)
    holiday_name = pd.Series(holiday_labels[candidate], index=walmart_data.index).where(in_week, 'No Holiday')

# Set new column from the computed labels
walmart_data['Holiday Name'] = holiday_name

# Convert Christmas Day and Eve to be Christmas
walmart_data['Holiday Name'] = walmart_data['Holiday Name'].replace({'Christmas Day': 'Christmas',
                                                                    'Christmas Eve': 'Christmas'})

# Display preview of dataframe:
walmart_data.head()

In [None]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 1 SUMMARY WALMART STORES SALES OVER 2010-2012

# Grouping keys: one output row per distinct combination of these columns
header = ['Store','Start of Week', 'Date','Fuel_Price','CPI','Unemployment','Type', 'Size','Week','Year','Temperature C','Holiday Name']

# Total the department-level Weekly_Sales within each group
store_week_sales = walmart_data.groupby(header)['Weekly_Sales'].sum()

# First reset turns the group keys back into columns; the second one
# materialises the fresh 0..n-1 index as a leading column that serves as the ID
walmart = store_week_sales.reset_index().reset_index(drop=False)

# Rename columns positionally (ID, the twelve keys, then the summed sales)
walmart.columns = ['ID','Store','Start_of_Week','Week_Date','Fuel_Price','CPI','Unemployment','Type','Size','Week','Year',
                  'Temperature_C','Holiday_Name','Weekly_Sales']

# Display preview
walmart.head()

In [None]:
## (2) API HOLIDAY
## TABLE 2 HOLIDAY DATE LIST 2010-2012

# Display preview of the holiday table (columns: Holiday, Year, Date)
holiday_df.head()

In [None]:
## (3) STOCK 1972-2020 DATASET
## TABLE 3 ALL 2010-2012 STOCK DAILY DATASET

# Convert Date column to datetime and derive the year
stock['Date'] = pd.to_datetime(stock['Date'])
stock['Year'] = stock['Date'].dt.year

# Date range covered by the sales data. min/max is used instead of picking the
# first/last row so the bounds do not depend on the current row order of walmart.
first_date = walmart['Start_of_Week'].min()
last_date = walmart['Week_Date'].max()

# Keep only stock rows inside the sales date range, then sort by date and
# rebuild the index AFTER sorting so row labels follow chronological order
stock = stock.loc[(stock['Date'] >= first_date) & (stock['Date'] <= last_date)]
stock = stock.sort_values(by = 'Date').reset_index(drop = True)
walmart = walmart.sort_values(by = 'Week_Date')

# Unique walmart week-ending dates, ascending because walmart was just sorted
walmart_date = walmart.Week_Date.unique()

# Assign each trading day to the first week-ending date on or after it.
# searchsorted resolves every row independently, so the result stays correct
# even if a whole week has no stock rows (the old cursor loop could only
# advance one week per stock row). Every stock date is <= last_date, so the
# returned positions are always valid.
week_ends = pd.DatetimeIndex(walmart_date)
week_pos = week_ends.searchsorted(stock['Date'].to_numpy(), side='left')

# Add Week Date and rename columns
# NOTE(review): the positional rename assumes the CSV columns arrive as
# Date, Open, High, Low, Close, <adjusted close>, Volume - confirm against stock.csv
stock['Week_Date'] = week_ends[week_pos].to_numpy()
stock.columns = ['Date','Open','High','Low','Close','Adj_Close','Volume','Year','Week_Date']

# Display preview
stock.head()

In [None]:
## (3) STOCK 1972-2020 DATASET
## TABLE 4 WEEKLY 2010-2012 STOCK DATASET

# Reducer per column: price columns are averaged, traded volume is totalled
stock_reducers = {
    'Open': 'mean',
    'High': 'mean',
    'Low': 'mean',
    'Close': 'mean',
    'Adj_Close': 'mean',
    'Volume': 'sum',
}

# One row per Week_Date, with the week key restored as a regular column
stock_wk = stock.groupby('Week_Date').agg(stock_reducers).reset_index()

# Rename columns positionally to show which statistic each one holds
stock_wk.columns = ['Week_Date','Avg_Open','Avg_High','Avg_Low','Avg_Close','Avg_Adj_Close','Total_Volume']

# Display preview
stock_wk.head()

In [None]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 5 SUMMARY WEEKLY SALES OF WALMART HOLIDAY 2010-2012

# Reducer per column within each (Start_of_Week, Week_Date) group.
# 'min' on Holiday_Name keeps the alphabetically smallest label in the group.
weekly_reducers = {
    'Fuel_Price': 'mean',
    'CPI': 'mean',
    'Unemployment': 'mean',
    'Week': 'min',
    'Year': 'min',
    'Temperature_C': 'mean',
    'Holiday_Name': 'min',
    'Weekly_Sales': 'sum',
}

# Collapse all stores into one row per week and restore the keys as columns
walmart_wk = walmart.groupby(['Start_of_Week','Week_Date']).agg(weekly_reducers).reset_index()

# Rename columns positionally to show which statistic each one holds
walmart_wk.columns = ['Start_of_Week','Week_Date','Avg_Fuel_Price','Avg_CPI','Avg_Unemployment','Week','Year','Avg_Temperature_C',
                      'Holiday_Name','Total_Weekly_Sales']

# Display preview
walmart_wk.head()

In [None]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 6 WALMART AGGREGATED HOLIDAY SALES DATASET 2010-2012

# Every measure is averaged within each (Holiday_Name, Year) group
holiday_reducers = {measure: 'mean' for measure in
                    ['Fuel_Price', 'CPI', 'Unemployment', 'Temperature_C', 'Weekly_Sales']}
walmart_holiday = walmart.groupby(['Holiday_Name','Year']).agg(holiday_reducers).reset_index()

# Rename columns positionally
walmart_holiday.columns = ['Holiday_Name','Year','Avg_Fuel_Price','Avg_CPI','Avg_Unemployment','Avg_Temperature_C',
                           'Avg_Weekly_Sales']

# Store average weekly sales as whole numbers (fractional part is truncated)
walmart_holiday['Avg_Weekly_Sales'] = walmart_holiday['Avg_Weekly_Sales'].astype('int64')

# Order by year and rebuild a clean 0..n-1 index
walmart_holiday = walmart_holiday.sort_values(by='Year').reset_index(drop = True)

# Display preview
walmart_holiday.head()

In [None]:
## WALMART 2010-2012 SALES DATASET/API HOLIDAY/STOCK
## TABLE 7 MERGE WALMART/HOLIDAY/STOCK ON WEEK

# Attach the weekly stock figures to every store-week row (only weeks present
# in both tables are kept), then order by store and week with a fresh index
walmart_stk = (
    walmart.merge(stock_wk, how = "inner", on = "Week_Date")
    .sort_values(by=['Store', 'Week_Date'])
    .reset_index(drop=True)
)

# Display preview
walmart_stk.head()

# Loading

In [None]:
# *******************************************LOADING*******************************************
