In [18]:
# Run dependencies
from datetime import datetime, timedelta
import os
import pandas as pd
import requests
from sqlalchemy import create_engine
import us
%run config.ipynb

# Extraction

### (1) Walmart 2010-2012 Sales Dataset

In [2]:
## *******************************************EXTRACTION*******************************************
## (1) WALMART 2010-2012 SALES DATASET

# Load the three Walmart sales CSVs in order (features, stores, train),
# one DataFrame per file.
features, stores, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Resources/Walmart/features.csv',
        'Resources/Walmart/stores.csv',
        'Resources/Walmart/train.csv',
    )
)

### (2) API Holiday

In [3]:
## (2) API HOLIDAY
#
# For every requested year, query the Calendarific holidays endpoint and record
# the ISO date of each holiday of interest. Results are collected in parallel
# lists and assembled into `holiday_df` (columns: Holiday, Year, Date).
# `api_key` is expected to be defined by the config notebook run in the imports cell.

# Set holidays, country and years we want dates
holiday = ['Christmas Day','Christmas Eve','Independence Day',"New Year's Day", "Thanksgiving Day"]
country = 'US'
year = [2010,2011, 2012]

# Parallel result lists: one entry per (year, holiday) match
holiday_name = []  # not populated in this cell; kept so the name stays defined
holiday_date =[]
category = []
holiday_year = []

for number in year:
    # Let requests build and encode the query string instead of hand-assembling it
    url = 'https://calendarific.com/api/v2/holidays'
    api_response = requests.get(
        url,
        params={'api_key': api_key, 'country': country, 'year': number},
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors (bad key, rate limit, ...) rather than with an
    # unrelated KeyError when the payload is subscripted below
    api_response.raise_for_status()
    response = api_response.json()

    holidays_in_year = response['response']['holidays']

    # Loop through holiday
    for name in holiday:
        # First API entry whose name matches exactly; None when the API has no such holiday
        match = next((entry for entry in holidays_in_year if entry['name'] == name), None)
        if match is None:
            # Holiday not returned for this year: skip it, as before
            continue
        holiday_date.append(match['date']['iso'])
        category.append(name)
        holiday_year.append(number)

# Create dataframes of lists created
holiday_df = pd.DataFrame({'Holiday': category,
                        'Year': holiday_year,
                        'Date': holiday_date})

# Display preview
holiday_df.head()

Unnamed: 0,Holiday,Year,Date
0,Christmas Day,2010,2010-12-25
1,Christmas Eve,2010,2010-12-24
2,Independence Day,2010,2010-07-04
3,New Year's Day,2010,2010-01-01
4,Thanksgiving Day,2010,2010-11-25


### (3) Walmart Stock 1972-2020 Dataset

In [4]:
## (3) STOCK 1972-2020 DATASET

# Load the stock price CSV into a DataFrame
stock = pd.read_csv("Resources/stock.csv")

# Display preview (head() shows the first five rows by default)
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1972-08-25,0.063477,0.064697,0.063477,0.064453,0.023123,2508800
1,1972-08-28,0.064453,0.064941,0.064209,0.064209,0.023035,972800
2,1972-08-29,0.063965,0.063965,0.063477,0.063477,0.022772,1945600
3,1972-08-30,0.063477,0.063477,0.062988,0.063477,0.022772,409600
4,1972-08-31,0.062988,0.062988,0.0625,0.0625,0.022422,870400


### (4) U.S. State Dataset

In [17]:
## (4) UNITED STATES STATE DATASET

# Lookup from the `us` package: 'fips' value -> 'abbr' value for each state
us_state = us.states.mapping('fips','abbr')

# Second lookup, materialised as a plain dict: 'ap_abbr' value -> 'fips' value
abbr_id = dict(us.states.mapping('ap_abbr', 'fips'))

# Build a one-column frame (STATE) keyed by the mapping's keys, then move
# those keys out of the index into a regular column named 'index'
state_id = (
    pd.DataFrame.from_dict(us_state, orient='index', columns=['STATE'])
    .reset_index()
)

# Display preview
state_id.head()

Unnamed: 0,index,STATE
0,1,AL
1,2,AK
2,4,AZ
3,5,AR
4,6,CA


### Walmart Ecommerce Product Dataset

In [20]:
## WALMART PRODUCT ECOMMERCE DATASET

# Load the walmart.com e-commerce product details CSV into a DataFrame
df_ecomm = pd.read_csv('Resources/walmart_com-ecommerce_product_details.csv')

# Display preview (head() shows the first five rows by default)
df_ecomm.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Product Url,Product Name,Description,List Price,Sale Price,Brand,Item Number,Gtin,Package Size,Category,Postal Code,Available
0,459b05f3cb7f1cba0a36fdc042ff0056,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,96647820000.0,,Health|Home Health Care|Daily Living Aids,,True
1,6a1bddc2801cbba539be0c182498d4dd,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,96647820000.0,,Health|Home Health Care|Daily Living Aids,,True
2,4d237340ae8361b4bb4f51e8a6128c8b,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,96647820000.0,,Health|Home Health Care|Daily Living Aids,,True
3,d6dbc29d5782a88db9082d81ad04089c,2019-03-22 17:10:04 +0000,https://www.walmart.com/ip/In-Style-Eyes-Catey...,In Style Eyes Cateye Two Tone Reading Glasses,Stunning Looking Cat Eye Two Tone Reading Glas...,19.99,19.99,In Style Eyes,,96647820000.0,,Health|Home Health Care|Daily Living Aids,,True
4,9e51356d763e53dc622c92e3a86f9ef8,2019-03-22 17:46:46 +0000,https://www.walmart.com/ip/Upper-Crust-Caribbe...,Upper Crust Caribbean Coconut Panko Bread Crum...,| Size information : 1-10 POUND Upper Crust Ca...,39.96,39.96,Upper Crust,,890749000000.0,,Food|Baking|Baking Mixes,,False


### (5) Walmart Sales Dataset (5 Years) ZIP File

In [5]:
## (5) WALMART SALES DATA (5 YEARS) ZIP FILE SALES_AUG & PRICES CSV

# Load the sales_aug CSV into a DataFrame
sales_aug = pd.read_csv('Resources/sales_aug.csv')

# Display preview (head() shows the first five rows by default)
sales_aug.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1937,d_1938,d_1939,d_1940,d_1941,sales1,sales2,start,scale1,scale2
0,HOBBIES_1_001_CA_1,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,3,3,0,1,1.097306e-05,1.3e-05,902,0.757426,0.773603
1,HOBBIES_1_002_CA_1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,8.128193e-07,3e-06,144,0.424208,0.422049
2,HOBBIES_1_003_CA_1,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,2,3,0,1,6.502554e-06,8e-06,1106,0.492556,0.51199
3,HOBBIES_1_004_CA_1,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,2,6,2.072689e-05,1.9e-05,37,1.7792,1.778245
4,HOBBIES_1_005_CA_1,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,2,1,0,1.544357e-05,1.5e-05,113,1.101723,1.102354


In [6]:
## (5) WALMART SALES DATA (5 YEARS) ZIP FILE SALES_AUG & PRICES CSV CONT.

# Load the prices CSV (the second file of this dataset) into a DataFrame
prices = pd.read_csv('Resources/prices.csv')

# Display preview (head() shows the first five rows by default)
prices.head()

Unnamed: 0,id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,FOODS_1_001_CA_1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
1,FOODS_1_001_CA_2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
2,FOODS_1_001_CA_3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
3,FOODS_1_001_CA_4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24
4,FOODS_1_001_TX_1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24,2.24


### (6) Walmart Store Listing Json

In [10]:
## (6) WALMART STORE JSON

# Load the Walmart store listing JSON into a DataFrame
store = pd.read_json('Resources/walmart.json')

# Display preview (head() shows the first five rows by default)
store.head()

Unnamed: 0,id,storeType,timeZone,openDate,name,postalCode,address1,city,state,country,latitude,longitude,phone_number
0,6601,3,K,01/04/1994 12:00,Sam's Club,99515,8801 Old Seward Hwy,Anchorage,AK,US,61.14077,-149.860016,(907) 522-2333
1,6602,3,K,01/06/1994 12:00,Sam's Club,99504,1074 N. Muldoon Road,Anchorage,AK,US,61.231079,-149.740844,(907) 276-2996
2,2074,1,K,03/29/1994 12:00,Walmart Supercenter,99654,1350 S Seward Meridian Pkwy,Wasilla,AK,US,61.568752,-149.363647,(907) 376-9780
3,2070,1,K,03/29/1994 12:00,Walmart Supercenter,99503,3101 A St,Anchorage,AK,US,61.192337,-149.880356,(907) 563-5900
4,6603,3,K,01/08/1994 12:00,Sam's Club,99701,48 College Rd,Fairbanks,AK,US,64.852867,-147.707336,(907) 451-4800


### (7) Walmart Marketshare

In [3]:
## (7) WALMART MARKETSHARE DATASET

# Load the Walmart market share CSV into a DataFrame
marketShare_df = pd.read_csv('Resources/Walmart_MarketShare_data.csv')

# Display preview (head() shows the first five rows by default)
marketShare_df.head()

Unnamed: 0,CITY,STATE,POPULATION,MARKET_SHARE
0,Atchison,Kan.,16580,95%
1,Portales,N.M.,19730,95%
2,Sterling,Colo.,22068,91%
3,Deming,N.M.,24699,90%
4,Guymon,Ohio,21385,90%


### (8) Competition Financial Dataset (Walmart, Amazon, Target, Costco)

In [8]:
## (8) COMPETITION FINANCIAL DATASET

# Load the Walmart, Amazon, Target and Costco financial CSVs with identical
# parsing options: skip the first two lines of each file and use the first
# column as the index.
wmt_df, amz_df, tgt_df, cos_df = (
    pd.read_csv(csv_path, skiprows=2, index_col=0)
    for csv_path in (
        "Resources/Competition/WMT.csv",
        "Resources/Competition/AMZN.csv",
        "Resources/Competition/TGT.csv",
        "Resources/Competition/COST.csv",
    )
)