In [2]:
# Run dependencies
%run extraction.ipynb

# Transformation

### (1) Walmart 2010-2012 Sales Dataset/API Holiday

In [3]:
## *******************************************TRANSFORMATION*******************************************
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY
#
# Builds `walmart_data`: one row per Store/Dept/week with the weekly features,
# store attributes, a Celsius temperature and the first day of the sales week.
# `features`, `stores` and `train` are not defined in this notebook; they are
# presumably created by extraction.ipynb (TODO confirm).

# Merge stores and features on Store (inner join: stores missing from either side are dropped)
features_stores = features.merge(stores, how='inner', on='Store')

# Convert date columns from string to datetime so the .dt accessor and date arithmetic work
features_stores['Date'] = pd.to_datetime(features_stores['Date'])
train['Date'] = pd.to_datetime(train['Date'])

# Add week and year columns.
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week
# yields the same ISO week number. It comes back as UInt32, so cast to int64 to keep
# the dtype the old accessor produced.
features_stores['Week'] = features_stores['Date'].dt.isocalendar().week.astype('int64')
features_stores['Year'] = features_stores['Date'].dt.year

# Merge sales with features; the intermediate sort is kept so the final row order is unchanged
walmart_data = (
    train.merge(features_stores, how='inner', on=['Store', 'Date', 'IsHoliday'])
    .sort_values(by=['Store', 'Dept', 'Date'])
    .reset_index(drop=True)
)

# Drop MarkDown columns and IsHoliday
walmart_data = walmart_data.drop(
    columns=["IsHoliday", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
)

# Drop rows with missing values. The original note names CPI and Unemployment, but
# dropna() without `subset` removes a row if ANY remaining column is NaN.
walmart_data = walmart_data.dropna()

# Rename Temperature to make the unit explicit
walmart_data = walmart_data.rename(columns={"Temperature": "Temperature F"})

# Convert temperature from Fahrenheit to Celsius (vectorised; same arithmetic as the per-row loop)
walmart_data["Temperature C"] = (walmart_data["Temperature F"] - 32) * (5.0 / 9.0)

# Round column values to the required decimal places
walmart_data = walmart_data.round({'Fuel_Price': 2, 'Temperature C': 0, 'CPI': 4})

# Date is the last day of the sales week; subtract 6 days to get the first day.
# pd.Timedelta is used so the cell does not rely on `timedelta` being imported elsewhere.
walmart_data['Start of Week'] = walmart_data['Date'] - pd.Timedelta(days=6)

# Sort dataframe by Date and reset index
walmart_data = walmart_data.sort_values(by=['Date']).reset_index(drop=True)

In [4]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY
#
# Adds a 'Holiday Name' column to walmart_data: the name of the first holiday whose
# date lies within [Start of Week, Date] (both inclusive), otherwise "No Holiday".

# Parse holiday dates first so the sort is chronological, then reset to a 0..n-1
# index so positions returned by searchsorted can be used as index labels.
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])
holiday_df = holiday_df.sort_values(by=['Date']).reset_index(drop=True)

# Label each distinct sales week once and map the result back to the rows.
# The previous row-by-row cursor advanced past an old holiday on the first row of a
# week without re-checking the next holiday, so that row was labelled "No Holiday"
# even when the week contained one; it also failed once the cursor ran past the
# last holiday in holiday_df.
weeks = (
    walmart_data[['Start of Week', 'Date']]
    .drop_duplicates(subset='Date')
    .reset_index(drop=True)
)

# Position of the earliest holiday on or after each week's first day
# (equals len(holiday_df) when no such holiday exists).
candidate_pos = holiday_df['Date'].searchsorted(weeks['Start of Week'].to_numpy(), side='left')

# reindex returns NaT/NaN for the out-of-range position, which then fails the
# "<= end of week" test below and falls through to "No Holiday".
candidate_dates = holiday_df['Date'].reindex(candidate_pos).to_numpy()
candidate_names = holiday_df['Holiday'].reindex(candidate_pos).to_numpy()
in_week = candidate_dates <= weeks['Date'].to_numpy()

weeks['Holiday Name'] = pd.Series(candidate_names, index=weeks.index).where(in_week, "No Holiday")

# Set new column by looking up each row's week-ending Date
walmart_data['Holiday Name'] = walmart_data['Date'].map(weeks.set_index('Date')['Holiday Name'])

# Convert Christmas Day and Eve to be Christmas
walmart_data['Holiday Name'] = walmart_data['Holiday Name'].replace({'Christmas Day': 'Christmas',
                                                                    'Christmas Eve': 'Christmas'})

# Display preview of dataframe:
walmart_data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature F,Fuel_Price,CPI,Unemployment,Type,Size,Week,Year,Temperature C,Start of Week,Holiday Name
0,1,1,2010-02-05,24924.5,42.31,2.57,211.0964,8.106,A,151315,5,2010,6.0,2010-01-30,No Holiday
1,29,5,2010-02-05,15552.08,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
2,29,6,2010-02-05,3200.22,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
3,29,7,2010-02-05,10820.05,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday
4,29,8,2010-02-05,20055.64,24.36,2.79,131.5279,10.064,B,93638,5,2010,-4.0,2010-01-30,No Holiday


In [5]:
## (1) WALMART 2010-2012 SALES DATASET/API HOLIDAY
## TABLE 1 SUMMARY WALMART STORES SALES OVER 2010-2012

# Grouping keys: store, week and the per-week attributes carried through unchanged
header = ['Store', 'Start of Week', 'Date', 'Fuel_Price', 'CPI', 'Unemployment',
          'Type', 'Size', 'Week', 'Year', 'Temperature C', 'Holiday Name']

# Total Weekly_Sales across departments for each key combination, keys kept as columns
weekly_totals = walmart_data.groupby(header, as_index=False)['Weekly_Sales'].sum()

# Promote the 0..n-1 row number to a leading column; it becomes the ID below
walmart = weekly_totals.reset_index()

# Final column names, in positional order: ID, the twelve keys, then the summed sales
walmart.columns = [
    'ID', 'Store', 'Start_of_Week', 'Week_Date', 'Fuel_Price', 'CPI', 'Unemployment',
    'Type', 'Size', 'Week', 'Year', 'Temperature_C', 'Holiday_Name', 'Weekly_Sales',
]

# Display preview
walmart.head()

Unnamed: 0,ID,Store,Start_of_Week,Week_Date,Fuel_Price,CPI,Unemployment,Type,Size,Week,Year,Temperature_C,Holiday_Name,Weekly_Sales
0,0,1,2010-01-30,2010-02-05,2.57,211.0964,8.106,A,151315,5,2010,6.0,No Holiday,1643690.9
1,1,1,2010-02-06,2010-02-12,2.55,211.2422,8.106,A,151315,6,2010,4.0,No Holiday,1641957.44
2,2,1,2010-02-13,2010-02-19,2.51,211.2891,8.106,A,151315,7,2010,4.0,No Holiday,1611968.17
3,3,1,2010-02-20,2010-02-26,2.56,211.3196,8.106,A,151315,8,2010,8.0,No Holiday,1409727.59
4,4,1,2010-02-27,2010-03-05,2.62,211.3501,8.106,A,151315,9,2010,8.0,No Holiday,1554806.68


### (2) API HOLIDAY

In [6]:
## (2) API HOLIDAY
## TABLE 2 HOLIDAY DATE LIST 2010-2012

# holiday_df needs no further transformation here: an earlier cell already
# sorted it by Date, reset its index and converted Date to datetime.
# Display preview (first five rows)
holiday_df.head()

Unnamed: 0,Holiday,Year,Date
0,New Year's Day,2010,2010-01-01
1,Independence Day,2010,2010-07-04
2,Thanksgiving Day,2010,2010-11-25
3,Christmas Eve,2010,2010-12-24
4,Christmas Day,2010,2010-12-25


### (3) Walmart Stock 1972-2020 Dataset

In [7]:
## (3) STOCK 1972-2020 DATASET
## TABLE 3 ALL 2010-2012 STOCK DAILY DATASET
#
# Restricts the daily stock table to the span of the sales data and tags every
# trading day with the week-ending date (walmart.Week_Date) it belongs to.

# Convert Date column to datetime and derive the calendar year
stock['Date'] = pd.to_datetime(stock['Date'])
stock['Year'] = stock['Date'].dt.year

# Span of the sales data. min()/max() are used so the result does not depend on
# how the rows of `walmart` happen to be ordered.
first_date = walmart['Start_of_Week'].min()
last_date = walmart['Week_Date'].max()

# Keep only trading days inside the sales range (both ends inclusive), then sort
# and reset the index AFTER sorting so position and label agree.
stock = (
    stock.loc[stock['Date'].between(first_date, last_date)]
    .sort_values(by='Date')
    .reset_index(drop=True)
)
walmart = walmart.sort_values(by='Week_Date')

# Unique week-ending dates, ascending because walmart was just sorted by Week_Date
walmart_date = walmart['Week_Date'].unique()

# Classify each trading day under the first week-ending date on or after it.
# searchsorted replaces the manual cursor, which could index past the end of
# walmart_date and could only advance one week per row. Every position is in
# range because stock['Date'] <= last_date, the largest week-ending date.
week_ends = pd.Series(walmart_date)
week_pos = week_ends.searchsorted(stock['Date'].to_numpy(), side='left')
wk_date = week_ends.to_numpy()[week_pos]

# Add Week Date and rename columns (positional: assumes the seven source columns
# arrive in this order, followed by Year and Week_Date; TODO confirm against the CSV)
stock['Week_Date'] = wk_date
stock.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume', 'Year', 'Week_Date']

# Display preview
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj_Close,Volume,Year,Week_Date
0,2010-02-01,53.619999,53.779999,53.310001,53.48,40.977615,11019500,2010,2010-02-05
1,2010-02-02,53.59,53.720001,53.330002,53.490002,40.985275,11387900,2010,2010-02-05
2,2010-02-03,53.73,54.5,53.639999,54.27,41.582928,17988900,2010,2010-02-05
3,2010-02-04,53.880001,54.299999,52.959999,52.970001,40.58683,21029900,2010,2010-02-05
4,2010-02-05,52.77,53.529999,52.759998,53.450001,40.95462,15545800,2010,2010-02-05


### (4) Walmart Sales Dataset (5 Years) ZIP File

In [None]:
## (4) WALMART SALES DATA (5 YEARS) ZIP FILE SALES_AUG & PRICES CSV CONT.

# `prices` is not created in this notebook; presumably it is loaded by
# extraction.ipynb (TODO confirm).
# Drop NA values of prices: with default arguments dropna() removes every row
# that has a missing value in any column.
prices = prices.dropna()

### (5) Walmart Store Listing Json

In [None]:
## (5) WALMART STORE JSON
## TABLE _ ALL WALMART STORE LISTING

# Keep only rows whose storeType equals 1 (Walmart Supercenter, per the original note)
is_supercenter = store['storeType'] == 1
store = store.loc[is_supercenter]

# Display preview
store.head()

In [None]:
## (5) WALMART STORE JSON
## TABLE _ COUNT OF STORE IN EACH STATE

# Calculate count of stores in each state, most frequent first.
# The column names are set explicitly instead of renaming whatever reset_index
# produces: pandas < 2.0 yields columns ("index", "state") while pandas >= 2.0
# yields ("state", "count"), so a rename keyed on "index"/"state" gives the wrong
# headers on newer versions.
walmart_stores = (
    store['state']
    .value_counts(sort=True)
    .rename_axis('state')
    .reset_index(name='total stores')
)

# Display preview
walmart_stores.head()

### (6) Competition Financial Dataset (Walmart, Amazon, Target, Costco)

In [10]:
## (6) COMPETITION FINANCIAL DATASET

# Columns kept for every company: the reporting year followed by five metrics
_FINANCIAL_COLUMNS = ['Date', 'Revenue USD Mil', 'Operating Income USD Mil',
                      'Net Income USD Mil', 'Shares Mil', 'Earnings Per Share USD']


def _tidy_financials(raw_df, trailing_rows):
    """Reshape one company's financial table into one row per period.

    Steps: transpose, move the old column labels into a 'Date' column, drop the
    last `trailing_rows` rows, drop every column containing a missing value,
    reduce 'Date' (parsed as '%Y-%m') to a four-digit year string, and keep only
    the columns in _FINANCIAL_COLUMNS.
    """
    tidy = raw_df.T.reset_index().rename(columns={'index': 'Date'})
    tidy = tidy[:-trailing_rows].dropna(axis=1, how='any')
    tidy['Date'] = pd.to_datetime(tidy['Date'], format='%Y-%m').dt.strftime('%Y')
    return tidy[_FINANCIAL_COLUMNS]


# Target's table has two trailing rows removed; the other three have one
wmt_df = _tidy_financials(wmt_df, 1)
amz_df = _tidy_financials(amz_df, 1)
tgt_df = _tidy_financials(tgt_df, 2)
cos_df = _tidy_financials(cos_df, 1)

In [11]:
## (6) COMPETITION FINANCIAL DATASET

# Metric columns shared by all four company tables
metric_names = ['Revenue USD Mil', 'Operating Income USD Mil', 'Net Income USD Mil',
                'Shares Mil', 'Earnings Per Share USD']

# Walmart + Amazon: the merge suffixes clashing names with _x (left) and _y (right),
# so map each suffixed name to a company-prefixed one
walmart_amazon_names = {f'{metric}_x': f'Walmart {metric}' for metric in metric_names}
walmart_amazon_names.update({f'{metric}_y': f'Amazon {metric}' for metric in metric_names})
merge1 = pd.merge(wmt_df, amz_df, on=['Date']).rename(columns=walmart_amazon_names)

# + Target (outer join on Date): no name clash remains, so its metrics arrive unsuffixed
target_names = {metric: f'Target {metric}' for metric in metric_names}
merge2 = pd.merge(merge1, tgt_df, on=['Date'], how='outer').rename(columns=target_names)

# + Costco (inner join on Date); the key column is also lower-cased to 'date'
costco_names = {'Date': "date"}
costco_names.update({metric: f'Costco {metric}' for metric in metric_names})
merge_df = pd.merge(merge2, cos_df, on=['Date']).rename(columns=costco_names)

In [12]:
## (6) COMPETITION FINANCIAL DATASET CONT.

# Company prefixes, in the column order used by every per-metric table
_COMPANIES = ['Walmart', 'Amazon', 'Target', 'Costco']


def _metric_table(metric):
    """Return merge_df narrowed to 'date' plus the four companies' `metric` columns, indexed by 'date'."""
    wanted = ['date'] + [f'{company} {metric}' for company in _COMPANIES]
    return merge_df[wanted].set_index('date')


# One table per metric, each indexed by date
revenue_df = _metric_table('Revenue USD Mil')
opincome_df = _metric_table('Operating Income USD Mil')
netincome_df = _metric_table('Net Income USD Mil')
shares_df = _metric_table('Shares Mil')
earnings_df = _metric_table('Earnings Per Share USD')