# Walmart Retail Analytics

### I. Initial Steps

Import the packages

In [1]:
# libraries
import pandas as pd
import numpy as np
import seaborn as sns

# built in packages
import math
from datetime import datetime

# hide warnings
import warnings
warnings.filterwarnings('ignore')

Load the queried data <br>
`Queried using Big Query: https://console.cloud.google.com/bigquery?sq=936168413614:1d9ff9c5249a4e1eae5bc257061cb810`

In [2]:
data = pd.read_csv('walmart train set with features.csv')

Take a peak

In [3]:
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,IsHoliday,Dept,CPI,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,Unemployment
0,1,2010-08-27,15793.87,False,1,211.567306,85.22,2.619,,,,,,7.787
1,1,2010-08-27,49623.53,False,2,211.567306,85.22,2.619,,,,,,7.787
2,1,2010-08-27,51159.17,False,3,211.567306,85.22,2.619,,,,,,7.787
3,1,2010-08-27,36404.6,False,4,211.567306,85.22,2.619,,,,,,7.787
4,1,2010-08-27,13570.32,False,5,211.567306,85.22,2.619,,,,,,7.787


### II. Data Preprocessing

Extract the year, month, and day from the Date column

In [4]:
data['Month'] = pd.DatetimeIndex(data['Date']).month
data['Day'] = pd.DatetimeIndex(data['Date']).day
data['Year'] = pd.DatetimeIndex(data['Date']).year

Replace the numeric value of the month column with the equivalent Month

In [5]:
Months = {1: 'J', 2: 'F', 3: 'M', 4: 'A', 5: 'M', 6: 'Jn', 7: 'J',
         8: 'A', 9: 'S', 10: 'O', 11: 'N', 12: 'D'}

data['Month'] = data['Month'].apply(lambda x: Months[int(x)])

Get rid of the Date column

In [6]:
data.drop(['Date'], axis=1)

Unnamed: 0,Store,Weekly_Sales,IsHoliday,Dept,CPI,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,Unemployment,Month,Day,Year
0,1,15793.87,False,1,211.567306,85.22,2.619,,,,,,7.787,A,27,2010
1,1,49623.53,False,2,211.567306,85.22,2.619,,,,,,7.787,A,27,2010
2,1,51159.17,False,3,211.567306,85.22,2.619,,,,,,7.787,A,27,2010
3,1,36404.60,False,4,211.567306,85.22,2.619,,,,,,7.787,A,27,2010
4,1,13570.32,False,5,211.567306,85.22,2.619,,,,,,7.787,A,27,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,1776.44,False,93,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421566,45,4120.46,False,94,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421567,45,50973.62,False,95,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421568,45,6371.68,False,97,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012


Looking at the measures of the data, we will notice that the minimum value of Weekly Sales is negative. This is problematic because we're talking about Sales, not revenue so it can't be negative. Since we don't have info on whether a robbery happened in this stores or if it was a typographical error, we have to remove the negative values.

In [7]:
data.describe()

Unnamed: 0,Store,Weekly_Sales,Dept,CPI,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,Unemployment,Day,Year
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,150681.0,111248.0,137091.0,134967.0,151432.0,421570.0,421570.0,421570.0
mean,22.200546,15981.258123,44.260317,171.201947,60.090059,3.361027,7246.420196,3334.628621,1439.421384,3383.168256,4628.975079,7.960289,15.673131,2010.968591
std,12.785297,22711.183519,30.492054,39.159276,18.447931,0.458515,8291.221345,9475.357325,9623.07829,6292.384031,5962.887455,1.863296,8.753549,0.796876
min,1.0,-4988.94,1.0,126.064,-2.06,2.472,0.27,-265.76,-29.1,0.22,135.16,3.879,1.0,2010.0
25%,11.0,2079.65,18.0,132.022667,46.68,2.933,2240.27,41.6,5.08,504.22,1878.44,6.891,8.0,2010.0
50%,22.0,7612.03,37.0,182.31878,62.09,3.452,5347.45,192.0,24.6,1481.31,3359.45,7.866,16.0,2011.0
75%,33.0,20205.8525,74.0,212.416993,74.28,3.738,9210.9,1926.94,103.99,3595.04,5563.8,8.572,23.0,2012.0
max,45.0,693099.36,99.0,227.232807,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,14.313,31.0,2012.0


After counting the negative values in the dataset, there are 1285 negative values. We can remove it since overall we have more than a hundred thousand data.

In [8]:
data['Weekly_Sales'].value_counts(bins=[-100000, -0.00000000001, 1000000])

(-1e-11, 1000000.0]      420285
(-100000.001, -1e-11]      1285
Name: Weekly_Sales, dtype: int64

To easily remove the negative values, convert them into nan.

In [9]:
data['Weekly_Sales'] = data['Weekly_Sales'].apply(lambda x: float('nan') if x < 0 else x)

This verifies that we have 1285 null values for the Weekly_Sales column. We will drop these null values later.

In [10]:
data['Weekly_Sales'].isnull().sum()

1285

### III. Separate the dataset into two:

***Dataset 1:* To get rid of the null values and be able to analyze the effect of different variables on weekly sales, drop all the Markdown columns.**

This is because there are a lot of null values in these columns.

In [11]:
MarkDown_array = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']

data_no_mark_down = data.drop(MarkDown_array, axis=1)

Drop the rows with null values for the new dataframe.

In [12]:
data_no_mark_down.dropna(inplace=True)

Save the preprocessed data into a csv file.

In [13]:
data_no_mark_down.to_csv('data_no_mark_down.csv')

***Dataset 2:* To get the effect of Markdowns on Weekly Sales, drop the rows that contain no information on any Markdown.**

The Markdown values was converted into absolute value to get rid of the negative since it doesn't make sense that Walmart will record their Markup values. It was also converted into string so that the nan values will turn into a string 'nan.'

In [14]:
for i in MarkDown_array:
    data[i] = data[i].apply(lambda x: str(abs(x)))

A new column 'MD' was temporarily created to store the concatenated values of the 5 MarkDown columns. We turned nan into a string so those rows with null values for all the MarkDown columns will have 'nannannannannan' for their 'MD' column. That way, we can distinguish them and remove them.

In [15]:
data['MD'] = data['MarkDown1'] + data['MarkDown2'] + data['MarkDown3'] + data['MarkDown4'] + data['MarkDown5']

To be able to easily remove those rows, they were turned into nan, which is a None type.

In [16]:
data['MD'] = data['MD'].apply(lambda x: float('nan') if x == 'nannannannannan' else x)

A single line of code was run to remove all the rows with null values. The next line removed the column 'MD' since it's not needed anymore.

In [17]:
data.dropna(inplace=True)
data.drop(['MD'], axis = 1)

Unnamed: 0,Store,Date,Weekly_Sales,IsHoliday,Dept,CPI,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,Unemployment,Month,Day,Year
2898,1,2012-10-05,21904.47,False,1,223.181477,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,O,5,2012
2899,1,2012-10-05,48577.08,False,2,223.181477,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,O,5,2012
2900,1,2012-10-05,11676.98,False,3,223.181477,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,O,5,2012
2901,1,2012-10-05,39311.93,False,4,223.181477,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,O,5,2012
2902,1,2012-10-05,25508.81,False,5,223.181477,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,O,5,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,2012-01-20,1776.44,False,93,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421566,45,2012-01-20,4120.46,False,94,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421567,45,2012-01-20,50973.62,False,95,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012
421568,45,2012-01-20,6371.68,False,97,189.421473,31.85,3.533,3205.9,4038.51,2.19,488.63,2344.09,8.424,J,20,2012


The preprocessed data was saved into a csv file.

In [18]:
data.to_csv('data_with_markdown.csv')

*NEXT: VISUALIZATION*