In [1]:
# Things to keep in mind with the data:
# (1) The calendar week for Walmart changes each year, so the holiday week may capture more or fewer days related
# to the holiday
# (2) The data has proportionally more weeks that are not among the holidays we are examining; this could skew
# our results
# (3) We are only looking at 4 major holidays; we are not including local holidays, sports events, and other events
# that could affect weekly sales
# (4) This dataset only covers 45 stores out of all Walmart stores across the United States; this subset may not be
# an accurate depiction of Walmart as a whole

In [2]:
import pandas as pd
import csv
from scipy import stats

# Load the cleaned Walmart sales data from the working directory.
# NOTE(review): the preview below shows three "Unnamed: ..." columns; they look like
# index columns written out by earlier saves -- confirm before relying on them.
walmart_clean = pd.read_csv(filepath_or_buffer="Walmart.csv")

# Preview the first five rows.
walmart_clean.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Store,Date,IsHoliday,Dept,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,...,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Day,Holiday Name
0,0,1,1,2010-02-05,False,1.0,24924.5,5.73,2.572,,...,,,211.096358,8.106,A,151315,2010,2,5,No Holiday
1,1,2,1,2010-02-05,False,26.0,11737.12,5.73,2.572,,...,,,211.096358,8.106,A,151315,2010,2,5,No Holiday
2,2,3,1,2010-02-05,False,17.0,13223.76,5.73,2.572,,...,,,211.096358,8.106,A,151315,2010,2,5,No Holiday
3,3,4,1,2010-02-05,False,45.0,37.44,5.73,2.572,,...,,,211.096358,8.106,A,151315,2010,2,5,No Holiday
4,4,5,1,2010-02-05,False,28.0,1085.29,5.73,2.572,,...,,,211.096358,8.106,A,151315,2010,2,5,No Holiday


In [3]:
# Flag every row as "Holiday" or "No Holiday".
# Rows whose 'Holiday Name' is exactly 'No Holiday' keep that value; every other row
# (any named holiday, or a missing name) becomes "Holiday". Done as one vectorized
# operation instead of a Python loop over all rows.
walmart_clean['Is Holiday'] = walmart_clean['Holiday Name'].where(
    walmart_clean['Holiday Name'] == 'No Holiday', "Holiday"
)

#Group by Is Holiday: total weekly sales for holiday vs. non-holiday rows
walmart_ttl = walmart_clean.groupby('Is Holiday')[['Weekly_Sales']].sum()

#Convert Weekly Sales to int to view whole number
# (int64 is requested explicitly so multi-billion totals cannot overflow a 32-bit default int)
walmart_ttl['Weekly_Sales'] = walmart_ttl['Weekly_Sales'].astype('int64')

# Share of total sales per group; computed while the column is still numeric
walmart_ttl['Percentage %'] = walmart_ttl['Weekly_Sales'] / walmart_ttl['Weekly_Sales'].sum()

#Format (from here on both columns hold display strings, not numbers)
walmart_ttl['Weekly_Sales'] = walmart_ttl['Weekly_Sales'].map("${:,.2f}".format)
walmart_ttl['Percentage %'] = walmart_ttl['Percentage %'].map("{:.0%}".format)

#Display
walmart_ttl

#The 4 major holidays account for about 8% of total sales

Unnamed: 0_level_0,Weekly_Sales,Percentage %
Is Holiday,Unnamed: 1_level_1,Unnamed: 2_level_1
Holiday,"$534,564,028.00",8%
No Holiday,"$6,202,654,958.00",92%


In [4]:
# Number of rows per holiday label, most frequent first
walmart_clean.loc[:, 'Holiday Name'].value_counts()

# Disproportionately more data in No Holiday than in the 4 major holidays we're examining

No Holiday          393586
Christmas             8947
Independence Day      8840
Thanksgiving          6004
New Year              5948
Name: Holiday Name, dtype: int64

In [5]:
# Average weekly sales per holiday label, ranked from highest to lowest
sales_rank = (
    walmart_clean.groupby("Holiday Name")['Weekly_Sales']
    .mean()
    .to_frame()
    .sort_values(by='Weekly_Sales', ascending=False)
)
sales_rank

# (1) Thanksgiving surprisingly has higher average weekly sales than Christmas. This could be because
# we're only looking at the week of Christmas itself: although some people do last-minute shopping,
# others shop early, and those sales would not be reflected in our results
# (2) New Year has the lowest average weekly sales, even below the No Holiday weeks

Unnamed: 0_level_0,Weekly_Sales
Holiday Name,Unnamed: 1_level_1
Thanksgiving,22220.944538
Christmas,18805.481424
Independence Day,16715.38545
No Holiday,15822.65538
New Year,14862.139543


In [6]:
# Collapse the data to one row per (Date, Store, Holiday Name, Is Holiday):
# weekly sales are summed, the remaining indicators are averaged.
# NOTE(review): sum() yields 0 for a group whose sales are all missing, so a total of
# exactly 0.0 is not necessarily a real zero -- confirm against the raw data.
walmart_groupby = walmart_clean.groupby(['Date', 'Store', 'Holiday Name', 'Is Holiday'])
walmart_holiday = walmart_groupby.agg({
    'Weekly_Sales': 'sum',
    **dict.fromkeys(['Temperature', 'CPI', 'Unemployment', 'Fuel_Price'], 'mean'),
}).reset_index()

# Number of aggregated rows per holiday label
walmart_holiday.loc[:, 'Holiday Name'].value_counts()

No Holiday          7560
Christmas            180
Independence Day     180
Thanksgiving         135
New Year             135
Name: Holiday Name, dtype: int64

In [7]:
# Five highest store-level weekly totals
walmart_holiday.sort_values('Weekly_Sales', ascending=False).head(n=5)

# The three highest weekly totals all come from the Christmas week of 2010-12-24;
# the fourth (2011-12-23) is the week before Christmas and is labelled No Holiday

Unnamed: 0,Date,Store,Holiday Name,Is Holiday,Weekly_Sales,Temperature,CPI,Unemployment,Fuel_Price
2083,2010-12-24,14,Christmas,Holiday,3818686.45,-0.78,182.54459,8.724,3.141
2089,2010-12-24,20,Christmas,Holiday,3766687.43,-3.79,204.637673,7.484,3.141
2079,2010-12-24,10,Christmas,Holiday,3749057.69,13.92,126.983581,9.003,3.236
4413,2011-12-23,4,No Holiday,No Holiday,3676388.98,2.18,129.984548,5.143,3.103
2082,2010-12-24,13,Christmas,Holiday,3595903.2,1.61,126.983581,7.795,2.846


In [8]:
# Five lowest store-level weekly totals
walmart_holiday.sort_values('Weekly_Sales', ascending=True).head(n=5)

# The lowest totals are exactly 0.0 and fall in 2013 (several stores on 2013-02-15 and store 45 on 2013-07-26)
# NOTE(review): a total of exactly 0 may reflect weeks with no recorded sales rather than
# genuinely zero sales -- confirm before drawing conclusions about February

Unnamed: 0,Date,Store,Holiday Name,Is Holiday,Weekly_Sales,Temperature,CPI,Unemployment,Fuel_Price
8189,2013-07-26,45,No Holiday,No Holiday,0.0,24.48,,,3.804
7151,2013-02-15,42,No Holiday,No Holiday,0.0,7.67,132.272571,6.897,3.612
7152,2013-02-15,43,No Holiday,No Holiday,0.0,8.7,215.594112,8.934,3.475
7153,2013-02-15,44,No Holiday,No Holiday,0.0,-2.71,132.272571,4.983,3.323
7154,2013-02-15,45,No Holiday,No Holiday,0.0,2.15,192.943471,8.625,3.814


In [9]:
# (1) What effect does temperature have on weekly sales?
# Hypothesis: Expecting extreme weather temperature to have lower sales

#Create bin and label in dataframe
# The outer edges are padded by 2 beyond the observed min/max so every temperature falls
# inside a bin. Series.min()/max() skip missing values, whereas the built-in min()/max()
# give an order-dependent result when NaN is present.
bins = [walmart_holiday['Temperature'].min() - 2, -10, 0, 10, 20, 30,
        walmart_holiday['Temperature'].max() + 2]
labels = ["<-10", "-10 to -1", "0 to 9", "10 to 19", "20 to 30", ">30"]

# NOTE(review): pd.cut closes each interval on the right by default, so a value exactly on
# an edge (e.g. 0 or 10) is placed in the lower bin, although the labels read as if it
# belonged to the upper one. Pass right=False and adjust the labels if edge values matter.
walmart_holiday['Temp Category'] = pd.cut(walmart_holiday['Temperature'], bins, labels=labels)

#Check Distribution of Bins
walmart_holiday['Temp Category'].value_counts()

10 to 19     2621
20 to 30     2480
0 to 9       1839
-10 to -1     684
>30           518
<-10           48
Name: Temp Category, dtype: int64

In [10]:
# (1) Temp vs. Weekly Sales CONT.

#Which temperature category has the highest total weekly sales, and the highest weekly sales on average
# Printed explicitly: only the last expression of a cell is displayed automatically, so
# evaluating these mid-cell without print would silently discard the results.
print(walmart_holiday.groupby("Temp Category")['Weekly_Sales'].agg(['sum', 'mean']))

#Linear Regression of weekly sales on temperature (all weeks)
(slope, intercept, rvalue, pvalue, stderr) = stats.linregress(walmart_holiday['Temperature'], walmart_holiday["Weekly_Sales"])

#Calculate x-values and regression values (fitted line)
x_values = walmart_holiday['Temperature']
y_values = slope*x_values + intercept

#Create linear regression equation
line_eq = 'y = ' + str(round(slope,2)) + "x + " + str(round(intercept,2))

print(line_eq, rvalue)

# (1) The equation suggests a positive relationship between Weekly Sales and Temperature,
# i.e. an increase in temperature is associated with higher weekly sales
# (2) The R-value is very close to 0, which suggests a very poor linear fit to the data;
# linear regression is not a good model for these two variables

y = 2857.57x + 779186.27 0.044968819166433784


In [11]:
# Split the aggregated data into non-holiday weeks and holiday weeks
not_holiday = walmart_holiday[walmart_holiday['Is Holiday'] == 'No Holiday']
holiday = walmart_holiday[walmart_holiday['Is Holiday'] == 'Holiday']

# Fit weekly sales against temperature separately for each subset
slope_h, intercept_h, rvalue_h, pvalue_h, stderr_h = stats.linregress(
    holiday['Temperature'], holiday["Weekly_Sales"]
)
slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    not_holiday['Temperature'], not_holiday["Weekly_Sales"]
)

# x-values and fitted regression values for each subset
x_holiday = holiday['Temperature']
y_holiday = slope_h*x_holiday + intercept_h
x_values = not_holiday['Temperature']
y_values = slope*x_values + intercept

# Regression equations as text, coefficients rounded to 2 decimals
eq_holiday = 'y = %sx + %s' % (round(slope_h, 2), round(intercept_h, 2))
eq_regular = 'y = %sx + %s' % (round(slope, 2), round(intercept, 2))

print(eq_holiday, rvalue_h)
print(eq_regular, rvalue)

# After subsetting the data, linear regression is still not a good model for either subset
# Both slopes are positive, but both R-values are close to 0, so the relationship is very weak

y = 532.59x + 842650.55 0.007884263252436735
y = 3261.13x + 769756.53 0.05131029862621493


In [13]:
# (2) What effect do regional fuel prices have on weekly sales?
# Hypothesis: higher fuel prices would be expected to decrease weekly sales, since there is less money to spend

# Regression of weekly sales on fuel price, all weeks together
slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    walmart_holiday['Fuel_Price'], walmart_holiday["Weekly_Sales"]
)

x_values = walmart_holiday['Fuel_Price']
y_values = slope*x_values + intercept

line_eq = 'y = %sx + %s' % (round(slope, 2), round(intercept, 2))

print(line_eq, rvalue)

# Same regression, fitted separately on holiday and non-holiday weeks
# (this overwrites slope/intercept/rvalue/pvalue/stderr with the non-holiday fit)
slope_h, intercept_h, rvalue_h, pvalue_h, stderr_h = stats.linregress(
    holiday['Fuel_Price'], holiday["Weekly_Sales"]
)
slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    not_holiday['Fuel_Price'], not_holiday["Weekly_Sales"]
)

# x-values and fitted regression values for each subset
x_holiday = holiday['Fuel_Price']
y_holiday = slope_h*x_holiday + intercept_h
x_values = not_holiday['Fuel_Price']
y_values = slope*x_values + intercept

# Regression equations as text, coefficients rounded to 2 decimals
eq_holiday = 'y = %sx + %s' % (round(slope_h, 2), round(intercept_h, 2))
eq_regular = 'y = %sx + %s' % (round(slope, 2), round(intercept, 2))

print(eq_holiday, rvalue_h)
print(eq_regular, rvalue)

# Negative correlation between fuel price and weekly sales: not surprisingly, as fuel prices increase,
# weekly sales decrease, since families would have to spend more on fuel

y = -199182.45x + 1501029.05 -0.13028966801241113
y = -607948.46x + 2834539.12 -0.23721984188559986
y = -181855.97x + 1441966.89 -0.12323690157962891
