In [1]:
library(dplyr)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




# Reading files

In [2]:
# Load the weekly store-level sales file into a tibble. readr guesses the
# column types; Date arrives as character text (see the column spec below).
df <- readr::read_csv(file = "Walmart_Store_sales.csv")

Parsed with column specification:
cols(
  Store = col_double(),
  Date = col_character(),
  Weekly_Sales = col_double(),
  Holiday_Flag = col_double(),
  Temperature = col_double(),
  Fuel_Price = col_double(),
  CPI = col_double(),
  Unemployment = col_double()
)



# Data overview

In [3]:
# Preview the first rows of the data to sanity-check the parsed values.
df %>% head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,05-02-2010,1643691,0,42.31,2.572,211.0964,8.106
1,12-02-2010,1641957,1,38.51,2.548,211.2422,8.106
1,19-02-2010,1611968,0,39.93,2.514,211.2891,8.106
1,26-02-2010,1409728,0,46.63,2.561,211.3196,8.106
1,05-03-2010,1554807,0,46.5,2.625,211.3501,8.106
1,12-03-2010,1439542,0,57.79,2.667,211.3806,8.106


In [4]:
# Transposed overview: one line per column with its type and leading values.
df %>% glimpse()

Rows: 6,435
Columns: 8
$ Store        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ Date         <chr> "05-02-2010", "12-02-2010", "19-02-2010", "26-02-2010"...
$ Weekly_Sales <dbl> 1643691, 1641957, 1611968, 1409728, 1554807, 1439542, ...
$ Holiday_Flag <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ Temperature  <dbl> 42.31, 38.51, 39.93, 46.63, 46.50, 57.79, 54.58, 51.45...
$ Fuel_Price   <dbl> 2.572, 2.548, 2.514, 2.561, 2.625, 2.667, 2.720, 2.732...
$ CPI          <dbl> 211.0964, 211.2422, 211.2891, 211.3196, 211.3501, 211....
$ Unemployment <dbl> 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106...


# Data manipulation

## Select columns

In [5]:
# Keep only the three named columns, then preview the result.
df %>%
    select(Store, Date, Weekly_Sales) %>%
    head()

Store,Date,Weekly_Sales
<dbl>,<chr>,<dbl>
1,05-02-2010,1643691
1,12-02-2010,1641957
1,19-02-2010,1611968
1,26-02-2010,1409728
1,05-03-2010,1554807
1,12-03-2010,1439542


## Filter rows

In [6]:
# Keep only the rows whose Store value equals 1, then preview the result.
df %>%
    filter(Store == 1) %>%
    head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,05-02-2010,1643691,0,42.31,2.572,211.0964,8.106
1,12-02-2010,1641957,1,38.51,2.548,211.2422,8.106
1,19-02-2010,1611968,0,39.93,2.514,211.2891,8.106
1,26-02-2010,1409728,0,46.63,2.561,211.3196,8.106
1,05-03-2010,1554807,0,46.5,2.625,211.3501,8.106
1,12-03-2010,1439542,0,57.79,2.667,211.3806,8.106


## Sort rows

In [7]:
# Order rows from the highest to the lowest weekly sales, then preview.
df %>%
    arrange(desc(Weekly_Sales)) %>%
    head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
14,24-12-2010,3818686,0,30.59,3.141,182.5446,8.724
20,24-12-2010,3766687,0,25.17,3.141,204.6377,7.484
10,24-12-2010,3749058,0,57.06,3.236,126.9836,9.003
4,23-12-2011,3676389,0,35.92,3.103,129.9845,5.143
13,24-12-2010,3595903,0,34.9,2.846,126.9836,7.795
13,23-12-2011,3556766,0,24.76,3.186,129.9845,6.392


## Create or modify columns

In [8]:
# Overwrite Date with parsed day-month-year dates and add a new column with
# weekly sales divided by 1000; the remaining columns pass through unchanged.
df %>%
    mutate(
        Date = lubridate::dmy(Date),
        Weekly_Sales_K = Weekly_Sales / 1000
    ) %>%
    head()

Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Weekly_Sales_K
<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2010-02-05,1643691,0,42.31,2.572,211.0964,8.106,1643.691
1,2010-02-12,1641957,1,38.51,2.548,211.2422,8.106,1641.957
1,2010-02-19,1611968,0,39.93,2.514,211.2891,8.106,1611.968
1,2010-02-26,1409728,0,46.63,2.561,211.3196,8.106,1409.728
1,2010-03-05,1554807,0,46.5,2.625,211.3501,8.106,1554.807
1,2010-03-12,1439542,0,57.79,2.667,211.3806,8.106,1439.542


## Rename columns

In [9]:
# rename() takes new_name = old_name; every other column keeps its name.
df %>%
    rename(is_holiday = Holiday_Flag) %>%
    head()

Store,Date,Weekly_Sales,is_holiday,Temperature,Fuel_Price,CPI,Unemployment
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,05-02-2010,1643691,0,42.31,2.572,211.0964,8.106
1,12-02-2010,1641957,1,38.51,2.548,211.2422,8.106
1,19-02-2010,1611968,0,39.93,2.514,211.2891,8.106
1,26-02-2010,1409728,0,46.63,2.561,211.3196,8.106
1,05-03-2010,1554807,0,46.5,2.625,211.3501,8.106
1,12-03-2010,1439542,0,57.79,2.667,211.3806,8.106


## Group by and summarise

In [10]:
# Collapse to one row per store holding the sum of that store's weekly sales.
df %>%
    group_by(Store) %>%
    summarise(total_sales = sum(Weekly_Sales)) %>%
    head()

Store,total_sales
<dbl>,<dbl>
1,222402809
2,275382441
3,57586735
4,299543953
5,45475689
6,223756131


## Window functions

In [11]:
# Window-style aggregate: a grouped mutate() keeps every input row and repeats
# the per-store total next to it instead of collapsing to one row per store.
df %>%
    select(Store, Date, Weekly_Sales) %>%
    group_by(Store) %>%
    mutate(total_sales = sum(Weekly_Sales)) %>%
    # Drop the grouping so that any verb applied to this result afterwards is
    # not silently evaluated per store.
    ungroup() %>%
    head()

Store,Date,Weekly_Sales,total_sales
<dbl>,<chr>,<dbl>,<dbl>
1,05-02-2010,1643691,222402809
1,12-02-2010,1641957,222402809
1,19-02-2010,1611968,222402809
1,26-02-2010,1409728,222402809
1,05-03-2010,1554807,222402809
1,12-03-2010,1439542,222402809


# R pipes and method chaining

In [12]:
# Top five stores by weekly sales for every holiday week, newest week first.
df %>%
    # Lower-case every column name (rename_with() supersedes select_all()).
    rename_with(tolower) %>%
    filter(holiday_flag == 1) %>%
    select(date, store, weekly_sales) %>%
    mutate(date = lubridate::dmy(date)) %>%
    group_by(date) %>%
    # Name the ranking column explicitly instead of relying on top_n()'s
    # "use the last column" default; ties at the cut-off are kept, as before.
    slice_max(weekly_sales, n = 5) %>%
    ungroup() %>%
    # Sort after slicing so the displayed order is stated explicitly rather
    # than depending on how the grouped slice orders its output.
    arrange(desc(date), desc(weekly_sales)) %>%
    # Format last: the column becomes character text from here on.
    mutate(weekly_sales = scales::label_comma()(weekly_sales))

Selecting by weekly_sales



date,store,weekly_sales
<date>,<dbl>,<chr>
2012-09-07,13,2165796
2012-09-07,4,2125105
2012-09-07,20,2080529
2012-09-07,14,1904512
2012-09-07,2,1898777
2012-02-10,20,2462978
2012-02-10,4,2374661
2012-02-10,10,2218596
2012-02-10,2,2103323
2012-02-10,14,2077256
