In [1]:
# Attach dplyr and dslabs
# (uncomment the next line once if dplyr is not installed yet)
# install.packages("dplyr")
library(dplyr)
library(dslabs)

# mutate(): load the murders data and append a rate column,
# computed as total / population scaled to 100,000 people
data("murders")
murders <- mutate(
  murders,
  rate = total / population * 1e5
)

# filter(): keep only the rows whose rate is at most 0.71
filter(
  murders,
  rate <= 0.71
)

# select(): keep only the state, region and rate columns
new_table <- select(
  murders,
  state, region, rate
)

# %>%: chain both steps, passing each result on as the first argument
# of the next call
murders %>%
  select(state, region, rate) %>%
  filter(rate <= 0.71)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



state,abb,region,population,total,rate
Hawaii,HI,West,1360301,7,0.514592
Iowa,IA,North Central,3046355,21,0.6893484
New Hampshire,NH,Northeast,1316470,5,0.3798036
North Dakota,ND,North Central,672591,4,0.5947151
Vermont,VT,Northeast,625741,2,0.3196211


state,region,rate
Hawaii,West,0.514592
Iowa,North Central,0.6893484
New Hampshire,Northeast,0.3798036
North Dakota,North Central,0.5947151
Vermont,Northeast,0.3196211


In [3]:
# Attach dplyr and dslabs, then (re)load the murders dataset.
library(dplyr)
library(dslabs)
data("murders")

# mutate() adds columns to a data frame. It is aware of the data frame's
# column names, so inside the call they can be referenced unquoted:
# we write population rather than murders$population, because mutate()
# knows we are grabbing columns from murders.
murders <- mutate(
  murders,
  population_in_millions = population / 10^6
)

# Print the first rows to check the new column.
head(murders)

state,abb,region,population,total,population_in_millions
Alabama,AL,South,4779736,135,4.779736
Alaska,AK,West,710231,19,0.710231
Arizona,AZ,West,6392017,232,6.392017
Arkansas,AR,South,2915918,93,2.915918
California,CA,West,37253956,1257,37.253956
Colorado,CO,West,5029196,65,5.029196


In [4]:
# Note that if rank(x) gives you the ranks of x from lowest to highest,
# rank(-x) gives you the ranks from highest to lowest.

In [5]:
# Append two columns: the murder rate per 100,000 people and its rank.
# Ranking the negated rate makes rank 1 the highest rate.
murders <- mutate(
  murders,
  rate = total / population * 1e5,
  rank = rank(-rate)
)

# Show the states in the top 5 by murder rate (rank below 6)
filter(
  murders,
  rank < 6
)

state,abb,region,population,total,population_in_millions,rate,rank
District of Columbia,DC,South,601723,99,0.601723,16.452753,1
Louisiana,LA,South,4533372,351,4.533372,7.742581,2
Maryland,MD,South,5773552,293,5.773552,5.074866,4
Missouri,MO,North Central,5988927,321,5.988927,5.359892,3
South Carolina,SC,South,4625364,207,4.625364,4.475323,5


In [9]:
# Using the pipe %>%
# The pipe %>% performs operations sequentially without having to define
# intermediate objects.

library(dplyr)

# Redefine murders so that it includes rate (murders per 100,000 people)
# and rank (rank 1 = highest rate). This is computed once; both approaches
# below reuse the same columns.
murders <- mutate(
  murders,
  rate = total / population * 100000,
  rank = rank(-rate)
)

# Approach 1 (solution to the previous exercise): two separate steps.
# First create an intermediate table of Northeast/West states with a
# rate below 1 ...
my_states <- filter(murders, region %in% c("Northeast", "West") & rate < 1)

# ... then use select to show only the state name, the murder rate and
# the rank.
select(my_states, state, rate, rank)

# Approach 2: the pipe %>% performs both operations sequentially, without
# the intermediate variable my_states. The result contains the same rows
# and columns as approach 1.
murders %>%
  filter(region %in% c("Northeast", "West") & rate < 1) %>%
  select(state, rate, rank)

# Note that filter and select no longer have a data frame as the first
# argument. The first argument is assumed to be the result of the operation
# conducted right before the %>%.

state,rate,rank
Hawaii,0.514592,49
Idaho,0.7655102,46
Maine,0.8280881,44
New Hampshire,0.3798036,50
Oregon,0.9396843,42
Utah,0.795981,45
Vermont,0.3196211,51
Wyoming,0.8871131,43


state,rate,rank
Hawaii,0.514592,49
Idaho,0.7655102,46
Maine,0.8280881,44
New Hampshire,0.3798036,50
Oregon,0.9396843,42
Utah,0.795981,45
Vermont,0.3196211,51
Wyoming,0.8871131,43
