# 16_使用purrr实现迭代

purrr包在迭代方面确实很好用，有些地方也不是很好理解，有多好用我们看例子

In [1]:
library(tidyverse)

─ [1mAttaching packages[22m ──────────────────── tidyverse 1.2.1 ─
[32m✔[39m [34mggplot2[39m 3.2.0     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
─ [1mConflicts[22m ───────────────────── tidyverse_conflicts() ─
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


## for 循环

In [2]:
# 这里是一个简单的数据框
df <- tibble(
    a = rnorm(10),
    b = rnorm(10),
    c = rnorm(10),
    d = rnorm(10)
)

# 例如我们要计算中位数
median(df$a)
median(df$b)
median(df$c)
median(df$d)
# 复制粘贴也可以接受

# for循环解决方法

output <- vector("double", ncol(df))
for (i in seq_along(df)) {
    output[[i]] <- median(df[[i]])
}
# vector()函数创建给定长度的空向量
# seq_along()函数生成一定长度的序列
# 双方括号要比单方括号好，在同样的情况下
output

## 练习

In [3]:
# a. 计算出mtcars数据集中每列的均值。
head(mtcars)
mean(mtcars$mpg)
# ;
# ;
# ;手动也可以，很麻烦
mean(mtcars$carb)

# for循环实现
out_mean <- vector("double", ncol(mtcars))
for (i in seq_along(mtcars)){
    out_mean[[i]] <- mean(mtcars[[i]])
}

out_mean

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [4]:
# b. 确定nycflights13::flights数据集中每列的类型。
library(nycflights13)
head(flights)
typeof(flights$year)# 查看数据类型

# 循环体已经知道，准备循环
out_type <- vector("character", ncol(flights))
# 使用charactor是因为typeof输出是字符型
for (i in seq_along(flights)){
    out_type[[i]] <- typeof(flights[[i]])
}

out_type %>% t()

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
2013,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
2013,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00
2013,1,1,554,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
integer,integer,integer,integer,integer,double,integer,integer,double,character,integer,character,character,character,double,double,double,double,double


In [5]:
# c. 计算出iris数据集中每列唯一值的数量
head(iris)# 查看数据
unique(iris$Species)%>%length()# unique()函数去重
table(iris$Species)
# 以上是重复体
out_num <- vector("integer", ncol(iris))
for (i in seq_along(iris)){
    out_num[[i]] <- length(unique(iris[[i]]))
}
out_num

# 下面对向量进行了命名，更清晰
iris_uniq <- vector("double", ncol(iris))
names(iris_uniq) <- names(iris)
for (i in names(iris)) {
  iris_uniq[i] <- length(unique(iris[[i]]))
}
iris_uniq

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa



    setosa versicolor  virginica 
        50         50         50 

In [6]:
# d. 分别使用μ= -10、0、10和100的正态分布生成10个随机数

# 直接写了
mu <- c(-10, 0, 10, 100)
output <- vector("list",length(mu))
for (i in seq_along(mu)){
    output[[i]] <- rnorm(10, mean = mu[[i]])
}
output

In [7]:
# 使用支持向量运算的现有函数替换for循环
out <- ""
for (x in letters) {   
    out <- stringr::str_c(out, x) 
} 
out

str_c(letters, collapse = "")

In [8]:
# 使用支持向量运算的现有函数替换以下示例中的for循环
x <-sample(100) 
sd <- 0
for (i in seq_along(x)) {  
    sd <- sd + (x[i] -mean(x)) ^ 2
} 
    sd <-sqrt(sd / (length(x) -1)) 
# 能看出来是计算样本标准差
sd
sd(x)# 现成函数很方便

In [9]:
# 使用支持向量运算的现有函数替换以下示例中的for循环
x <-runif(100) 
# x <- 1:10
out <-vector("numeric", length(x)) 
out[1] <- x[1] 
for (i in 2:length(x)) {  
    out[i] <- out[i -1] + x[i] 
}
out %>% matrix(nrow = 10)
accumulate(x, `+`) %>% matrix(nrow = 10)# 一个累计函数就解决了

0,1,2,3,4,5,6,7,8,9
0.6973035,3.938048,8.259073,13.67238,17.15333,22.35866,29.25973,34.1049,40.8017,46.68028
1.3506894,4.384395,8.756041,14.10328,17.47762,22.91591,29.86711,34.71337,40.94017,47.60513
1.6900253,4.754075,8.877519,14.13409,17.79589,23.72497,30.28393,35.39988,41.76018,48.51303
1.7954746,4.872559,9.830943,14.14016,18.6,24.13645,31.04306,36.22826,42.63365,49.08509
2.4524392,5.03389,9.974643,14.19695,19.22917,25.0127,31.19702,36.69341,43.16276,49.4977
2.5742237,5.298167,10.853133,14.35575,19.87172,25.81202,31.26369,37.65621,44.04778,50.0102
3.2427838,5.578703,11.613923,14.59185,20.28119,26.70988,31.87535,38.52491,44.78916,50.45304
3.3848914,6.527052,11.957272,15.44704,20.53768,27.64155,32.12513,39.51977,45.62506,50.87641
3.6706284,7.204942,12.898611,15.97371,21.21989,28.44343,32.5567,40.39189,45.97259,51.13508
3.6844165,8.110477,13.113221,16.96792,21.5237,28.78806,33.34381,40.56337,46.23726,51.56451


0,1,2,3,4,5,6,7,8,9
0.6973035,3.938048,8.259073,13.67238,17.15333,22.35866,29.25973,34.1049,40.8017,46.68028
1.3506894,4.384395,8.756041,14.10328,17.47762,22.91591,29.86711,34.71337,40.94017,47.60513
1.6900253,4.754075,8.877519,14.13409,17.79589,23.72497,30.28393,35.39988,41.76018,48.51303
1.7954746,4.872559,9.830943,14.14016,18.6,24.13645,31.04306,36.22826,42.63365,49.08509
2.4524392,5.03389,9.974643,14.19695,19.22917,25.0127,31.19702,36.69341,43.16276,49.4977
2.5742237,5.298167,10.853133,14.35575,19.87172,25.81202,31.26369,37.65621,44.04778,50.0102
3.2427838,5.578703,11.613923,14.59185,20.28119,26.70988,31.87535,38.52491,44.78916,50.45304
3.3848914,6.527052,11.957272,15.44704,20.53768,27.64155,32.12513,39.51977,45.62506,50.87641
3.6706284,7.204942,12.898611,15.97371,21.21989,28.44343,32.5567,40.39189,45.97259,51.13508
3.6844165,8.110477,13.113221,16.96792,21.5237,28.78806,33.34381,40.56337,46.23726,51.56451


## for循环变体

In [10]:
#修改现有对象

# 还是前面的例子
df <- tibble(
    a = rnorm(10),
    b = rnorm(10),
    c = rnorm(10),
    d = rnorm(10)
)

rescale01 <- function(x) {
    rng <- range(x, na.rm = T)
    (x - rng[1]) / (rng[2] - rng[1])
}

df$a <-rescale01(df$a) 
df$b <-rescale01(df$b) 
df$c <-rescale01(df$c) 
df$d <-rescale01(df$d)

# for循环实现如下
for (i in seq_along(df)) {
    df[[i]] <- rescale01(df[[i]])
}

# 要记住使用[[，而不是[。你或许已经发现了，
# 我们在所有for循环中使用的都是[[。
# 我们认为甚至在原子向量中最好也使用[[，
# 因为它可以明确表示我们要处理的是单个元素

# 循环模式

# 除了前面的数值索引，
# 还可以通过元素进行循环以及按名称进行循环，
# 前面已经用过了

# 如果想要创建命名的输出向量，请一定按照如下方式进行命名：

# results <-vector("list", length(x)) 

# names(results) <-names(x)

result <- vector("double", length(df))
names(result) <- names(df)

# for (i in seq_along(df)){
#     name <- names(df)[[i]]
#     value <- df[[i]]
# }
# name
# value
# 这里不合适，试一下就明白了

mean(df$a)
mean(df$b)
mean(df$c)
mean(df$d)

for (i in names(result)){
    print(i)
    result[i] <- mean(df[[i]])# 再次强调用双方括号
}

result

[1] "a"
[1] "b"
[1] "c"
[1] "d"


In [11]:
## 未知的输出长度
# sample(100,1)
# rnorm(sample(100,1), 0)
# mean(rnorm(sample(100,1), 0))

# 例如
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
    n <- sample(100, 1)
    output <- c(output, rnorm(n, means[[i]]))
}
str(output)

# 这种方式很低效，如果元素数量多，将会非常耗时

# 将结果保存到列表中的解决方法
out <- vector("list", length(means))
for (i in seq_along(means)) {
    n <- sample(100, 1)
    out[[i]] <- rnorm(n, means[[i]])
}
str(out)
str(unlist(out))# 列表转向量
str(flatten_dbl(out))# 来自purrr包的列表转向量函数

# 只要遇到类似情况，就应该使用一个更复杂的对象来保存每次迭代的结果，
# 最后再一次性组合起来。

# 未知的序列长度
# 此时用while循环

# while (condition) {   
#     # 循环体
# }

# 使用while循环找出了连续3次掷出正面向上的硬币所需的投掷次数

flip <- function() sample(c("T", "H"), 1)

flips <- 0
nheads <- 0

while(nheads < 3) {
    if (flip() == "H") {
        nheads <- nheads + 1
    } else {
        nheads <- 0
    }
    flips <- flips + 1
}

flips

# while循环用的少，当不知道迭代次数的时候用

 num [1:110] -1.446 -0.865 0.409 -0.86 -1.618 ...
List of 3
 $ : num [1:82] -1.485 1.063 -1.168 0.356 -1.372 ...
 $ : num [1:72] 0.917 2.902 1.368 1.292 0.176 ...
 $ : num [1:95] 3.75 2.41 1.47 2.52 1.45 ...
 num [1:249] -1.485 1.063 -1.168 0.356 -1.372 ...
 num [1:249] -1.485 1.063 -1.168 0.356 -1.372 ...


## 练习

(1) 假设一个目录中全是你想要读入的CSV文件。你已经将这些文件的路径保存在向量files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)中，现在想要使用read_csv()函数来读取每个文件。编写一个for循环将这些文件加载到一个数据框中。

In [12]:
files <- dir("abc_data/", pattern = "\\.csv$", full.names = TRUE)
out <-vector("list", length(files))

for (i in files) {
    print(i)
    nm <- str_split(i, "/", simplify = T)[1,3]
    out[[nm]] <- read_csv(file = i, col_names = T, quote = "")

}# 利用一下字符串拆分名一下名
out
bind_rows(out)
bind_cols(out)

[1] "abc_data//abc.csv"


Parsed with column specification:
cols(
  a = [32mcol_double()[39m,
  b = [32mcol_double()[39m,
  c = [32mcol_double()[39m
)


[1] "abc_data//abc2.csv"


Parsed with column specification:
cols(
  a = [32mcol_double()[39m,
  d = [32mcol_double()[39m,
  e = [32mcol_double()[39m
)


a,b,c
<dbl>,<dbl>,<dbl>
1,2,4
2,4,16
3,6,28
4,8,40
5,10,52
6,12,64
7,14,76
8,16,88
9,18,100
10,20,112

a,d,e
<dbl>,<dbl>,<dbl>
1,3,4
2,9,8
3,15,12
4,21,16
5,27,20
6,33,24
7,39,28
8,45,32
9,51,36
10,57,40


a,b,c,d,e
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2.0,4.0,,
2,4.0,16.0,,
3,6.0,28.0,,
4,8.0,40.0,,
5,10.0,52.0,,
6,12.0,64.0,,
7,14.0,76.0,,
8,16.0,88.0,,
9,18.0,100.0,,
10,20.0,112.0,,


a,b,c,a1,d,e
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2,4,1,3,4
2,4,16,2,9,8
3,6,28,3,15,12
4,8,40,4,21,16
5,10,52,5,27,20
6,12,64,6,33,24
7,14,76,7,39,28
8,16,88,8,45,32
9,18,100,9,51,36
10,20,112,10,57,40


In [13]:
# 编写一个函数，使其输出一个数据框中所有数值列的均值及名称
show_mean <- function(x){
    for (i in names(x)){
        if(is.numeric(x[[i]])){
        mn <- mean(x[[i]])
#         print(i)
        cat(i, ":" , format(mn, digits = 2, nsmall = 2), "\n", sep = "")
        }# format用于格式化
    }
}
show_mean(iris)
# 目前只能写到这样

# 下面的是网上找到的，对齐了数值
show_mean2 <- function(df, digits = 2) {
  # Get max length of all variable names in the dataset
  maxstr <- max(str_length(names(df)))
  for (nm in names(df)) {
    if (is.numeric(df[[nm]])) {
      cat(
        str_c(str_pad(str_c(nm, ":"), maxstr + 1L, side = "right"),
              format(mean(df[[nm]]), digits = digits, nsmall = digits),
              sep = " "
        ),
        "\n"
      )
    }
  }
}
show_mean2(iris)

Sepal.Length:5.84
Sepal.Width:3.06
Petal.Length:3.76
Petal.Width:1.20
Sepal.Length: 5.84 
Sepal.Width:  3.06 
Petal.Length: 3.76 
Petal.Width:  1.20 


In [14]:
head(mtcars)
trans <-list(  
    disp =function(x) x *0.0163871,   
    am =function(x) { 
        factor(x, labels =c("auto", "manual"))   
    } 
) 
for (var in names(trans)) {   
    mtcars[[var]] <- trans[[var]](mtcars[[var]]) 
}
head(mtcars)

# 对比一下能懂，写不出来

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>
Mazda RX4,21.0,6,2.621936,110,3.9,2.62,16.46,0,manual,4,4
Mazda RX4 Wag,21.0,6,2.621936,110,3.9,2.875,17.02,0,manual,4,4
Datsun 710,22.8,4,1.769807,93,3.85,2.32,18.61,1,manual,4,1
Hornet 4 Drive,21.4,6,4.227872,110,3.08,3.215,19.44,1,auto,3,1
Hornet Sportabout,18.7,8,5.899356,175,3.15,3.44,17.02,0,auto,3,2
Valiant,18.1,6,3.687098,105,2.76,3.46,20.22,1,auto,3,1


## for循环与函数式编程

for循环在R中不像在其他语言中那么重要，因为R是一门函数式编程语言。这意味着可以先将for循环包装在函数中，然后再调用这个函数，而不是直接使用for循环。

In [15]:
# 还是老例子

df <- tibble(
    a = rnorm(10),
    b = rnorm(10),
    c = rnorm(10),
    d = rnorm(10)
)

# 计算均值
output <- vector("double", length(df))
for (i in seq_along(df)) {
    output[[i]] <- mean(df[[i]])
}
output

# 将for循环包装进函数
col_mean <- function(df){
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
        output[[i]] <- mean(df[[i]])
    }
    output
}

col_mean(df)

# 计算中位数
col_median <- function(df){
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
        output[[i]] <- median(df[[i]])
    }
    output
}
col_median(df)

# 计算标准差
col_sd <- function(df){
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
        output[[i]] <- sd(df[[i]])
    }
    output
}
col_sd(df)

# 我们又复制了两次

In [16]:
# 让我们扩展一下我们的函数
col_summary <- function(df, fun){
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
        output[[i]] <- fun(df[[i]])
    }
    output
}# 为函数增加了一个参数，将函数作为参数传入另一个函数，以便实现功能
col_summary(df, median)
col_summary(df, mean)

> 在本章剩余部分，我们将学习和使用purrr包，它提供的函数可以替代很多常见的for循环应用。R基础包中的应用函数族（apply()、lapply()、tapply()等）也可以完成类似的任务，但purrr包中的函数更一致，也更易于学习。

大佬说的

In [17]:
col_summary <- function(df, fun){
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
        if(is.numeric(df[[i]])){
        output[[i]] <- fun(df[[i]])
            }
        }
    output
}# 加个判断不如不加吧

col_summary(iris, mean)

## 映射函数

* map()用于输出列表；
* map_lgl()用于输出逻辑型向量；
* map_int()用于输出整型向量；
* map_dbl()用于输出双精度型向量；
* map_chr()用于输出字符型向量。

映射函数是一种高度抽象，需要花费很长时间才能理解其工作原理

In [18]:
# 看摘要函数的例子
map_dbl(df, mean)
map_dbl(df, median)
# map_dbl(df, sd)# 这个容易报错
map_dbl(iris, mean)

“argument is not numeric or logical: returning NA”

In [19]:
# 使用管道更明显
df %>% map_dbl(mean)

purrr函数都是C语言实现的，速度特别快

第二个参数可以是公式、一个字符向量等

In [20]:
map_dbl(df, mean, trim = 0.5)# 可以有附加参数
z <- list(x = 1:3, y = 4:5)
map_int(z, length)# 可以保留名称

In [21]:
# 快捷方式

# 按照气缸拟合线性模型
models <- mtcars %>% split(.$cyl) %>% 
map(function(df) lm(mpg ~ wt, data = df))

# 可以使用单侧公式简化
models <- mtcars %>% split(.$cyl) %>%
map(~lm(mpg ~ wt, data = .))
# models
    
# 提取R^2

models %>% map(summary) %>% map_dbl(~ .$r.squared)

# 甚至可以使用字符串
models %>% map(summary) %>% map_dbl("r.squared")
    
# 还可以使用整数按照位置来选取元素：
x <-list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9)) 
x %>%map_dbl(2) 

In [22]:
### R基础包
# R基础包中也有应用函数族，像是lapply(), sapply(), vapply()

# sapply()函数是对lapply()的包装，可以自动简化输出。
# 这对交互工作是有用的，但作为函数则是有问题的，
# 因为你不知道会得到什么样的输出
x1 <-list( 
    c(0.27, 0.37, 0.57, 0.91, 0.20), 
    c(0.90, 0.94, 0.66, 0.63, 0.06),
    c(0.21, 0.18, 0.69, 0.38, 0.77) 
) 
x2 <-list( 
    c(0.50, 0.72, 0.99, 0.38, 0.78), 
    c(0.93, 0.21, 0.65, 0.13, 0.27), 
    c(0.39, 0.01, 0.38, 0.87, 0.34) 
) 
threshold <-function(x, cutoff =0.8) x[x > cutoff] 
x1 %>%sapply(threshold) %>%str() # 返回列表
x2 %>%sapply(threshold) %>%str()# 返回num

List of 3
 $ : num 0.91
 $ : num [1:2] 0.9 0.94
 $ : num(0) 
 num [1:3] 0.99 0.93 0.87


vapply()函数是sapply()的一种安全替代方式，因为前者可以提供额外的参数来定义类型。vapply()的唯一缺点是输入量较大：vapply(df,  is.numeric,  logical(1))等价于map_lgl(df,  is.numeric)。

In [23]:
# a.计算mtcars数据集中每列的均值
# head(mtcars)
map_dbl(mtcars, mean)
# b.确定nycflights13::flights数据集中每列的类型。
library(nycflights13)
# flights %>% str()
flights %>% map(class)
# c.计算iris数据集中每列唯一值的数量。
iris %>% map_int(~length(unique(.)))
# d.分别使用μ= -10、0、10和100的正态分布生成10个随机数。
mu <- list(-10, 0, 10, 100)
mu %>% map(~rnorm(n = 10, mean = .))

“argument is not numeric or logical: returning NA”

In [24]:
# 如果在非列表向量上使用映射函数，那么会发生什么情况？
# map(1:5,  runif)的作用是什么？为什么？
map(1:5, runif)# 作为参数输入

In [25]:
# map(-2:2, rnorm, n = 5)的作用是什么？为什么？
# map_dbl(-2:2, rnorm, n = 5)的作用又是什么？为什么？
map(-2:2, rnorm, n = 5)# 作为第二个参数导入
# map_dbl(-2:2, rnorm, n = 5)# 报错

## 对操作失败的处理

safely(),transpose(),possibly(),quietly()函数

这四个函数对失败的处理不同，书上写的很明白

## 多参数映射

In [26]:
# 模拟均值不同的随机正态分布
mu <- list(5, 10, -3)
mu %>% map(rnorm, n = 5) %>% str()

# 如果还想让标准差也不同呢？

# 一种方法是用索引进行迭代
sigma <- list(1, 5, 10)
seq_along(mu) %>% map(~rnorm(5, mu[[.]], sigma[[.]])) %>% str()

# 另一种方法是使用map2函数，它对两个向量进行同步迭代
map2(mu, sigma, rnorm, n = 5) %>% str()

# 如果是样本数量也不同呢？
# pmap()函数可以将一个列表作为参数
n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>% pmap(rnorm) %>% str()

# 这样没有命名容易出错，且可读性差

# 命名后的代码如下
args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>% pmap(rnorm) %>%  str()

List of 3
 $ : num [1:5] 5.57 4.53 4.08 5.19 5.11
 $ : num [1:5] 10.53 8.59 10.27 9.51 10.19
 $ : num [1:5] -2.1 -4.26 -3.36 -2.24 -3.1
List of 3
 $ : num [1:5] 4.85 4.91 2.89 5.02 4.64
 $ : num [1:5] 6.49 9.5 9.03 7.81 16.93
 $ : num [1:5] 0.712 -8.34 -21.278 7.161 -22.743
List of 3
 $ : num [1:5] 4.71 3.81 5.35 6.84 5.24
 $ : num [1:5] 14.5 14 9.4 11.4 10.2
 $ : num [1:5] 2.71 -5.11 -8.06 -3.08 -9.77
List of 3
 $ : num 4.29
 $ : num [1:3] 10.18 6.45 7.7
 $ : num [1:5] -7.16 3.3 13.64 -9.74 13.5
List of 3
 $ : num 3.77
 $ : num [1:3] 16.8 11.1 11.4
 $ : num [1:5] -23.5 -6.68 -2.24 2.79 -20.53


In [27]:
# 因为长度都是相同的，所以可以将各个参数保存在一个数据框中：
params <-tribble( 
    ~mean, ~sd, ~n, 
    5,     1,  1, 
    10,    5,  3, 
    -3,    10,  5
) 
params %>% pmap(rnorm)

In [28]:
# 再复杂点是调用不同函数
f <- c("runif", "rnorm", "rpois") 
param <-list( 
    list(min =-1, max =1), 
    list(sd =5), 
    list(lambda =10) 
)
# 使用invoke_map()函数
invoke_map(f, param, n = 5) %>% str()

# 也可以使用数据框
sim <-tribble( 
    ~f,      ~params, 
    "runif", list(min =-1, max =1), 
    "rnorm", list(sd =5), 
    "rpois", list(lambda =10) ) 

sim %>% mutate(sim = invoke_map(f, params, n = 10))

List of 3
 $ : num [1:5] 0.0959 0.7964 0.469 -0.5055 -0.1469
 $ : num [1:5] -12.79 3.43 -3.89 5.46 -2.46
 $ : int [1:5] 5 5 11 12 6


f,params,sim
<chr>,<list>,<list>
runif,"-1, 1","0.8064592, -0.5800157, -0.8758826, 0.1162058, -0.4481403, -0.5444552, -0.2009330, 0.8922024, 0.6315626, 0.3517507"
rnorm,5,"-6.9614559, 1.7678565, 7.4130429, -4.1046393, 1.2004052, -3.5973553, -3.0707497, -2.4898078, -3.6925922, 0.4619668"
rpois,10,"6, 14, 11, 12, 10, 12, 3, 11, 12, 17"


## 游走函数

使用这个函数的目的是在屏幕上提供输出或者将文件保存到磁盘——重要的是操作过程，而不是返回值。

In [29]:
# 简单例子
x <- list(1, "a", 3)
x %>% walk(print)

[1] 1
[1] "a"
[1] 3


In [30]:
# pwalk保存文件
library(ggplot2)
plots <- mtcars %>% split(.$cyl) %>%
    map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- str_c(names(plots), ".pdf")
# plots
pwalk(list(paths, plots), ggsave, path = tempdir())
# tempdir()函数表示获得临时目录
tempdir()

Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image


## for循环的其他模式

In [31]:
# 预测函数
iris %>% keep(is.factor) %>% str()# keep 保留为T的元素
iris %>% discard(is.factor) %>% str()# discard保留为F的元素

x <- list(1:5, letters, list(10))

x %>% some(is_character)# 部分为真
x %>% every(is_vector)# 全部为真

'data.frame':	150 obs. of  1 variable:
 $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
'data.frame':	150 obs. of  4 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...


In [32]:
# detect()函数可以找出预测值为真的第一个元素，
# detect_index()函数则可以返回这个元素的位置：

x <- sample(10)
x %>% t()
x %>% detect(~ . > 5)
x %>% detect_index(~ . > 5)
# head_while()和tail_while()分别从向量的开头和结尾找出预测值为真的元素：

x %>% head_while(~ . > 5)
x %>% tail_while(~ . > 5)

0,1,2,3,4,5,6,7,8,9
7,10,4,3,2,6,5,1,8,9


In [33]:
# # 归约与累计
# reduce()函数使用一个“二元”函数（即具有两个基本输入的函数），
# 将其不断应用于一个列表，直到最后只剩下一个元素为止。
dfs <- list(   
    age =tibble(name ="John", age =30),   
    sex =tibble(name =c("John", "Mary"), sex =c("M", "F")),   
    trt =tibble(name ="Mary", treatment ="A") ) 
dfs %>% reduce(full_join)# 合并表

vs <- list( 
    c(1, 3, 5, 6, 10), 
    c(1, 2, 3, 7, 8, 10), 
    c(1, 2, 3, 4, 8, 9, 10) 
    )
vs %>% reduce(intersect)# 找交集

# 累计函数与归约函数很相似，但前者会保留所有中间结果。你可以使用它来实现累计求和：

x <- sample(10)
x %>% accumulate(`+`) %>% t()

Joining, by = "name"
Joining, by = "name"


name,age,sex,treatment
<chr>,<dbl>,<chr>,<chr>
John,30.0,M,
Mary,,F,A


0,1,2,3,4,5,6,7,8,9
9,16,21,25,33,39,40,50,52,55
