In [1]:
library(palmerpenguins)
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Display the penguins data set (made available by library(palmerpenguins)).
penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007
Adelie,Torgersen,39.5,17.4,186,3800,female,2007
Adelie,Torgersen,40.3,18.0,195,3250,female,2007
Adelie,Torgersen,,,,,,2007
Adelie,Torgersen,36.7,19.3,193,3450,female,2007
Adelie,Torgersen,39.3,20.6,190,3650,male,2007
Adelie,Torgersen,38.9,17.8,181,3625,female,2007
Adelie,Torgersen,39.2,19.6,195,4675,male,2007
Adelie,Torgersen,34.1,18.1,193,3475,,2007
Adelie,Torgersen,42.0,20.2,190,4250,,2007


In [3]:
# Per-species averages of bill length, flipper length and body mass.
# Missing measurements are ignored (na.rm = TRUE) and every mean is rounded
# to one decimal place.
penguins_summary <- penguins %>%
  group_by(species) %>%
  summarise(
    bill = round(mean(bill_length_mm, na.rm = TRUE), digits = 1),
    flipper = round(mean(flipper_length_mm, na.rm = TRUE), digits = 1),
    weight = round(mean(body_mass_g, na.rm = TRUE), digits = 1)
  )
print(penguins_summary)

[90m# A tibble: 3 × 4[39m
  species    bill flipper weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m
[90m1[39m Adelie     38.8    190   [4m3[24m701.
[90m2[39m Chinstrap  48.8    196.  [4m3[24m733.
[90m3[39m Gentoo     47.5    217.  [4m5[24m076 


In [4]:
# Reshape the wide summary into long (narrow) form.
#   cols      - the measurement columns to stack
#   names_to  - new column that receives the old column names
#   values_to - new column that receives the corresponding values
# (The argument names can feel counter-intuitive at first.)
penguins_summary_narrow <- pivot_longer(
  penguins_summary,
  cols = c(bill, flipper, weight),
  names_to = "property",
  values_to = "value"
)
print(penguins_summary_narrow)

[90m# A tibble: 9 × 3[39m
  species   property  value
  [3m[90m<fct>[39m[23m     [3m[90m<chr>[39m[23m     [3m[90m<dbl>[39m[23m
[90m1[39m Adelie    bill       38.8
[90m2[39m Adelie    flipper   190  
[90m3[39m Adelie    weight   [4m3[24m701. 
[90m4[39m Chinstrap bill       48.8
[90m5[39m Chinstrap flipper   196. 
[90m6[39m Chinstrap weight   [4m3[24m733. 
[90m7[39m Gentoo    bill       47.5
[90m8[39m Gentoo    flipper   217. 
[90m9[39m Gentoo    weight   [4m5[24m076  


In [5]:
# Back to wide form: one column per distinct value of `property`,
# filled from `value`.
penguins_summary_wide <- pivot_wider(
  penguins_summary_narrow,
  names_from = property,
  values_from = value
)
print(penguins_summary_wide)

[90m# A tibble: 3 × 4[39m
  species    bill flipper weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m
[90m1[39m Adelie     38.8    190   [4m3[24m701.
[90m2[39m Chinstrap  48.8    196.  [4m3[24m733.
[90m3[39m Gentoo     47.5    217.  [4m5[24m076 


In [6]:
# Long form again, listing the measurement columns explicitly.
pivot_longer(
  penguins_summary_wide,
  cols = c(bill, flipper, weight),
  names_to = "property",
  values_to = "value"
)

species,property,value
<fct>,<chr>,<dbl>
Adelie,bill,38.8
Adelie,flipper,190.0
Adelie,weight,3700.7
Chinstrap,bill,48.8
Chinstrap,flipper,195.8
Chinstrap,weight,3733.1
Gentoo,bill,47.5
Gentoo,flipper,217.2
Gentoo,weight,5076.0


In [7]:
# Same reshape, selecting columns by exclusion:
# cols = !species stacks every column except species.
pivot_longer(
  penguins_summary_wide,
  cols = !species,
  names_to = "property",
  values_to = "value"
)

species,property,value
<fct>,<chr>,<dbl>
Adelie,bill,38.8
Adelie,flipper,190.0
Adelie,weight,3700.7
Chinstrap,bill,48.8
Chinstrap,flipper,195.8
Chinstrap,weight,3733.1
Gentoo,bill,47.5
Gentoo,flipper,217.2
Gentoo,weight,5076.0


In [8]:
# Print the per-species summary table again.
print(penguins_summary)

[90m# A tibble: 3 × 4[39m
  species    bill flipper weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m
[90m1[39m Adelie     38.8    190   [4m3[24m701.
[90m2[39m Chinstrap  48.8    196.  [4m3[24m733.
[90m3[39m Gentoo     47.5    217.  [4m5[24m076 


In [9]:
# Paste flipper and weight together into one text column,
# flipper_over_weight, with "/" between the two values.
uni_df <- unite(
  penguins_summary,
  col = flipper_over_weight,
  flipper, weight,
  sep = "/"
)
print(uni_df)

[90m# A tibble: 3 × 3[39m
  species    bill flipper_over_weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m              
[90m1[39m Adelie     38.8 190/3700.7         
[90m2[39m Chinstrap  48.8 195.8/3733.1       
[90m3[39m Gentoo     47.5 217.2/5076         


In [10]:
# Split flipper_over_weight back into two columns at "/".
# separate() does not convert types unless asked to, so flipper and weight
# come back as character columns.
sep_df <- separate(
  uni_df,
  col = flipper_over_weight,
  into = c("flipper", "weight"),
  sep = "/"
)
print(sep_df)

[90m# A tibble: 3 × 4[39m
  species    bill flipper weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m 
[90m1[39m Adelie     38.8 190     3700.7
[90m2[39m Chinstrap  48.8 195.8   3733.1
[90m3[39m Gentoo     47.5 217.2   5076  


In [11]:
# With convert = TRUE the split pieces are converted to a suitable type,
# so flipper and weight become numeric again.
sep_df_double <- separate(
  uni_df,
  col = flipper_over_weight,
  into = c("flipper", "weight"),
  sep = "/",
  convert = TRUE
)
print(sep_df_double)

[90m# A tibble: 3 × 4[39m
  species    bill flipper weight
  [3m[90m<fct>[39m[23m     [3m[90m<dbl>[39m[23m   [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m
[90m1[39m Adelie     38.8    190   [4m3[24m701.
[90m2[39m Chinstrap  48.8    196.  [4m3[24m733.
[90m3[39m Gentoo     47.5    217.  [4m5[24m076 


In [12]:
# Full join: keep every musician that appears in either table.
# No `by` is given, so dplyr joins on the shared column and reports which
# one it picked in a message.
musicians <- band_members %>%
  full_join(band_instruments)

[1m[22mJoining with `by = join_by(name)`


In [13]:
# Print the joined table; unmatched entries show up as NA.
print(musicians)

[90m# A tibble: 4 × 3[39m
  name  band    plays 
  [3m[90m<chr>[39m[23m [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m 
[90m1[39m Mick  Stones  [31mNA[39m    
[90m2[39m John  Beatles guitar
[90m3[39m Paul  Beatles bass  
[90m4[39m Keith [31mNA[39m      guitar


In [14]:
# Group by name, then pack the remaining columns of each group into a
# list-column of tibbles called `data` (one row per name).
musicians_nest <- nest(group_by(musicians, name))
print(musicians_nest)

[90m# A tibble: 4 × 2[39m
[90m# Groups:   name [4][39m
  name  data            
  [3m[90m<chr>[39m[23m [3m[90m<list>[39m[23m          
[90m1[39m Mick  [90m<tibble [1 × 2]>[39m
[90m2[39m John  [90m<tibble [1 × 2]>[39m
[90m3[39m Paul  [90m<tibble [1 × 2]>[39m
[90m4[39m Keith [90m<tibble [1 × 2]>[39m


In [15]:
# Show the nested tibble stored for Mick (a list holding one tibble).
print(musicians_nest %>% filter(name == "Mick") %>% pull(data))

[[1]]
[90m# A tibble: 1 × 2[39m
  band   plays
  [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m
[90m1[39m Stones [31mNA[39m   



In [16]:
# Expand the `data` list-column back into ordinary columns.
unnest(musicians_nest, cols = data)

name,band,plays
<chr>,<chr>,<chr>
Mick,Stones,
John,Beatles,guitar
Paul,Beatles,bass
Keith,,guitar


In [17]:
# Test whether a number is divisible by 2 or by 3.
#
# Args:
#   x: numeric vector (a single number works as before).
#
# Returns:
#   Logical vector of the same length as x: TRUE where the element is
#   divisible by 2 or by 3, FALSE otherwise. NA elements yield NA.
is_div_2_3 <- function(x) {
  # The comparison already produces TRUE/FALSE, so no if/else is needed;
  # the elementwise `|` makes the function work on whole vectors too.
  x %% 2 == 0 | x %% 3 == 0
}
# Apply is_div_2_3() to every element of v; map() returns a list.
v <- c(1, 2, 3, 5, 6)
v %>% map(is_div_2_3)

In [18]:
col_name <- "body_mass_g"
df <- penguins

# One-column tibble holding the target column (body_mass_g), printed for
# inspection.
v_col <- select(df, all_of(col_name))
print(v_col)

# Every numeric column of df except the target column itself.
df_num <- df %>%
  select(where(is.numeric)) %>%
  select(-all_of(col_name))

# Correlation between a candidate column x and the target column.
# use = "complete.obs" drops observations that contain missing values.
cor_func <- function(x) {
  cor(x, v_col[[col_name]], use = "complete.obs")
}

# map_dbl() applies cor_func to each column of df_num and returns a named
# numeric vector directly, so no unlist() is needed.
correlations <- map_dbl(df_num, cor_func)
print("the computed correlations are:")
print(correlations)

# which.max() picks the largest absolute correlation (the first one on ties)
# and skips NA entries.
max_abs_cor_var <- names(which.max(abs(correlations)))
cor_val <- as.double(correlations[max_abs_cor_var])

# cat() interprets "\n" as a line break; print() would show it literally.
cat("\ncolumn with maximal correlation:\n")
print(max_abs_cor_var)
print(cor_val)

[90m# A tibble: 344 × 1[39m
   body_mass_g
         [3m[90m<int>[39m[23m
[90m 1[39m        [4m3[24m750
[90m 2[39m        [4m3[24m800
[90m 3[39m        [4m3[24m250
[90m 4[39m          [31mNA[39m
[90m 5[39m        [4m3[24m450
[90m 6[39m        [4m3[24m650
[90m 7[39m        [4m3[24m625
[90m 8[39m        [4m4[24m675
[90m 9[39m        [4m3[24m475
[90m10[39m        [4m4[24m250
[90m# ℹ 334 more rows[39m
[1] "the computed correlations are:"
   bill_length_mm     bill_depth_mm flipper_length_mm              year 
       0.59510982       -0.47191562        0.87120177        0.04220939 
[1] "\ncolumn with maximal correlation:"
[1] "flipper_length_mm"
[1] 0.8712018


In [19]:
# Find the numeric column of `df` whose correlation with the column named
# `col_name` has the largest absolute value.
#
# Args:
#   df:       data frame or tibble.
#   col_name: single string naming a numeric column of `df`.
#
# Returns:
#   A data.frame with columns `var_name` (column name) and `cor` (signed
#   correlation). It has one row, or zero rows when no correlation can be
#   computed (no other numeric column, or all correlations are NA).
#   On exact ties the first column wins.
#
# Errors if `col_name` is not a single string naming a numeric column.
max_cor_var <- function(df, col_name) {
  stopifnot(
    is.character(col_name),
    length(col_name) == 1,
    col_name %in% names(df)
  )
  target <- df[[col_name]]
  stopifnot(is.numeric(target))

  # Candidate columns: every numeric column except the target itself.
  df_num <- df %>%
    select(where(is.numeric)) %>%
    select(-all_of(col_name))

  # use = "complete.obs" drops observations that contain missing values.
  correlations <- map_dbl(
    df_num,
    function(x) cor(x, target, use = "complete.obs")
  )

  # which.max() ignores NA and returns integer(0) when nothing is comparable.
  best <- which.max(abs(correlations))
  if (length(best) == 0) {
    return(data.frame(var_name = character(0), cor = double(0)))
  }

  # as.double() drops the name so it does not end up as a row name.
  data.frame(var_name = names(best), cor = as.double(correlations[best]))
}

max_cor_var(penguins, "body_mass_g")

var_name,cor
<chr>,<dbl>
flipper_length_mm,0.8712018


In [20]:
# Nest the data by species, then run max_cor_var() on each nested tibble.
# `data` is the list-column created by nest(); each call's result is stored
# in the new list-column `max_cor`.
cor_by_group <- penguins %>%
  group_by(species) %>%
  nest() %>%
  mutate(
    max_cor = map(
      data,
      function(species_df) max_cor_var(species_df, "body_mass_g")
    )
  )

print(cor_by_group)

[90m# A tibble: 3 × 3[39m
[90m# Groups:   species [3][39m
  species   data               max_cor     
  [3m[90m<fct>[39m[23m     [3m[90m<list>[39m[23m             [3m[90m<list>[39m[23m      
[90m1[39m Adelie    [90m<tibble [152 × 7]>[39m [90m<df [1 × 2]>[39m
[90m2[39m Gentoo    [90m<tibble [124 × 7]>[39m [90m<df [1 × 2]>[39m
[90m3[39m Chinstrap [90m<tibble [68 × 7]>[39m  [90m<df [1 × 2]>[39m


In [21]:
# Drop the nested raw data and expand max_cor into regular columns.
cor_by_group %>%
  select(-data) %>%
  unnest(cols = max_cor)

species,var_name,cor
<fct>,<chr>,<dbl>
Adelie,bill_depth_mm,0.5761382
Gentoo,bill_depth_mm,0.719085
Chinstrap,flipper_length_mm,0.6415594


In [22]:
# Quarterly returns, written row by row. Two kinds of missing data:
#   - explicit: 2015 Q4 has return = NA
#   - implicit: there is no row at all for 2016 Q1
stocks <- tribble(
  ~year, ~qtr, ~return,
  2015,     1,    1.88,
  2015,     2,    0.59,
  2015,     3,    0.35,
  2015,     4,      NA,
  2016,     2,    0.92,
  2016,     3,    0.17,
  2016,     4,    2.66
)
print(stocks)

[90m# A tibble: 7 × 3[39m
   year   qtr return
  [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m  [3m[90m<dbl>[39m[23m
[90m1[39m  [4m2[24m015     1   1.88
[90m2[39m  [4m2[24m015     2   0.59
[90m3[39m  [4m2[24m015     3   0.35
[90m4[39m  [4m2[24m015     4  [31mNA[39m   
[90m5[39m  [4m2[24m016     2   0.92
[90m6[39m  [4m2[24m016     3   0.17
[90m7[39m  [4m2[24m016     4   2.66


In [23]:
# complete() turns implicit missing data into explicit missing data: it
# expands the table to every combination of year and qtr and fills the
# newly created rows with NA.
stocks %>%
  complete(year, qtr)

year,qtr,return
<dbl>,<dbl>,<dbl>
2015,1,1.88
2015,2,0.59
2015,3,0.35
2015,4,
2016,1,
2016,2,0.92
2016,3,0.17
2016,4,2.66


In [24]:
# One logical value per row of stocks: TRUE when the row has no missing value.
complete.cases(stocks)

In [25]:
# Keep only the rows without missing values, filtering on a logical vector.
stocks %>%
  filter(complete.cases(.))

year,qtr,return
<dbl>,<dbl>,<dbl>
2015,1,1.88
2015,2,0.59
2015,3,0.35
2016,2,0.92
2016,3,0.17
2016,4,2.66


In [26]:
# Replace the missing values of a vector by the mean of its non-missing
# values.
#
# Args:
#   x: numeric (or logical) vector, possibly containing NA.
#
# Returns:
#   A double vector of the same length as x, with names kept. Missing
#   entries are replaced by mean(x, na.rm = TRUE); all other entries are
#   unchanged. If every entry is missing, the result is all NaN.
replace_by_mean <- function(x) {
  stopifnot(is.numeric(x) || is.logical(x))

  # Mean of the observed values only.
  mu <- mean(x, na.rm = TRUE)

  # Vectorised imputation: logical subsetting replaces every NA in one step,
  # so no element-by-element mapping is needed.
  filled <- as.double(x)
  names(filled) <- names(x)  # as.double() drops names; restore them
  filled[is.na(filled)] <- mu
  filled
}

x <- c(1, 2, NA, 4)
replace_by_mean(x)

In [27]:
# mutate() lets us modify a column of the data frame: here the missing
# values of `return` are replaced by the column mean.
stocks %>%
  mutate(return = replace_by_mean(return))

year,qtr,return
<dbl>,<dbl>,<dbl>
2015,1,1.88
2015,2,0.59
2015,3,0.35
2015,4,1.095
2016,2,0.92
2016,3,0.17
2016,4,2.66
