In [1]:
library(tidyverse) 
library(Stat2Data)
data("Hawks")

library(dplyr)  # 为了使用rename函数

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Preview the first six rows to see what the data look like.
head(Hawks, n = 6L)

Unnamed: 0_level_0,Month,Day,Year,CaptureTime,ReleaseTime,BandNumber,Species,Age,Sex,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,WingPitFat,KeelFat,Crop
Unnamed: 0_level_1,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>
1,9,19,1992,13:30,,877-76317,RT,I,,385,920,25.7,30.1,219,,,,,
2,9,22,1992,10:30,,877-76318,RT,I,,376,930,,,221,,,,,
3,9,23,1992,12:45,,877-76319,RT,I,,381,990,26.7,31.3,235,,,,,
4,9,23,1992,10:50,,745-49508,CH,I,F,265,470,18.7,23.5,220,,,,,
5,9,27,1992,11:15,,1253-98801,SS,I,F,205,170,12.5,14.3,157,,,,,
6,9,28,1992,11:25,,1207-55910,RT,I,,412,1090,28.5,32.2,230,,,,,


In [3]:
# 1.1 Select and filter
# Nested-call equivalent of the pipeline below:
# hSF <- select(filter(Hawks, Weight >= 1 & Species == "RT"), Wing, Weight, Tail)
#
# Keep red-tailed hawks ("RT") with a recorded Weight of at least 1, then
# retain only the three measurement columns. Rows whose Weight is NA are
# dropped because filter() discards rows where the condition is not TRUE.
# NOTE(review): Weight values in this data look like grams (e.g. 920, 1090),
# so `>= 1` excludes almost nothing -- confirm the intended threshold.
hSF <- Hawks %>%
  filter(Weight >= 1, Species == "RT") %>%
  select(Wing, Weight, Tail)

In [4]:
# Per-column summary statistics (min, quartiles, median, mean, max) of hSF.
hSF %>% summary()

      Wing           Weight          Tail      
 Min.   : 37.2   Min.   : 101   Min.   :122.0  
 1st Qu.:372.0   1st Qu.: 980   1st Qu.:214.0  
 Median :384.0   Median :1070   Median :221.0  
 Mean   :383.6   Mean   :1094   Mean   :222.1  
 3rd Qu.:399.0   3rd Qu.:1210   3rd Qu.:230.0  
 Max.   :480.0   Max.   :2030   Max.   :288.0  

In [5]:
# Display the filtered table (columns Wing, Weight and Tail).
hSF

Wing,Weight,Tail
<dbl>,<int>,<int>
385,920,219
376,930,221
381,990,235
412,1090,230
370,960,212
375,855,243
412,1210,210
405,1120,238
393,1010,222
371,1010,217


In [6]:
# 1.2 The arrange function
# Sort the selection by ascending wing length; Wing is the only sort key.
arrange(hSF, Wing)

Wing,Weight,Tail
<dbl>,<int>,<int>
37.2,1180,210
111.0,1340,226
199.0,1290,222
230.0,340,200
233.0,337,196
241.0,1320,235
262.0,1020,200
277.0,940,218
277.0,1500,207
313.0,930,215


In [7]:
# 1.3 Join and rename functions Q1
# Build the lookup table from species code to full species name.
# The codes are written out explicitly so each code is paired with its name by
# construction; deriving them with unique(Hawks$Species) would tie the pairing
# to whichever species happens to appear first in the data.
# The factor reuses the levels of Hawks$Species so the join key keeps its type.
species_code <- factor(c("RT", "CH", "SS"), levels = levels(Hawks$Species))
species_name_full <- c("Red-tailed", "Cooper's", "Sharp-shinned")
# Fail fast if a code is misspelled (factor() would turn it into NA) or if the
# data contain a species that is not listed here.
stopifnot(setequal(species_code, unique(Hawks$Species)))
hawkSpeciesNameCodes <- data.frame(species_code, species_name_full)
print(hawkSpeciesNameCodes)

  species_code species_name_full
1           RT        Red-tailed
2           CH          Cooper's
3           SS     Sharp-shinned


In [8]:
# 1.3 Join and rename functions Q2
# Attach the full species name to every hawk record.
# The key pair is spelled out with join_by(), so the lookup column does not
# have to be renamed to `Species` first, and the join cannot silently pick up
# extra keys if the two tables ever share another column name.
# relationship = "many-to-one" makes dplyr raise an error if a species code is
# listed more than once in the lookup table, which would duplicate hawk rows.
hawksFullName <- Hawks %>%
  left_join(
    hawkSpeciesNameCodes,
    by = join_by(Species == species_code),
    relationship = "many-to-one"
  )

[1m[22mJoining with `by = join_by(Species)`


In [9]:
# 1.3 Join and rename functions Q3
# Discard the abbreviated code column (RT, CH, SS) and let the full-name column
# take over the name `Species`.
# NOTE: this overwrites hawksFullName, so re-running this cell without first
# re-running the join fails: species_name_full no longer exists by then.
hawksFullName <- rename(
  select(hawksFullName, -Species),
  Species = species_name_full
)
hawksFullName

Month,Day,Year,CaptureTime,ReleaseTime,BandNumber,Age,Sex,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,WingPitFat,KeelFat,Crop,Species
<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<chr>
9,19,1992,13:30,,877-76317,I,,385,920,25.7,30.1,219,,,,,,Red-tailed
9,22,1992,10:30,,877-76318,I,,376,930,,,221,,,,,,Red-tailed
9,23,1992,12:45,,877-76319,I,,381,990,26.7,31.3,235,,,,,,Red-tailed
9,23,1992,10:50,,745-49508,I,F,265,470,18.7,23.5,220,,,,,,Cooper's
9,27,1992,11:15,,1253-98801,I,F,205,170,12.5,14.3,157,,,,,,Sharp-shinned
9,28,1992,11:25,,1207-55910,I,,412,1090,28.5,32.2,230,,,,,,Red-tailed
9,28,1992,13:30,,877-76320,I,,370,960,25.3,30.1,212,,,,,,Red-tailed
9,29,1992,11:45,,877-76321,A,,375,855,27.2,30.0,243,,,,,,Red-tailed
9,29,1992,15:35,,877-76322,A,,412,1210,29.3,31.3,210,,,,,,Red-tailed
9,30,1992,13:45,,1207-55911,I,,405,1120,26.0,30.2,238,,,,,,Red-tailed


In [10]:
# First seven records, restricted to species name, wing length and weight.
hawksFullName %>%
  head(7) %>%
  select(Species, Wing, Weight)

Unnamed: 0_level_0,Species,Wing,Weight
Unnamed: 0_level_1,<chr>,<dbl>,<int>
1,Red-tailed,385,920
2,Red-tailed,376,930
3,Red-tailed,381,990
4,Cooper's,265,470
5,Sharp-shinned,205,170
6,Red-tailed,412,1090
7,Red-tailed,370,960


In [11]:
# Display the full table; Species now holds the full species names.
hawksFullName

Month,Day,Year,CaptureTime,ReleaseTime,BandNumber,Age,Sex,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,WingPitFat,KeelFat,Crop,Species
<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<chr>
9,19,1992,13:30,,877-76317,I,,385,920,25.7,30.1,219,,,,,,Red-tailed
9,22,1992,10:30,,877-76318,I,,376,930,,,221,,,,,,Red-tailed
9,23,1992,12:45,,877-76319,I,,381,990,26.7,31.3,235,,,,,,Red-tailed
9,23,1992,10:50,,745-49508,I,F,265,470,18.7,23.5,220,,,,,,Cooper's
9,27,1992,11:15,,1253-98801,I,F,205,170,12.5,14.3,157,,,,,,Sharp-shinned
9,28,1992,11:25,,1207-55910,I,,412,1090,28.5,32.2,230,,,,,,Red-tailed
9,28,1992,13:30,,877-76320,I,,370,960,25.3,30.1,212,,,,,,Red-tailed
9,29,1992,11:45,,877-76321,A,,375,855,27.2,30.0,243,,,,,,Red-tailed
9,29,1992,15:35,,877-76322,A,,412,1210,29.3,31.3,210,,,,,,Red-tailed
9,30,1992,13:45,,1207-55911,I,,405,1120,26.0,30.2,238,,,,,,Red-tailed


In [12]:
# 1.4 The mutate function Q1
# Body-mass-index style ratio per bird: 1000 * Weight / Wing^2, listed from
# largest to smallest. This starts from the raw Hawks table rather than
# hawksFullName, so Species holds the short codes.
hawksWithBMI <- arrange(
  select(
    mutate(Hawks, bird_BMI = 1000 * Weight / Wing^2),
    Species, bird_BMI
  ),
  desc(bird_BMI)
)
hawksWithBMI

Species,bird_BMI
<fct>,<dbl>
RT,852.699734
RT,108.757406
RT,32.574935
RT,22.726881
CH,22.408179
RT,19.549323
CH,15.219976
RT,14.859274
CH,14.763552
RT,14.669326


In [13]:
# Incorrect attempt at 1.5 Summarize and group-by functions Q1
# hawksFullName %>% 
#     select(Wing, Tail) %>%
#     summary(num_rows = n(),  # summary() and summarize() are two different functions
#         mn_wing = mean(Wing, na.rm = TRUE),
#         nd_wing = median(Wing, na.rm = TRUE),
#         t_mn_wing = mean(Wing, trim = 0.1, na.rm = TRUE),
#         b_wt_ratio = max(Wing / Tail, na.rm = TRUE))

In [14]:
# 1.5 Summarize and group-by functions Q1
# One row per species with: number of records, mean wing length, median wing
# length, 10%-trimmed mean wing length, and the largest wing-to-tail ratio.
# Missing values are ignored in every statistic.
hawksFullName %>%
  group_by(Species) %>%
  summarize(
    num_rows = n(),
    mn_wing = mean(Wing, na.rm = TRUE),
    nd_wing = median(Wing, na.rm = TRUE),
    t_mn_wing = mean(Wing, trim = 0.1, na.rm = TRUE),
    b_wt_ratio = max(Wing / Tail, na.rm = TRUE)
  )

Species,num_rows,mn_wing,nd_wing,t_mn_wing,b_wt_ratio
<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
Cooper's,70,244.1449,240,243.1754,1.668142
Red-tailed,577,383.3036,384,385.1123,3.155738
Sharp-shinned,261,184.9464,191,183.8278,1.674208


In [15]:
# 1.5 Summarize and group-by functions Q2 (verbose version)
# Count the missing values in each measurement column, separately per species.
# Every column is written out by hand; the next cell does the same with
# across().
NA_summary <- hawksFullName %>%
  group_by(Species) %>%
  summarize(
    Wing = sum(is.na(Wing)),
    Weight = sum(is.na(Weight)),
    Culmen = sum(is.na(Culmen)),
    Hallux = sum(is.na(Hallux)),
    Tail = sum(is.na(Tail)),
    StandardTail = sum(is.na(StandardTail)),
    Tarsus = sum(is.na(Tarsus)),
    Crop = sum(is.na(Crop))
  )
NA_summary

Species,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,Crop
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Cooper's,1,0,0,0,0,19,62,21
Red-tailed,0,5,4,3,0,250,538,254
Sharp-shinned,0,5,3,3,0,68,233,68


In [16]:
# 1.5 Summarize and group-by functions Q2 (concise version)
# Same per-species NA counts as the verbose version, but a single anonymous
# function is applied to every selected non-grouping column through across().
NA_summary_pro <- hawksFullName %>%
  select(
    Wing, Weight, Culmen, Hallux, Tail, StandardTail, Tarsus, Crop, Species
  ) %>%
  group_by(Species) %>%
  summarize(
    across(.cols = everything(), .fns = function(column) sum(is.na(column)))
  )

In [17]:
# Display the per-species NA counts computed with across().
NA_summary_pro

Species,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,Crop
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Cooper's,1,0,0,0,0,19,62,21
Red-tailed,0,5,4,3,0,250,538,254
Sharp-shinned,0,5,3,3,0,68,233,68
