format publications and update dataset

YuLab-SMU · Jun 21, 2022 · 03a9394 · 03a9394
2 parents e4253b2 + 5a5e041
commit 03a9394
Show file tree

Hide file tree

Showing 12 changed files with 183 additions and 96 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scholar
 Type: Package
 Title: Analyse Citation Data from Google Scholar
-Version: 0.2.2.991
+Version: 0.2.3
 Authors@R: c(
     person("Guangchuang", "Yu",   email = "guangchuangyu@gmail.com",   role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6485-8781")),
     person("James", "Keirstead",  email = "james.keirstead@gmail.com", role = "aut"),
@@ -16,11 +16,12 @@ Description: Provides functions to extract citation data from Google
     Scholar.  Convenience functions are also provided for comparing
     multiple scholars and predicting future h-index values.
 Depends:
-    R (>= 3.4.0)
+    R (>= 3.5.0)
 Imports:
     R.cache,
     dplyr,
     httr,
+    rlang,
     rvest,
     stringr,
     xml2,
@@ -38,6 +39,6 @@ VignetteBuilder: knitr
 License: MIT + file LICENSE
 URL: https://github.com/YuLab-SMU/scholar
 BugReports: https://github.com/YuLab-SMU/scholar/issues
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Encoding: UTF-8
 Language: en-GB
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,7 +7,6 @@ export(get_article_cite_history)
 export(get_citation_history)
 export(get_coauthors)
 export(get_complete_authors)
-export(get_impactfactor)
 export(get_journalrank)
 export(get_num_articles)
 export(get_num_distinct_journals)
@@ -31,6 +30,7 @@ importFrom(dplyr,row_number)
 importFrom(dplyr,summarize)
 importFrom(httr,GET)
 importFrom(httr,content)
+importFrom(rlang,.data)
 importFrom(rvest,html_attr)
 importFrom(rvest,html_children)
 importFrom(rvest,html_nodes)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
-# scholar 0.2.2.991
+# scholar 0.2.3
 
++ `format_publications` to format publication list (2022-06-21, Tue)
+    - <https://github.com/jkeirstead/scholar/issues/110>
++ update journal ranking data
++ remove `get_impactfactor`
 + fixed when some years contain 0 cites (@jefferis, #101)
 + update documents (@jefferis, #100)
 

diff --git a/R/data.R b/R/data.R
@@ -1,3 +1,7 @@
+##
+## REMOVE
+##
+##
 ## #' Impact Factor (2017)
 ## #'
 ## #' Downloaded from https://www.researchgate.net/post/New_Impact_factors_2017_for_Journals_are_released_now
@@ -28,14 +32,14 @@
 ## "impactfactor"
 
 
-# Journal metrics (2017)
+# Journal metrics (2021)
 # 
 # Downloaded from https://www.scimagojr.com/journalrank.php
 # 
-# Rename (remove whitespaces) to scimagojr2017.csv
+# Rename (remove whitespaces) to scimagojr2021.csv
 # 
 # library(tidyverse)
-# journalrankings <- read.csv2("scimagojr2017.csv") %>%
+# journalrankings <- read.csv2("scimagojr2021.csv") %>%
 #    dplyr::rename(Journal = Title)
 # # 
-# save(journalrankings, file="journalrankings.rda", compress="xz")
+# save(journalrankings, file="R/sysdata.rda", compress='xz')
diff --git a/R/format_publications.R b/R/format_publications.R
@@ -0,0 +1,47 @@
+##' Format publication list
+##'
+##' Retrieves the publications of a scholar with \code{get_publications}
+##' and renders each one as a single Markdown string of the form
+##' \code{authors (year). title. *journal*. number}. The result is
+##' ordered by year, most recent first.
+##'
+##' Every entry of the comma-separated \code{author} field is passed
+##' through \code{swap_initials}, and the last author of a multi-author
+##' publication is prefixed with \code{"& "}.
+##'
+##' @title format_publications
+##' @param scholar.profile scholar profile ID
+##' @param author.name name of author to be highlighted using bold font.
+##' The name goes through the same \code{swap_initials} rewrite as the
+##' author list and is then matched literally, so it has to be spelled
+##' the way it appears in the \code{author} field returned by
+##' \code{get_publications}. If \code{NULL} (the default) nothing is
+##' highlighted.
+##' @examples
+##' \dontrun{
+##'  library(scholar)
+##'  format_publications("DO5oG40AAAAJ")
+##' }
+##' @return a character vector of formatted publications, one element
+##' per publication
+##' @importFrom rlang .data
+##' @export
+##' @author R Thériault and modified by Guangchuang Yu
+format_publications <- function(scholar.profile, author.name = NULL) {
+  pubs <- get_publications(scholar.profile)
+
+  ## The `.data` pronoun is only valid inside a data mask (e.g. within
+  ## mutate()), not in a plain strsplit() call, so index the column
+  ## directly. as.character() keeps strsplit() working if the column is
+  ## not already a character vector.
+  author_lists <- strsplit(as.character(pubs$author), split = ",")
+
+  ## vapply() yields a plain character column rather than a list column.
+  pubs$author <- vapply(author_lists, function(x) {
+    x <- swap_initials(x)
+    n <- length(x)
+    ## Only a multi-author entry gets "& " in front of its last author.
+    if (n > 1) {
+      x[n] <- paste0("& ", x[n])
+    }
+    paste0(x, collapse = ", ")
+  }, character(1))
+
+  formatted <- pubs %>%
+    arrange(desc(.data$year)) %>%
+    mutate(journal = paste0("*", .data$journal, "*"),
+           Publications = paste0(.data$author, " (", .data$year, "). ",
+                                 .data$title, ". ", .data$journal, ". ",
+                                 .data$number))
+  res <- formatted[["Publications"]]
+
+  highlight <- swap_initials(author.name)
+  if (is.null(highlight)) {
+    return(res)
+  }
+
+  ## fixed = TRUE: the rewritten name ends in "." which would otherwise be
+  ## interpreted as a regular-expression wildcard.
+  gsub(highlight, paste0("**", highlight, "**"), res, fixed = TRUE)
+}
+
+
+## Rewrite each name from "<first part> <last word>" to
+## "<last word>, <first part>." after trimming surrounding whitespace.
+## The split happens at the last space in the name; a name without any
+## space is returned trimmed but otherwise unchanged. NULL is passed
+## through as NULL.
+swap_initials <- function(author.name) {
+  if (is.null(author.name)) {
+    return(NULL)
+  }
+  trimmed <- trimws(author.name)
+  sub("(.*) (.*)", "\\2, \\1.", trimmed)
+}
+
diff --git a/R/publications.r b/R/publications.r
@@ -224,33 +224,34 @@ get_oldest_article <- function(id) {
 
 
 
-##' Get journal metrics.
-##'
-##' Get journal metrics (impact factor) for a journal list.
-##'
-##' @examples
-##' \dontrun{
-##' library(scholar)
-##'
-##' id <- get_publications("bg0BZ-QAAAAJ&hl")
-##' impact <- get_impactfactor(journals=id$journal, max.distance = 0.1)
-##'
-##' id <- cbind(id, impact)
-##'}
-##' @param journals a character list giving the journal list
-##' @param max.distance maximum distance allowed for a match between journal and journal list.
-##' Expressed either as integer, or as a fraction of the pattern length times the maximal transformation cost
-##' (will be replaced by the smallest integer not less than the corresponding fraction), or a list with possible components
-##'
-##' @return Journal metrics data.
-##'
-##' @import dplyr
-##' @export
-##' @author Dominique Makowski and Guangchuang Yu
-get_impactfactor <- function(journals, max.distance = 0.05) {
-    message("The impact factor data is out-of-date and we may remove this function in future release.")
-    get_journal_stats(journals, max.distance, impactfactor)
-}
+
+# ##' Get journal metrics.
+# ##'
+# ##' Get journal metrics (impact factor) for a journal list.
+# ##'
+# ##' @examples
+# ##' \dontrun{
+# ##' library(scholar)
+# ##'
+# ##' id <- get_publications("bg0BZ-QAAAAJ&hl")
+# ##' impact <- get_impactfactor(journals=id$journal, max.distance = 0.1)
+# ##'
+# ##' id <- cbind(id, impact)
+# ##'}
+# ##' @param journals a character list giving the journal list
+# ##' @param max.distance maximum distance allowed for a match between journal and journal list.
+# ##' Expressed either as integer, or as a fraction of the pattern length times the maximal transformation cost
+# ##' (will be replaced by the smallest integer not less than the corresponding fraction), or a list with possible components
+# ##'
+# ##' @return Journal metrics data.
+# ##'
+# ##' @import dplyr
+# ##' @export
+# ##' @author Dominique Makowski and Guangchuang Yu
+# get_impactfactor <- function(journals, max.distance = 0.05) {
+#     message("The impact factor data is out-of-date and we may remove this function in future release.")
+#     get_journal_stats(journals, max.distance, impactfactor)
+# }
 
 
 get_journal_stats <- function(journals, max.distance, source_data, col = "Journal") {

diff --git a/R/scholar.r b/R/scholar.r
@@ -8,17 +8,27 @@ utils::globalVariables(c("name"))
 ##' Gets profile information for a researcher from Google Scholar.
 ##' Each scholar profile page gives the researcher's name,
 ##' affiliation, their homepage (if specified), and a summary of their
-##' key citation and impact metrics.  The scholar ID can be found by
-##' searching Google Scholar at \url{http://scholar.google.com}.
+##' key citation and publication availability metrics. The scholar
+##' ID can be found by searching Google Scholar at 
+##' \url{http://scholar.google.com}.
 ##'
 ##' @param id 	a character string specifying the Google Scholar ID.
 ##' If multiple ids are specified, only the first value is used and a
-##' warning is generated.  See the example below for how to profile
+##' warning is generated. See the example below for how to profile
 ##' multiple scholars.
 ##'
 ##' @return 	a list containing the scholar's name, affiliation,
-##' citations, impact metrics, research interests, homepage and
-##' the author's list of coauthors provided by Google Scholar.
+##' citations, impact and publication availability metrics,
+##' research interests, homepage and coauthors.
+##' 
+##' Metrics include:
+##' \itemize{
+##'  \item {total_cites}   {combined citations to all publications}
+##'  \item {h_index}       {the largest number h such that h publications each have at least h citations}
+##'  \item {i10_index}     {the number of publications that each have at least 10 citations}
+##'  \item {available}     {the number of publications that have a version online that can be read for free (though not necessarily reusable under an open access license)}
+##'  \item {not_available} {the number of publications only available behind a paywall}
+##' }
 ##'
 ##' @examples {
 ##'    ## Gets profiles of some famous physicists
@@ -41,6 +51,7 @@ get_profile <- function(id) {
 
     page <- page %>% read_html()
     tables <- page %>% html_table()
+
 
   ## The citation stats are in tables[[1]]$tables$stats
   ## but the number of rows seems to vary by OS
@@ -49,28 +60,45 @@ get_profile <- function(id) {
 
   ## The personal info is in
   name <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_in']") %>% html_text()
-  bio_info <- page %>% html_nodes(xpath="//*/div[@class='gsc_prf_il']") %>% html_text()
-  interests <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_int']") %>% html_children() %>% html_text()
-  affiliation <- bio_info[1]
+  bio_info <- page %>% html_nodes(xpath = "//*/div[@class='gsc_prf_il']")
+  affiliation <- html_text(bio_info)[1]
 
-  ## Specialities (trim out HTML non-breaking space)
-  specs <- iconv(bio_info[2], from="UTF8", to="ASCII")
-  specs <- str_trim(tolower(str_split(specs, ",")[[1]]))
+  ## Specialities (leave capitalisation as is)
+  specs <- html_nodes(bio_info[3],".gsc_prf_inta") %>% html_text()
+  specs <- str_trim(iconv(specs, from = "UTF8", to = "ASCII"))
 
   ## Extract the homepage
   homepage <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_ivh']//a/@href") %>% html_text()
 
   ## Grab all coauthors
   coauthors <- list_coauthors(id, n_coauthors = 20) # maximum availabe in profile
 
-  return(list(id=id, name=name, affiliation=affiliation,
-              total_cites=as.numeric(as.character(stats[rows-2,2])),
-              h_index=as.numeric(as.character(stats[rows-1,2])),
-              i10_index=as.numeric(as.character(stats[rows,2])),
-              fields=specs,
-              homepage=homepage,
-              interests=interests,
-              coauthors=coauthors$coauthors))
+  ## Check 'publicly available' vs 'not publicly available' statistics
+  ## (note: this does not detect open access, only free-to-view availability)
+  available <- page %>% html_nodes(xpath = "//*/div[@class='gsc_rsb_m_a']") %>% html_text()
+  if(!identical(available, character(0))){
+    available <- as.numeric(str_split(available," ")[[1]][1])
+  }else{
+    available <- NA
+  }
+  not_available <- page %>% html_nodes(xpath = "//*/div[@class='gsc_rsb_m_na']") %>% html_text()
+  if(!identical(not_available, character(0))){
+    not_available <- as.numeric(str_split(not_available," ")[[1]][1])  
+  }else{
+    not_available <- NA
+  }
+
+  return(list(id = id,
+              name = name,
+              affiliation = affiliation, 
+              total_cites = as.numeric(as.character(stats[rows - 2,2])),
+              h_index = as.numeric(as.character(stats[rows - 1, 2])),
+              i10_index = as.numeric(as.character(stats[rows, 2])),
+              fields = specs,
+              homepage = homepage,
+              coauthors = coauthors$coauthors,
+              available = available,
+              not_available = not_available))
 }
 
 ##' Get historical citation data for a scholar

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/man/format_publications.Rd b/man/format_publications.Rd
diff --git a/man/get_impactfactor.Rd b/man/get_impactfactor.Rd
diff --git a/man/get_profile.Rd b/man/get_profile.Rd