Skip to content

Commit

Permalink
format publications and update dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
GuangchuangYu committed Jun 21, 2022
2 parents e4253b2 + 5a5e041 commit 03a9394
Show file tree
Hide file tree
Showing 12 changed files with 183 additions and 96 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: scholar
Type: Package
Title: Analyse Citation Data from Google Scholar
Version: 0.2.2.991
Version: 0.2.3
Authors@R: c(
person("Guangchuang", "Yu", email = "guangchuangyu@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6485-8781")),
person("James", "Keirstead", email = "james.keirstead@gmail.com", role = "aut"),
Expand All @@ -16,11 +16,12 @@ Description: Provides functions to extract citation data from Google
Scholar. Convenience functions are also provided for comparing
multiple scholars and predicting future h-index values.
Depends:
R (>= 3.4.0)
R (>= 3.5.0)
Imports:
R.cache,
dplyr,
httr,
rlang,
rvest,
stringr,
xml2,
Expand All @@ -38,6 +39,6 @@ VignetteBuilder: knitr
License: MIT + file LICENSE
URL: https://github.com/YuLab-SMU/scholar
BugReports: https://github.com/YuLab-SMU/scholar/issues
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
Encoding: UTF-8
Language: en-GB
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ export(get_article_cite_history)
export(get_citation_history)
export(get_coauthors)
export(get_complete_authors)
export(get_impactfactor)
export(get_journalrank)
export(get_num_articles)
export(get_num_distinct_journals)
Expand All @@ -31,6 +30,7 @@ importFrom(dplyr,row_number)
importFrom(dplyr,summarize)
importFrom(httr,GET)
importFrom(httr,content)
importFrom(rlang,.data)
importFrom(rvest,html_attr)
importFrom(rvest,html_children)
importFrom(rvest,html_nodes)
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# scholar 0.2.2.991
# scholar 0.2.3

+ `format_publications` to format publication list (2022-06-21, Tue)
- <https://github.com/jkeirstead/scholar/issues/110>
+ update journal ranking data
+ remove `get_impactfactor`
+ fixed when some years contain 0 cites (@jefferis, #101)
+ update documents (@jefferis, #100)

Expand Down
12 changes: 8 additions & 4 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
##
## REMOVE
##
##
## #' Impact Factor (2017)
## #'
## #' Downloaded from https://www.researchgate.net/post/New_Impact_factors_2017_for_Journals_are_released_now
Expand Down Expand Up @@ -28,14 +32,14 @@
## "impactfactor"


# Journal metrics (2017)
# Journal metrics (2021)
#
# Downloaded from https://www.scimagojr.com/journalrank.php
#
# Rename (remove whitespaces) to scimagojr2017.csv
# Rename (remove whitespaces) to scimagojr2021.csv
#
# library(tidyverse)
# journalrankings <- read.csv2("scimagojr2017.csv") %>%
# journalrankings <- read.csv2("scimagojr2021.csv") %>%
# dplyr::rename(Journal = Title)
# #
# save(journalrankings, file="journalrankings.rda", compress="xz")
# save(journalrankings, file="R/sysdata.rda", compress='xz')
47 changes: 47 additions & 0 deletions R/format_publications.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
##' Format publication list
##'
##' Retrieves the publications of a scholar with \code{get_publications()}
##' and renders each record as a single markdown-flavoured reference of the
##' form \code{Authors (year). Title. *Journal*. Number}, ordered by
##' decreasing year.
##'
##' @title format_publications
##' @param scholar.profile scholar profile ID
##' @param author.name name of author to be highlighted using bold font.
##'   The name is rewritten with the same "Surname, Initials." rule that is
##'   applied to every entry of the author list, so it has to be supplied in
##'   the same form as it appears in that list. \code{NULL} (the default)
##'   disables highlighting.
##' @examples
##' \dontrun{
##' library(scholar)
##' format_publications("DO5oG40AAAAJ")
##' }
##' @return a character vector of formatted publications
##' @importFrom rlang .data
##' @export
##' @author R Thériault and modified by Guangchuang Yu
format_publications <- function(scholar.profile, author.name = NULL) {
  pubs <- get_publications(scholar.profile)

  ## Split the comma-separated author string of each publication.
  ## The column is read directly from the data frame: the `.data` pronoun
  ## only works inside a dplyr data mask and fails in a plain strsplit() call.
  author_list <- strsplit(as.character(pubs$author), split = ",")

  ## One formatted author string per publication ("A, B, & C").
  pubs$author <- vapply(author_list, function(x) {
    if (length(x) == 0L) {
      return("")
    }
    x <- swap_initials(x)
    ## Only prefix the last author with "& " when there are several authors.
    if (length(x) > 1L) {
      x[length(x)] <- paste0("& ", x[length(x)])
    }
    paste0(x, collapse = ", ")
  }, character(1))

  highlight <- swap_initials(author.name)

  res <- pubs %>%
    arrange(desc(.data$year)) %>%
    mutate(
      journal = paste0("*", .data$journal, "*"),
      Publications = paste0(
        .data$author, " (", .data$year, "). ",
        .data$title, ". ", .data$journal, ". ",
        .data$number
      )
    ) %>%
    pull(.data$Publications)

  if (is.null(highlight)) {
    return(res)
  }

  ## fixed = TRUE: the name contains "." (and possibly other regex
  ## metacharacters) and must be matched literally.
  gsub(highlight, paste0("**", highlight, "**"), res, fixed = TRUE)
}


## Internal helper: rewrite names given as "Initials Surname" into
## "Surname, Initials.". Surrounding whitespace is trimmed first; entries
## that contain no space are returned unchanged (apart from trimming).
## `NULL` input yields `NULL`.
swap_initials <- function(author.name) {
  if (is.null(author.name)) {
    return(NULL)
  }
  cleaned <- trimws(author.name)
  sub(pattern = "(.*) (.*)", replacement = "\\2, \\1.", x = cleaned)
}

55 changes: 28 additions & 27 deletions R/publications.r
Original file line number Diff line number Diff line change
Expand Up @@ -224,33 +224,34 @@ get_oldest_article <- function(id) {



##' Get journal metrics.
##'
##' Get journal metrics (impact factor) for a journal list.
##'
##' @examples
##' \dontrun{
##' library(scholar)
##'
##' id <- get_publications("bg0BZ-QAAAAJ&hl")
##' impact <- get_impactfactor(journals=id$journal, max.distance = 0.1)
##'
##' id <- cbind(id, impact)
##'}
##' @param journals a character list giving the journal list
##' @param max.distance maximum distance allowed for a match between journal and journal list.
##' Expressed either as integer, or as a fraction of the pattern length times the maximal transformation cost
##' (will be replaced by the smallest integer not less than the corresponding fraction), or a list with possible components
##'
##' @return Journal metrics data.
##'
##' @import dplyr
##' @export
##' @author Dominique Makowski and Guangchuang Yu
get_impactfactor <- function(journals, max.distance = 0.05) {
message("The impact factor data is out-of-date and we may remove this function in future release.")
get_journal_stats(journals, max.distance, impactfactor)
}

# ##' Get journal metrics.
# ##'
# ##' Get journal metrics (impact factor) for a journal list.
# ##'
# ##' @examples
# ##' \dontrun{
# ##' library(scholar)
# ##'
# ##' id <- get_publications("bg0BZ-QAAAAJ&hl")
# ##' impact <- get_impactfactor(journals=id$journal, max.distance = 0.1)
# ##'
# ##' id <- cbind(id, impact)
# ##'}
# ##' @param journals a character list giving the journal list
# ##' @param max.distance maximum distance allowed for a match between journal and journal list.
# ##' Expressed either as integer, or as a fraction of the pattern length times the maximal transformation cost
# ##' (will be replaced by the smallest integer not less than the corresponding fraction), or a list with possible components
# ##'
# ##' @return Journal metrics data.
# ##'
# ##' @import dplyr
# ##' @export
# ##' @author Dominique Makowski and Guangchuang Yu
# get_impactfactor <- function(journals, max.distance = 0.05) {
# message("The impact factor data is out-of-date and we may remove this function in future release.")
# get_journal_stats(journals, max.distance, impactfactor)
# }


get_journal_stats <- function(journals, max.distance, source_data, col = "Journal") {
Expand Down
66 changes: 47 additions & 19 deletions R/scholar.r
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,27 @@ utils::globalVariables(c("name"))
##' Gets profile information for a researcher from Google Scholar.
##' Each scholar profile page gives the researcher's name,
##' affiliation, their homepage (if specified), and a summary of their
##' key citation and impact metrics. The scholar ID can be found by
##' searching Google Scholar at \url{http://scholar.google.com}.
##' key citation and publication availability metrics. The scholar
##' ID can be found by searching Google Scholar at
##' \url{http://scholar.google.com}.
##'
##' @param id a character string specifying the Google Scholar ID.
##' If multiple ids are specified, only the first value is used and a
##' warning is generated. See the example below for how to profile
##' warning is generated. See the example below for how to profile
##' multiple scholars.
##'
##' @return a list containing the scholar's name, affiliation,
##' citations, impact metrics, research interests, homepage and
##' the author's list of coauthors provided by Google Scholar.
##' citations, impact and publication availability metrics,
##' research interests, homepage and coauthors.
##'
##' Metrics include:
##' \itemize{
##' \item {total_cites} {combined citations to all publications}
##' \item {h_index} {the largest number h such that h publications each have at least h citations}
##' \item {i10_index} {the number of publications that each have at least 10 citations}
##' \item {available} {the number of publications that have a version online that can be read for free (though not necessarily reusable under an open access license)}
##' \item {not_available} {the number of publications only available behind a paywall}
##' }
##'
##' @examples {
##' ## Gets profiles of some famous physicists
Expand All @@ -41,6 +51,7 @@ get_profile <- function(id) {

page <- page %>% read_html()
tables <- page %>% html_table()


## The citation stats are in tables[[1]]$tables$stats
## but the number of rows seems to vary by OS
Expand All @@ -49,28 +60,45 @@ get_profile <- function(id) {

## The personal info is in
name <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_in']") %>% html_text()
bio_info <- page %>% html_nodes(xpath="//*/div[@class='gsc_prf_il']") %>% html_text()
interests <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_int']") %>% html_children() %>% html_text()
affiliation <- bio_info[1]
bio_info <- page %>% html_nodes(xpath = "//*/div[@class='gsc_prf_il']")
affiliation <- html_text(bio_info)[1]

## Specialities (trim out HTML non-breaking space)
specs <- iconv(bio_info[2], from="UTF8", to="ASCII")
specs <- str_trim(tolower(str_split(specs, ",")[[1]]))
## Specialities (leave capitalisation as is)
specs <- html_nodes(bio_info[3],".gsc_prf_inta") %>% html_text()
specs <- str_trim(iconv(specs, from = "UTF8", to = "ASCII"))

## Extract the homepage
homepage <- page %>% html_nodes(xpath="//*/div[@id='gsc_prf_ivh']//a/@href") %>% html_text()

## Grab all coauthors
coauthors <- list_coauthors(id, n_coauthors = 20) # maximum availabe in profile

return(list(id=id, name=name, affiliation=affiliation,
total_cites=as.numeric(as.character(stats[rows-2,2])),
h_index=as.numeric(as.character(stats[rows-1,2])),
i10_index=as.numeric(as.character(stats[rows,2])),
fields=specs,
homepage=homepage,
interests=interests,
coauthors=coauthors$coauthors))
## Check 'publicly available' vs 'not publicly available' statistics
## (note: this does not detect open access, only whether a free-to-view version exists)
available <- page %>% html_nodes(xpath = "//*/div[@class='gsc_rsb_m_a']") %>% html_text()
if(!identical(available, character(0))){
available <- as.numeric(str_split(available," ")[[1]][1])
}else{
available <- NA
}
not_available <- page %>% html_nodes(xpath = "//*/div[@class='gsc_rsb_m_na']") %>% html_text()
if(!identical(not_available, character(0))){
not_available <- as.numeric(str_split(not_available," ")[[1]][1])
}else{
not_available <- NA
}

return(list(id = id,
name = name,
affiliation = affiliation,
total_cites = as.numeric(as.character(stats[rows - 2,2])),
h_index = as.numeric(as.character(stats[rows - 1, 2])),
i10_index = as.numeric(as.character(stats[rows, 2])),
fields = specs,
homepage = homepage,
coauthors = coauthors$coauthors,
available = available,
not_available = not_available))
}

##' Get historical citation data for a scholar
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
28 changes: 28 additions & 0 deletions man/format_publications.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 0 additions & 34 deletions man/get_impactfactor.Rd

This file was deleted.

20 changes: 15 additions & 5 deletions man/get_profile.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 03a9394

Please sign in to comment.