From 1efa80e913006051083544275f6af9b5ab9ccb0a Mon Sep 17 00:00:00 2001
From: Carl Boettiger
Date: Fri, 28 Sep 2012 13:03:20 -0700
Subject: [PATCH] Functions for migrating from OWW to Jekyll

---
 export_oww.R | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
 shortcodes.R |   1 +
 wiki_get.R   |  49 +++++++++++++++++++++
 3 files changed, 167 insertions(+)
 create mode 100644 export_oww.R
 create mode 100644 wiki_get.R

diff --git a/export_oww.R b/export_oww.R
new file mode 100644
index 0000000..0a61fd5
--- /dev/null
+++ b/export_oww.R
@@ -0,0 +1,117 @@
+require(httr)
+require(gsubfn)
+
+#' Export a user's OWW notebook pages as Jekyll markdown entries.
+oww_to_md <- function(pages, user = "Carl_Boettiger", baseurl = "http://openwetware.org"){
+  ## Store filenames (with dates) and categories for each page, for later reference
+  filenames <- get_filenames(pages)
+  categories <- get_categories(pages)
+  ## Use the API to extract the page content
+  parsed <- export_oww(pages, user, baseurl)
+  ## Add the baseurl back into all relative links
+  htmls <- add_baseurl(parsed)
+  ## Create the YAML header text
+  headers <- add_header_txt(parsed, categories)
+  ## Use pandoc to convert the html to markdown
+  mds <- html_to_md(htmls, filenames)
+  ## Delete the OWW header info we don't want in the markdown
+  mds <- clean_md(mds)
+  ## Prepend headers to the markdown files and name them according to filenames
+  write_mds(mds, headers, filenames)
+}
+
+#' Convert page titles ("Notebook/YYYY-MM-DD") into Jekyll filenames,
+#' moving the date to the front ("YYYY-MM-DD-Notebook").
+get_filenames <- function(pages){
+  lapply(pages, function(p){
+    p <- gsub(" ", "_", p)
+    filename <- gsub("/", "-", p)
+    gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename)
+  })
+}
+
+#' Extract the notebook name from each page title, to use as the Jekyll category.
+get_categories <- function(pages){
+  lapply(pages, function(p){
+    p <- gsub("_", " ", p)
+    filename <- gsub("/", "-", p)
+    gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename)
+  })
+}
+
+#' Take a list of page titles in a user's OWW notebook and fetch their parsed content.
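+#' @param pages a character vector of page titles, relative to the user's notebook
+#' @param user the OWW username owning the notebook
+#' @param baseurl the base url of the wiki
+#' @examples \dontrun{
+#' ## A sketch only: the page title below is illustrative, not taken from this repo
+#' parsed <- export_oww("Stochastic Population Dynamics/2010-06-14")
+#' }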
+export_oww <- function(pages, user = "Carl_Boettiger", baseurl = "http://openwetware.org"){
+  lapply(pages, function(p){
+    p <- gsub(" ", "_", p)
+    page <- paste("User:", user, "/Notebook/", p, sep = "")
+    wiki_parse(page, baseurl = baseurl)
+  })
+}
+
+#' Restore absolute urls in image sources and wiki links, which OWW serves
+#' relative to its own domain.
+add_baseurl <- function(parsed, baseurl = "http://openwetware.org"){
+  lapply(parsed, function(out){
+    html <- out$parse$text[[1]]
+    html <- gsub("src=\"/images/", paste("src=\"", baseurl, "/images/", sep = ""),
+                 html, fixed = TRUE)
+    gsub("href=\"/wiki/", paste("href=\"", baseurl, "/wiki/", sep = ""),
+         html, fixed = TRUE)
+  })
+}
+
+#' Build the YAML front-matter for each page from its OWW categories (used as
+#' tags) and its notebook name (used as the Jekyll category).
+add_header_txt <- function(parsed, mycategories){
+  lapply(seq_along(parsed), function(i){
+    out <- parsed[[i]]
+    category <- mycategories[[i]]
+    if(length(out$parse$categories) > 0){
+      categories <- sapply(out$parse$categories, function(x) x$`*`)
+      categories <- gsub("_", " ", categories)
+      ## Standardize some of my tags, called "categories" on OWW
+      tags <- paste("tags: ", "[", paste0(categories, collapse = ", "), "]", sep = "")
+    } else {
+      tags <- ""
+    }
+    c("---",
+      "layout: post",
+      tags,
+      paste("categories: ", category),
+      "---\n\n")
+  })
+}
+
+#' Write each page to an html file and shell out to pandoc to convert it to markdown.
+html_to_md <- function(htmls, filenames){
+  lapply(seq_along(htmls), function(i){
+    writeLines(htmls[[i]], paste(filenames[[i]], ".html", sep = ""))
+    system(paste("pandoc ", filenames[[i]], ".html -w markdown", sep = ""), intern = TRUE)
+  })
+}
+
+#' Strip the OWW notebook chrome (icons, navigation links, notebook titles)
+#' that we don't want in the markdown.
+clean_md <- function(mds){
+  lapply(mds, function(md){
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)", "", md)
+    md <- gsub("Main project", "", md)
+    md <- gsub("page\\\\", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)", "", md)
+    md <- gsub("Previous$", "", md)
+    md <- gsub("^entry Next$", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md)
+    md <- gsub("^entry$", "", md)
+    md <- gsub("^Stochastic Population Dynamics", "", md)
+    md <- gsub("^Comparative Phylogenetics$", "", md)
+    gsub("^Teaching$", "", md)
+  })
+}
+
+#' Prepend each YAML header to its markdown content and write the .markdown files.
+write_mds <- function(mds, headers, filenames){
+  lapply(seq_along(mds), function(i){
+    writeLines(c(headers[[i]], mds[[i]]), paste(filenames[[i]], ".markdown", sep = ""))
+  })
+}
diff --git a/shortcodes.R b/shortcodes.R
index 5ee9e0b..fc7a7e2 100644
--- a/shortcodes.R
+++ b/shortcodes.R
@@ -85,6 +85,7 @@ fix_flickr <- function(files){
   lapply(files, function(file){
     content <- readLines(file)
+    content <- gsubfn("\\[flickr.*?\\]http://www.flickr.com\\/photos\\/cboettig\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
     content <- gsubfn("\\[flickr.*?\\]http://flickr.com\\/photos\\/46456847@N08\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
     content <- gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url, content)
     writeLines(content, file)
diff --git a/wiki_get.R b/wiki_get.R
new file mode 100644
index 0000000..f0897da
--- /dev/null
+++ b/wiki_get.R
@@ -0,0 +1,49 @@
+#' Place a GET call to the mediawiki API using the query action
+#' @param titles the title of the page to be grabbed
+#' @param baseurl the base url of the wiki, e.g. "http://en.wikipedia.org/w"
+#' @param format the desired format for the output
+#' @param prop the page property to request; defaults to "revisions" (most recent first)
+#' @param rvprop what should be returned from the revision; defaults to "content".
+#'   See Details. Can include more than one of the options.
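+#' @param ... further arguments passed as config options to httr::GET, such as
+#'   additional add_headers() values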
+#' @return the contents of a mediawiki page in the requested format
+#' @import httr gsubfn
+#' @details
+#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content
+#' Parameters:
+#'   rvprop - Which properties to get for each revision:
+#'     ids - The ID of the revision
+#'     flags - Revision flags (minor)
+#'     timestamp - The timestamp of the revision
+#'     user - User that made the revision
+#'     userid - User id of revision creator
+#'     size - Length (bytes) of the revision
+#'     sha1 - SHA-1 (base 16) of the revision
+#'     comment - Comment by the user for the revision
+#'     parsedcomment - Parsed comment by the user for the revision
+#'     content - Text of the revision
+#'     tags - Tags for the revision
+#'   Values (separate with '|'): ids, flags, timestamp, user, userid, size,
+#'     sha1, comment, parsedcomment, content, tags, flagged
+#'   Default: ids|timestamp|flags|comment|user
+#'
+#' @references See http://www.mediawiki.org/wiki/API:Main_page for an introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API
+#' @export
+wiki_get <- function(titles, baseurl = "http://en.wikipedia.org/w", format = c("xml", "json"),
+                     prop = c("revisions"), rvprop = c("content", "tags"), ...){
+  format <- match.arg(format)
+  ## MediaWiki treats underscores as spaces, and bare spaces are not url-safe
+  titles <- gsub(" ", "_", titles)
+  if(length(rvprop) > 1)
+    rvprop <- paste0(rvprop, collapse = "|")
+  action <- "query"
+  addr <- paste(baseurl, "/api.php?format=", format, "&action=", action,
+                "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep = "")
+  config <- c(add_headers("User-Agent" = "rwiki"), ...)
+  GET(addr, config = config)
+}
+
+#' Place a GET call to the mediawiki API using the parse action, returning
+#' the parsed (rendered) content of the page.
+wiki_parse <- function(page, baseurl, format = "json", ...){
+  require(httr)
+  action <- "parse"
+  addr <- paste(baseurl, "/api.php?format=", format, "&action=", action,
+                "&page=", page, sep = "")
+  config <- c(add_headers("User-Agent" = "rwiki"), ...)
+  out <- GET(addr, config = config)
+  parsed_content(out)
+}
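+
+## A minimal usage sketch (not run): the page title and wiki below are
+## illustrative, following the example url in the documentation above.
+# raw  <- wiki_get("Main_Page", baseurl = "http://en.wikipedia.org/w", format = "xml")
+# html <- wiki_parse("Main_Page", baseurl = "http://en.wikipedia.org/w")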