Functions for migrating from OWW to Jekyll

1 parent c489da8 commit 1efa80e913006051083544275f6af9b5ab9ccb0a @cboettig committed Sep 28, 2012
Showing with 167 additions and 0 deletions.
  1. +117 −0 export_oww.R
  2. +1 −0 shortcodes.R
  3. +49 −0 wiki_get.R
export_oww.R
@@ -0,0 +1,117 @@
+require(httr)
+require(gsubfn)
+
+#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries.
+oww_to_md <- function(pages, user="Carl_Boettiger", baseurl = "http://openwetware.org"){
+ # Store filenames (with dates) and categories for each page, for reference later
+ filenames <- get_filenames(pages)
+ categories <- get_categories(pages)
+ ## Use the API function to extract the content
+ parsed <- export_oww(pages, user, baseurl)
+ ## Add the baseurl back into all the links
+ htmls <- add_baseurl(parsed)
+ ## Create YAML header text
+ headers <- add_header_txt(parsed, categories)
+ ## Use pandoc to convert the html to markdown
+ mds <- html_to_md(htmls, filenames)
+ ## Delete the OWW header info we don't want in the markdown
+ mds <- clean_md(mds)
+ ## Stick headers onto markdown files and name them according to filenames
+ write_mds(mds, headers, filenames)
+}
+
+get_filenames <- function(pages){
+ lapply(pages, function(p){
+ p <- gsub(" ", "_", p)
+ filename <- gsub("/", "-", p)
+ filename <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename)
+ })
+}
+
+#' Extract the notebook category (the part of the title before the date) from each page title.
+get_categories <- function(pages){
+ lapply(pages, function(p){
+ p <- gsub("_", " ", p)
+ filename <- gsub("/", "-", p)
+ category <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename)
+ })
+}
+
+
+#' Fetch and parse each of a user's OWW notebook pages through the MediaWiki API.
+export_oww <- function(pages, user="Carl_Boettiger", baseurl= "http://openwetware.org"){
+ lapply(pages, function(p){
+ p <- gsub(" ", "_", p)
+ page <- paste("User:", user, "/Notebook/", p, sep="")
+ out <- wiki_parse(page, baseurl=baseurl)
+ })
+}
+
+add_baseurl <- function(parsed, baseurl = "http://openwetware.org"){
+ lapply(parsed, function(out){
+ html <- out$parse$text[[1]]
+ html <- gsubfn("src=\"/images/", paste("src=\"",
+ baseurl,"/images/", sep=""), html)
+ html <- gsubfn("href=\"/wiki/", paste("src=\"",
+ baseurl,"/wiki/",
+ sep=""), html)
+ })
+}
+
+add_header_txt <- function(parsed, mycategories){
+ ## use lapply (not sapply) so each page keeps its full header vector
+ lapply(seq_along(parsed), function(i){
+ out <- parsed[[i]]
+ category <- mycategories[[i]]
+ if(length(out$parse$categories)>0){
+ categories <- sapply(out$parse$categories, function(x) x$`*`)
+ gsub("_", " ", categories)
+ ## Standarize some of my tags, called "categories" on OWW
+ tags <- paste("tags: ", "[",
+ paste0(categories, collapse = ", "),
+ "]", sep="")
+ } else {
+ tags <- ""
+ }
+
+ header <- c("---",
+ "layout: post",
+ tags,
+ paste("categories: ", category),
+ "---\n\n")
+ })
+}
+
+
+html_to_md <- function(htmls, filenames){
+ lapply(seq_along(htmls), function(i){
+ writeLines(htmls[[i]], paste(filenames[i], ".html", sep=""))
+ md <- system(paste("pandoc ", filenames[i], ".html -w markdown",
+ sep=""), intern=TRUE)
+ })
+}
+
+
+clean_md <- function(mds){
+ lapply(mds, function(md){
+ ## Remove OWW header
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)", "", md)
+ md <- gsub("Main project", "", md)
+ md <- gsub("page\\\\", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)", "", md)
+ md <- gsub("Previous$", "", md)
+ md <- gsub("^entry Next$", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md)
+ md <- gsub("^entry$", "", md)
+ md <- gsub("^Stochastic Population Dynamics", "", md)
+ md <- gsub("^Comparative Phylogenetics$", "", md)
+ md <- gsub("^Teaching$", "", md)
+ })
+}
+
+
+
+write_mds <- function(mds, headers, filenames){
+ lapply(seq_along(mds), function(i){
+ writeLines(c(headers[[i]], mds[[i]]), paste(filenames[i], ".markdown", sep=""))
+ })
+}
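
For context (not part of the commit), a minimal usage sketch of the export pipeline above. The page titles and dates are hypothetical, but follow the "Notebook name/YYYY-MM-DD" form that get_filenames() and get_categories() expect; pandoc must be on the PATH for html_to_md().

require(httr)
require(gsubfn)
source("wiki_get.R")     # provides wiki_parse()
source("export_oww.R")

# Hypothetical notebook pages under User:Carl_Boettiger/Notebook/ on OWW
pages <- c("Stochastic Population Dynamics/2010-06-01",
           "Comparative Phylogenetics/2011-03-15")

# Writes e.g. 2010-06-01-Stochastic_Population_Dynamics.markdown to the
# working directory, each file opening with YAML front matter such as:
#   ---
#   layout: post
#   tags: [ ... ]
#   categories: Stochastic Population Dynamics
#   ---
oww_to_md(pages, user = "Carl_Boettiger", baseurl = "http://openwetware.org")
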
shortcodes.R
@@ -85,6 +85,7 @@ fix_flickr <- function(files){
lapply(files, function(file){
content <- readLines(file)
+ content <- gsubfn("\\[flickr.*?\\]http://www.flickr.com\\/photos\\/cboettig\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
content <- gsubfn("\\[flickr.*?\\]http://flickr.com\\/photos\\/46456847@N08\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
content <- gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url, content)
writeLines(content, file)
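
For reference (not part of the commit): gsubfn() from the gsubfn package calls its replacement function with each regex capture group, so the flickr_url() helper used above receives the photo id captured from the shortcode and returns the text that replaces the whole match. A self-contained sketch with a stand-in for flickr_url(), whose real definition lives elsewhere in shortcodes.R and may differ:

library(gsubfn)
# Stand-in only; the actual flickr_url() may build an <img> tag or another URL form
flickr_url_sketch <- function(id) paste0("http://flickr.com/photos/cboettig/", id)
gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url_sketch,
       "see [flickr]1234567[/flickr]")
# returns "see http://flickr.com/photos/cboettig/1234567"
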
wiki_get.R
@@ -0,0 +1,49 @@
+#' Place a GET call to the MediaWiki API using the query action
+#' @param titles the title of the page to be grabbed.
+#' @param baseurl the base url of the Wiki, e.g. "http://en.wikipedia.org/w".
+#' @param format the desired format for the output
+#' @param prop the page property to request; defaults to "revisions" (most recent first)
+#' @param rvprop what should be returned from the revision; defaults to c("content", "tags"). See Details. Can include more than one of the options.
+#' @return the contents of a MediaWiki page in the requested format
+#' @import httr gsubfn
+#' @details
+#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content
+#' Parameters:
+#' rvprop - Which properties to get for each revision:
+#' ids - The ID of the revision
+#' flags - Revision flags (minor)
+#' timestamp - The timestamp of the revision
+#' user - User that made the revision
+#' userid - User id of revision creator
+#' size - Length (bytes) of the revision
+#' sha1 - SHA-1 (base 16) of the revision
+#' comment - Comment by the user for revision
+#' parsedcomment - Parsed comment by the user for the revision
+#' content - Text of the revision
+#' tags - Tags for the revision
+#' Values (separate with '|'): ids, flags, timestamp, user, userid, size, sha1, comment, parsedcomment, content, tags, flagged
+#' Default: ids|timestamp|flags|comment|user
+#'
+#' @references See http://www.mediawiki.org/wiki/API:Main_page for an introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API
+#' @export
+wiki_get <- function(titles, baseurl="http://en.wikipedia.org/w", format=c("xml", "json"), prop=c("revisions"), rvprop=c("content", "tags"), ...){
+ format <- match.arg(format)
+ if(length(rvprop) > 1)
+ rvprop <- paste0(rvprop, collapse="|")
+ action <- "query"
+ addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep="")
+ config <- c(add_headers("User-Agent" = "rwiki"), ...)
+ GET(addr, config=config)
+}
+
+#' Place a GET call to the MediaWiki API using the parse action and return the parsed response.
+wiki_parse <- function(page, baseurl, format="json", ...){
+ require(httr)
+ action = "parse"
+ addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&page=", page, sep="")
+ config <- c(add_headers("User-Agent" = "rwiki"), ...)
+ out <- GET(addr, config=config)
+ parsed_content(out)
+}
+
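
A usage sketch for these two helpers (not part of the commit), following the example URL in the documentation above. Note that titles are pasted straight into the query string, so spaces must be pre-encoded by the caller:

require(httr)
source("wiki_get.R")

# Current revision of Wikipedia's Main Page as XML (an httr response object)
rev <- wiki_get("Main%20Page", baseurl = "http://en.wikipedia.org/w",
                format = "xml", rvprop = "content")

# Rendered HTML of an OWW notebook page as parsed JSON; this is what
# export_oww() relies on: out$parse$text[[1]] holds the page HTML
out <- wiki_parse("User:Carl_Boettiger/Notebook/Comparative_Phylogenetics",
                  baseurl = "http://openwetware.org")
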
