Functions for migrating from OWW to Jekyll

cboettig · Sep 28, 2012 · 1efa80e · 1efa80e
1 parent c489da8
commit 1efa80e
Show file tree

Hide file tree

Showing 3 changed files with 167 additions and 0 deletions.
diff --git a/export_oww.R b/export_oww.R
@@ -0,0 +1,117 @@
+require(httr)
+require(gsubfn)
+
+oww_to_md <- function(pages, user="Carl_Boettiger",  baseurl = "http://openwetware.org"){
+  # Store filenames (with dates) and categories for each page, for reference later
+  filenames <- get_filenames(pages)
+  categories <- get_categories(pages)
+  ## Use the API function to extract the content
+  parsed <- export_oww(pages, user, baseurl)
+  ## Add the baseurl back into all the links
+  htmls <- add_baseurl(parsed)
+  ## Create YAML header text
+  headers <- add_header_txt(parsed, categories)
+  ## Use pandoc to convert the html to markdown
+  mds <- html_to_md(htmls, filenames) 
+  ## Delete the OWW header info we don't want in the markdown
+  mds <- clean_md(mds)
+  ## Stick headers onto markdown files and name them according to filenames
+  write_mds(mds, headers, filenames) 
+}
+
+get_filenames <- function(pages){
+  lapply(pages, function(p){
+    p <- gsub(" ", "_", p)
+    filename <- gsub("/", "-", p)
+    filename <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename)
+  })
+}
+
+#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries.  
+get_categories <- function(pages){
+  lapply(pages, function(p){
+    p <- gsub("_", " ", p)
+    filename <- gsub("/", "-", p)
+    category <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename)
+  })
+}
+
+
+#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries.  
+export_oww <- function(pages, user="Carl_Boettiger",  baseurl=  "http://openwetware.org"){
+  lapply(pages, function(p){
+    p <- gsub(" ", "_", p)
+    page <- paste("User:", user, "/Notebook/", p, sep="")
+    out <- wiki_parse(page, baseurl=baseurl)
+  })
+}
+
+add_baseurl <- function(parsed,  baseurl = "http://openwetware.org"){
+  lapply(parsed, function(out){
+    html <- out$parse$text[[1]]
+    html <- gsubfn("src=\"/images/", paste("src=\"", 
+                   baseurl,"/images/", sep=""), html)
+    html <- gsubfn("href=\"/wiki/", paste("src=\"", 
+                   baseurl,"/wiki/", 
+                    sep=""), html)
+  })
+}
+
+add_header_txt <- function(parsed, mycategories){
+  sapply(1:length(parsed), function(i){
+    out <- parsed[[i]]
+    category <- mycategories[[i]]
+    if(length(out$parse$categories)>0){
+      categories <- sapply(out$parse$categories, function(x) x$`*`)
+      gsub("_", " ", categories)
+      ## Standarize some of my tags, called "categories" on OWW
+      tags <- paste("tags: ", "[", 
+                          paste0(categories, collapse = ", "),
+                          "]", sep="")  
+    } else {
+      tags <- ""
+    }
+
+    header <- c("---", 
+                "layout: post",
+                tags, 
+                paste("categories: ", category),
+                "---\n\n")
+  })
+}
+
+
+html_to_md <- function(htmls, filenames){
+  lapply(1:length(htmls), function(html){
+    writeLines(htmls[[i]], paste(filenames[i], ".html", sep=""))
+    md <- system(paste("pandoc ", filenames[i], ".html -w markdown", 
+                       sep=""), intern=TRUE)
+  })
+}
+
+
+clean_md <- function(mds){
+  lapply(mds, function(md){
+    ## Remove OWW header
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)",  "", md)
+    md <- gsub("Main project", "", md)
+    md <- gsub("page\\\\", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)",  "", md)
+    md <- gsub("Previous$",  "", md)
+    md <- gsub("^entry Next$", "", md)
+    md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md)
+    md <- gsub("^entry$", "", md)
+    md <- gsub("^Stochastic Population Dynamics", "", md)
+    md <- gsub("^Comparative Phylogenetics$", "", md)
+    md <- gsub("^Teaching$", "", md)
+  })
+}
+
+
+
+write_md <- function(mds, headers, filenames){
+  lapply(1:length(mds), function(i){
+    writeLines(c(headers[[i]], mds[[i]]), paste(filenames[i], ".markdown", sep="")) 
+  })
+}
diff --git a/shortcodes.R b/shortcodes.R
@@ -85,6 +85,7 @@ fix_flickr <- function(files){
 
   lapply(files, function(file){
     content <- readLines(file)
+    content <- gsubfn("\\[flickr.*?\\]http://www.flickr.com\\/photos\\/cboettig\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
     content <- gsubfn("\\[flickr.*?\\]http://flickr.com\\/photos\\/46456847@N08\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
     content <- gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url, content)
     writeLines(content, file)

diff --git a/wiki_get.R b/wiki_get.R
@@ -0,0 +1,49 @@
+#' Place a GET call to the mediawiki API using query action
+#' @param titles the tile of the page to be grabbed. 
+#' @param baseurl the base url of the Wiki, e.g. "http://en.wikipedia.org/w".  
+#' @param format the desired format for the output
+#' @param prop the revision desired.  Default is most recent first
+#' @param rvprop What should be returned from the revision. Defaults to "content". see details. Can include more than one of the options. 
+#' @returns the contents of a mediawiki page in the requested format
+#' @import httr gsubfn 
+#' @details
+#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content
+#'Parameters:
+#'  rvprop              - Which properties to get for each revision:
+#'                         ids            - The ID of the revision
+#'                         flags          - Revision flags (minor)
+#'                         timestamp      - The timestamp of the revision
+#'                         user           - User that made the revision
+#'                         userid         - User id of revision creator
+#'                         size           - Length (bytes) of the revision
+#'                         sha1           - SHA-1 (base 16) of the revision
+#'                         comment        - Comment by the user for revision
+#'                         parsedcomment  - Parsed comment by the user for the revision
+#'                         content        - Text of the revision
+#'                         tags           - Tags for the revision
+#'                        Values (separate with '|'): ids, flags, timestamp, user, userid, size, sha1, comment, parsedcomment, content, tags, flagged
+#'                        Default: ids|timestamp|flags|comment|user
+#'
+#' @references See http://www.mediawiki.org/wiki/API:Main_page for and introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API
+#' @export
+wiki_get <- function(titles,  baseurl="http://wikipedia.org", format=c("xml", "json"), prop=c("revisions"), rvprop=c("content", "tags"), ...){
+  format <- match.arg(format)
+#  if(!is.null(baseurl))
+  if(length(rvprop) > 1)
+    rvprop <- paste0(rvprop, collapse="|")
+  action="query"
+  addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep="")
+  config <- c(add_headers("User-Agent" = "rwiki"), ...)
+  out <- GET(addr, config=config)
+}
+
+# 
+wiki_parse <- function(page, baseurl, format="json", ...){
+  require(httr)
+  action = "parse"
+  addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&page=", page, sep="")
+  config <- c(add_headers("User-Agent" = "rwiki"), ...)
+  out <- GET(addr, config=config)
+  parsed_content(out)
+}
+