Navigation Menu

Skip to content

Commit

Permalink
Functions for migrating from OWW to Jekyll
Browse files Browse the repository at this point in the history
  • Loading branch information
cboettig committed Sep 28, 2012
1 parent c489da8 commit 1efa80e
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 0 deletions.
117 changes: 117 additions & 0 deletions export_oww.R
@@ -0,0 +1,117 @@
require(httr)
require(gsubfn)

oww_to_md <- function(pages, user="Carl_Boettiger", baseurl = "http://openwetware.org"){
# Store filenames (with dates) and categories for each page, for reference later
filenames <- get_filenames(pages)
categories <- get_categories(pages)
## Use the API function to extract the content
parsed <- export_oww(pages, user, baseurl)
## Add the baseurl back into all the links
htmls <- add_baseurl(parsed)
## Create YAML header text
headers <- add_header_txt(parsed, categories)
## Use pandoc to convert the html to markdown
mds <- html_to_md(htmls, filenames)
## Delete the OWW header info we don't want in the markdown
mds <- clean_md(mds)
## Stick headers onto markdown files and name them according to filenames
write_mds(mds, headers, filenames)
}

get_filenames <- function(pages){
lapply(pages, function(p){
p <- gsub(" ", "_", p)
filename <- gsub("/", "-", p)
filename <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename)
})
}

#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries.
get_categories <- function(pages){
lapply(pages, function(p){
p <- gsub("_", " ", p)
filename <- gsub("/", "-", p)
category <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename)
})
}


#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries.
export_oww <- function(pages, user="Carl_Boettiger", baseurl= "http://openwetware.org"){
lapply(pages, function(p){
p <- gsub(" ", "_", p)
page <- paste("User:", user, "/Notebook/", p, sep="")
out <- wiki_parse(page, baseurl=baseurl)
})
}

add_baseurl <- function(parsed, baseurl = "http://openwetware.org"){
lapply(parsed, function(out){
html <- out$parse$text[[1]]
html <- gsubfn("src=\"/images/", paste("src=\"",
baseurl,"/images/", sep=""), html)
html <- gsubfn("href=\"/wiki/", paste("src=\"",
baseurl,"/wiki/",
sep=""), html)
})
}

add_header_txt <- function(parsed, mycategories){
sapply(1:length(parsed), function(i){
out <- parsed[[i]]
category <- mycategories[[i]]
if(length(out$parse$categories)>0){
categories <- sapply(out$parse$categories, function(x) x$`*`)
gsub("_", " ", categories)
## Standarize some of my tags, called "categories" on OWW
tags <- paste("tags: ", "[",
paste0(categories, collapse = ", "),
"]", sep="")
} else {
tags <- ""
}

header <- c("---",
"layout: post",
tags,
paste("categories: ", category),
"---\n\n")
})
}


html_to_md <- function(htmls, filenames){
lapply(1:length(htmls), function(html){
writeLines(htmls[[i]], paste(filenames[i], ".html", sep=""))
md <- system(paste("pandoc ", filenames[i], ".html -w markdown",
sep=""), intern=TRUE)
})
}


clean_md <- function(mds){
lapply(mds, function(md){
## Remove OWW header
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md)
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)", "", md)
md <- gsub("Main project", "", md)
md <- gsub("page\\\\", "", md)
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)", "", md)
md <- gsub("Previous$", "", md)
md <- gsub("^entry Next$", "", md)
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md)
md <- gsub("^entry$", "", md)
md <- gsub("^Stochastic Population Dynamics", "", md)
md <- gsub("^Comparative Phylogenetics$", "", md)
md <- gsub("^Teaching$", "", md)
})
}



write_md <- function(mds, headers, filenames){
lapply(1:length(mds), function(i){
writeLines(c(headers[[i]], mds[[i]]), paste(filenames[i], ".markdown", sep=""))
})
}
1 change: 1 addition & 0 deletions shortcodes.R
Expand Up @@ -85,6 +85,7 @@ fix_flickr <- function(files){


lapply(files, function(file){ lapply(files, function(file){
content <- readLines(file) content <- readLines(file)
content <- gsubfn("\\[flickr.*?\\]http://www.flickr.com\\/photos\\/cboettig\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
content <- gsubfn("\\[flickr.*?\\]http://flickr.com\\/photos\\/46456847@N08\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content) content <- gsubfn("\\[flickr.*?\\]http://flickr.com\\/photos\\/46456847@N08\\/(\\d+)?\\[\\/flickr\\]", flickr_url, content)
content <- gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url, content) content <- gsubfn("\\[flickr.*?\\](\\d+)?\\[\\/flickr\\]", flickr_url, content)
writeLines(content, file) writeLines(content, file)
Expand Down
49 changes: 49 additions & 0 deletions wiki_get.R
@@ -0,0 +1,49 @@
#' Place a GET call to the mediawiki API using query action
#' @param titles the tile of the page to be grabbed.
#' @param baseurl the base url of the Wiki, e.g. "http://en.wikipedia.org/w".
#' @param format the desired format for the output
#' @param prop the revision desired. Default is most recent first
#' @param rvprop What should be returned from the revision. Defaults to "content". see details. Can include more than one of the options.
#' @returns the contents of a mediawiki page in the requested format
#' @import httr gsubfn
#' @details
#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content
#'Parameters:
#' rvprop - Which properties to get for each revision:
#' ids - The ID of the revision
#' flags - Revision flags (minor)
#' timestamp - The timestamp of the revision
#' user - User that made the revision
#' userid - User id of revision creator
#' size - Length (bytes) of the revision
#' sha1 - SHA-1 (base 16) of the revision
#' comment - Comment by the user for revision
#' parsedcomment - Parsed comment by the user for the revision
#' content - Text of the revision
#' tags - Tags for the revision
#' Values (separate with '|'): ids, flags, timestamp, user, userid, size, sha1, comment, parsedcomment, content, tags, flagged
#' Default: ids|timestamp|flags|comment|user
#'
#' @references See http://www.mediawiki.org/wiki/API:Main_page for and introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API
#' @export
wiki_get <- function(titles, baseurl="http://wikipedia.org", format=c("xml", "json"), prop=c("revisions"), rvprop=c("content", "tags"), ...){
format <- match.arg(format)
# if(!is.null(baseurl))
if(length(rvprop) > 1)
rvprop <- paste0(rvprop, collapse="|")
action="query"
addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep="")
config <- c(add_headers("User-Agent" = "rwiki"), ...)
out <- GET(addr, config=config)
}

#
wiki_parse <- function(page, baseurl, format="json", ...){
require(httr)
action = "parse"
addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&page=", page, sep="")
config <- c(add_headers("User-Agent" = "rwiki"), ...)
out <- GET(addr, config=config)
parsed_content(out)
}

0 comments on commit 1efa80e

Please sign in to comment.