Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Functions for migrating from OWW to Jekyll
- Loading branch information
Showing
3 changed files
with
167 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,117 @@ | |||
require(httr) | |||
require(gsubfn) | |||
|
|||
oww_to_md <- function(pages, user="Carl_Boettiger", baseurl = "http://openwetware.org"){ | |||
# Store filenames (with dates) and categories for each page, for reference later | |||
filenames <- get_filenames(pages) | |||
categories <- get_categories(pages) | |||
## Use the API function to extract the content | |||
parsed <- export_oww(pages, user, baseurl) | |||
## Add the baseurl back into all the links | |||
htmls <- add_baseurl(parsed) | |||
## Create YAML header text | |||
headers <- add_header_txt(parsed, categories) | |||
## Use pandoc to convert the html to markdown | |||
mds <- html_to_md(htmls, filenames) | |||
## Delete the OWW header info we don't want in the markdown | |||
mds <- clean_md(mds) | |||
## Stick headers onto markdown files and name them according to filenames | |||
write_mds(mds, headers, filenames) | |||
} | |||
|
|||
get_filenames <- function(pages){ | |||
lapply(pages, function(p){ | |||
p <- gsub(" ", "_", p) | |||
filename <- gsub("/", "-", p) | |||
filename <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename) | |||
}) | |||
} | |||
|
|||
#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries. | |||
get_categories <- function(pages){ | |||
lapply(pages, function(p){ | |||
p <- gsub("_", " ", p) | |||
filename <- gsub("/", "-", p) | |||
category <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename) | |||
}) | |||
} | |||
|
|||
|
|||
#' Take a list of page titles to a user's OWW notebooks and export them as Jekyll markdown entries. | |||
export_oww <- function(pages, user="Carl_Boettiger", baseurl= "http://openwetware.org"){ | |||
lapply(pages, function(p){ | |||
p <- gsub(" ", "_", p) | |||
page <- paste("User:", user, "/Notebook/", p, sep="") | |||
out <- wiki_parse(page, baseurl=baseurl) | |||
}) | |||
} | |||
|
|||
add_baseurl <- function(parsed, baseurl = "http://openwetware.org"){ | |||
lapply(parsed, function(out){ | |||
html <- out$parse$text[[1]] | |||
html <- gsubfn("src=\"/images/", paste("src=\"", | |||
baseurl,"/images/", sep=""), html) | |||
html <- gsubfn("href=\"/wiki/", paste("src=\"", | |||
baseurl,"/wiki/", | |||
sep=""), html) | |||
}) | |||
} | |||
|
|||
add_header_txt <- function(parsed, mycategories){ | |||
sapply(1:length(parsed), function(i){ | |||
out <- parsed[[i]] | |||
category <- mycategories[[i]] | |||
if(length(out$parse$categories)>0){ | |||
categories <- sapply(out$parse$categories, function(x) x$`*`) | |||
gsub("_", " ", categories) | |||
## Standarize some of my tags, called "categories" on OWW | |||
tags <- paste("tags: ", "[", | |||
paste0(categories, collapse = ", "), | |||
"]", sep="") | |||
} else { | |||
tags <- "" | |||
} | |||
|
|||
header <- c("---", | |||
"layout: post", | |||
tags, | |||
paste("categories: ", category), | |||
"---\n\n") | |||
}) | |||
} | |||
|
|||
|
|||
html_to_md <- function(htmls, filenames){ | |||
lapply(1:length(htmls), function(html){ | |||
writeLines(htmls[[i]], paste(filenames[i], ".html", sep="")) | |||
md <- system(paste("pandoc ", filenames[i], ".html -w markdown", | |||
sep=""), intern=TRUE) | |||
}) | |||
} | |||
|
|||
|
|||
clean_md <- function(mds){ | |||
lapply(mds, function(md){ | |||
## Remove OWW header | |||
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md) | |||
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)", "", md) | |||
md <- gsub("Main project", "", md) | |||
md <- gsub("page\\\\", "", md) | |||
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)", "", md) | |||
md <- gsub("Previous$", "", md) | |||
md <- gsub("^entry Next$", "", md) | |||
md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md) | |||
md <- gsub("^entry$", "", md) | |||
md <- gsub("^Stochastic Population Dynamics", "", md) | |||
md <- gsub("^Comparative Phylogenetics$", "", md) | |||
md <- gsub("^Teaching$", "", md) | |||
}) | |||
} | |||
|
|||
|
|||
|
|||
write_md <- function(mds, headers, filenames){ | |||
lapply(1:length(mds), function(i){ | |||
writeLines(c(headers[[i]], mds[[i]]), paste(filenames[i], ".markdown", sep="")) | |||
}) | |||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,49 @@ | |||
#' Place a GET call to the mediawiki API using query action | |||
#' @param titles the tile of the page to be grabbed. | |||
#' @param baseurl the base url of the Wiki, e.g. "http://en.wikipedia.org/w". | |||
#' @param format the desired format for the output | |||
#' @param prop the revision desired. Default is most recent first | |||
#' @param rvprop What should be returned from the revision. Defaults to "content". see details. Can include more than one of the options. | |||
#' @returns the contents of a mediawiki page in the requested format | |||
#' @import httr gsubfn | |||
#' @details | |||
#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content | |||
#'Parameters: | |||
#' rvprop - Which properties to get for each revision: | |||
#' ids - The ID of the revision | |||
#' flags - Revision flags (minor) | |||
#' timestamp - The timestamp of the revision | |||
#' user - User that made the revision | |||
#' userid - User id of revision creator | |||
#' size - Length (bytes) of the revision | |||
#' sha1 - SHA-1 (base 16) of the revision | |||
#' comment - Comment by the user for revision | |||
#' parsedcomment - Parsed comment by the user for the revision | |||
#' content - Text of the revision | |||
#' tags - Tags for the revision | |||
#' Values (separate with '|'): ids, flags, timestamp, user, userid, size, sha1, comment, parsedcomment, content, tags, flagged | |||
#' Default: ids|timestamp|flags|comment|user | |||
#' | |||
#' @references See http://www.mediawiki.org/wiki/API:Main_page for and introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API | |||
#' @export | |||
wiki_get <- function(titles, baseurl="http://wikipedia.org", format=c("xml", "json"), prop=c("revisions"), rvprop=c("content", "tags"), ...){ | |||
format <- match.arg(format) | |||
# if(!is.null(baseurl)) | |||
if(length(rvprop) > 1) | |||
rvprop <- paste0(rvprop, collapse="|") | |||
action="query" | |||
addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep="") | |||
config <- c(add_headers("User-Agent" = "rwiki"), ...) | |||
out <- GET(addr, config=config) | |||
} | |||
|
|||
# | |||
wiki_parse <- function(page, baseurl, format="json", ...){ | |||
require(httr) | |||
action = "parse" | |||
addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&page=", page, sep="") | |||
config <- c(add_headers("User-Agent" = "rwiki"), ...) | |||
out <- GET(addr, config=config) | |||
parsed_content(out) | |||
} | |||
|