rwiki template for package (old)

commit c9c89307f64d665e29fb0f7095387f7837810264 1 parent 72f6c29
@cboettig authored
2  R-tricks/rwiki/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.markdown
15 R-tricks/rwiki/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: rwiki
+Description: An R interface to the MediaWiki API
+Title: An R interface to the MediaWiki API
+Version: 0.1
+License: CC0
+URL: https://github.com/ropensci/rwiki
+BugReports: https://github.com/cboettig/rwiki/issues
+Date: 2012
+Authors@R: c(person("Carl", "Boettiger", role = c("aut", "cre"), email =
+ "cboettig@gmail.com"))
+Depends:
+ R (>= 2.15)
+Imports:
+    httr (>= 0.1.1),
+    gsubfn
117 R-tricks/rwiki/R/export_oww.R
@@ -0,0 +1,117 @@
+require(httr)
+require(gsubfn)
+#' Take a list of page titles to a user's OWW notebook and export them as Jekyll markdown entries.
+oww_to_md <- function(pages, user="Carl_Boettiger", baseurl = "http://openwetware.org"){
+ # Store filenames (with dates) and categories for each page, for reference later
+ filenames <- get_filenames(pages)
+ categories <- get_categories(pages)
+ ## Use the API function to extract the content
+ parsed <- export_oww(pages, user, baseurl)
+ ## Add the baseurl back into all the links
+ htmls <- add_baseurl(parsed)
+ ## Create YAML header text
+ headers <- add_header_txt(parsed, categories)
+ ## Use pandoc to convert the html to markdown
+ mds <- html_to_md(htmls, filenames)
+ ## Delete the OWW header info we don't want in the markdown
+ mds <- clean_md(mds)
+ ## Stick headers onto markdown files and name them according to filenames
+ write_mds(mds, headers, filenames)
+}
+
+get_filenames <- function(pages){
+ lapply(pages, function(p){
+ p <- gsub(" ", "_", p)
+ filename <- gsub("/", "-", p)
+ filename <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\2-\\1", filename)
+ })
+}
+
+#' Extract the notebook category (the part of the title before the date) from each page title.
+get_categories <- function(pages){
+ lapply(pages, function(p){
+ p <- gsub("_", " ", p)
+ filename <- gsub("/", "-", p)
+ category <- gsub("(.*)-(\\d+-\\d+-\\d+)", "\\1", filename)
+ })
+}
+
+
+#' Fetch the parsed content of each notebook page from the MediaWiki API.
+export_oww <- function(pages, user="Carl_Boettiger", baseurl= "http://openwetware.org"){
+ lapply(pages, function(p){
+ p <- gsub(" ", "_", p)
+ page <- paste("User:", user, "/Notebook/", p, sep="")
+ out <- wiki_parse(page, baseurl=baseurl)
+ })
+}
+
+add_baseurl <- function(parsed, baseurl = "http://openwetware.org"){
+ lapply(parsed, function(out){
+ html <- out$parse$text[[1]]
+ html <- gsubfn("src=\"/images/", paste("src=\"",
+ baseurl,"/images/", sep=""), html)
+ html <- gsubfn("href=\"/wiki/", paste("src=\"",
+ baseurl,"/wiki/",
+ sep=""), html)
+ })
+}
+
+add_header_txt <- function(parsed, mycategories){
+    lapply(1:length(parsed), function(i){
+ out <- parsed[[i]]
+ category <- mycategories[[i]]
+ if(length(out$parse$categories)>0){
+ categories <- sapply(out$parse$categories, function(x) x$`*`)
+ gsub("_", " ", categories)
+ ## Standarize some of my tags, called "categories" on OWW
+ tags <- paste("tags: ", "[",
+ paste0(categories, collapse = ", "),
+ "]", sep="")
+ } else {
+ tags <- ""
+ }
+
+ header <- c("---",
+ "layout: post",
+ tags,
+ paste("categories: ", category),
+ "---\n\n")
+ })
+}
+
+
+html_to_md <- function(htmls, filenames){
+  lapply(1:length(htmls), function(i){
+    writeLines(htmls[[i]], paste(filenames[[i]], ".html", sep=""))
+    md <- system(paste("pandoc ", filenames[[i]], ".html -w markdown",
+                       sep=""), intern=TRUE)
+ })
+}
+
+
+clean_md <- function(mds){
+ lapply(mds, function(md){
+ ## Remove OWW header
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/f/f8/Owwnotebook_icon.png\\)", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/9/94/Report.png\\)", "", md)
+ md <- gsub("Main project", "", md)
+ md <- gsub("page\\\\", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/c/c3/Resultset_previous.png\\)", "", md)
+ md <- gsub("Previous$", "", md)
+ md <- gsub("^entry Next$", "", md)
+ md <- gsub("!\\[image\\]\\(http://openwetware.org/images/5/5c/Resultset_next.png\\)", "", md)
+ md <- gsub("^entry$", "", md)
+ md <- gsub("^Stochastic Population Dynamics", "", md)
+ md <- gsub("^Comparative Phylogenetics$", "", md)
+ md <- gsub("^Teaching$", "", md)
+ })
+}
+
+
+
+write_mds <- function(mds, headers, filenames){
+  lapply(1:length(mds), function(i){
+    writeLines(c(headers[[i]], mds[[i]]), paste(filenames[[i]], ".markdown", sep=""))
+ })
+}
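A minimal usage sketch of the pipeline above. The page titles here are hypothetical (real ones are the sub-pages under the user's OWW notebook, named "<category>/<YYYY-MM-DD>", matching the category names that clean_md strips):

    pages <- c("Stochastic Population Dynamics/2011-06-14",
               "Comparative Phylogenetics/2011-06-15")
    oww_to_md(pages, user = "Carl_Boettiger")

Each title is fetched with the MediaWiki parse action, converted to markdown via pandoc, and written out as a dated Jekyll post with a YAML header.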
49 R-tricks/rwiki/R/wiki_get.R
@@ -0,0 +1,49 @@
+#' Place a GET call to the MediaWiki API using the query action
+#' @param titles the title of the page to be retrieved.
+#' @param baseurl the base url of the Wiki, e.g. "http://en.wikipedia.org/w".
+#' @param format the desired format for the output
+#' @param prop the page property to request; defaults to "revisions"
+#' @param rvprop What should be returned from the revision. Defaults to "content". see details. Can include more than one of the options.
+#' @return the contents of a MediaWiki page in the requested format
+#' @import httr gsubfn
+#' @details
+#' http://en.wikipedia.org/w/api.php?format=xml&action=query&titles=Main%20Page&prop=revisions&rvprop=content
+#'Parameters:
+#' rvprop - Which properties to get for each revision:
+#' ids - The ID of the revision
+#' flags - Revision flags (minor)
+#' timestamp - The timestamp of the revision
+#' user - User that made the revision
+#' userid - User id of revision creator
+#' size - Length (bytes) of the revision
+#' sha1 - SHA-1 (base 16) of the revision
+#' comment - Comment by the user for revision
+#' parsedcomment - Parsed comment by the user for the revision
+#' content - Text of the revision
+#' tags - Tags for the revision
+#' Values (separate with '|'): ids, flags, timestamp, user, userid, size, sha1, comment, parsedcomment, content, tags, flagged
+#' Default: ids|timestamp|flags|comment|user
+#'
+#' @references See http://www.mediawiki.org/wiki/API:Main_page for an introduction, and http://en.wikipedia.org/w/api.php for the full documentation of the MediaWiki API
+#' @export
+wiki_get <- function(titles, baseurl="http://en.wikipedia.org/w", format=c("xml", "json"), prop=c("revisions"), rvprop=c("content", "tags"), ...){
+  format <- match.arg(format)
+  ## collapse multiple rvprop values with "|", as the API expects
+  if(length(rvprop) > 1)
+    rvprop <- paste0(rvprop, collapse="|")
+  action <- "query"
+  addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&titles=", titles, "&prop=", prop, "&rvprop=", rvprop, sep="")
+  config <- c(add_headers("User-Agent" = "rwiki"), ...)
+  GET(addr, config=config)
+}
+
+#' Place a GET call to the MediaWiki API using the parse action
+wiki_parse <- function(page, baseurl, format="json", ...){
+ require(httr)
+ action = "parse"
+ addr <- paste(baseurl, "/api.php?format=", format, "&action=", action, "&page=", page, sep="")
+ config <- c(add_headers("User-Agent" = "rwiki"), ...)
+ out <- GET(addr, config=config)
+ parsed_content(out)
+}
+
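A quick sketch of both calls (requires network access, with httr attached). Titles are pasted straight into the URL, so they must already be URL-safe, e.g. Main%20Page or Main_Page as in the API example above:

    library(httr)
    ## raw wikitext of the latest revision, returned as XML
    out <- wiki_get("Main%20Page", baseurl = "http://en.wikipedia.org/w")
    ## request timestamps as well as content
    out2 <- wiki_get("Main%20Page", baseurl = "http://en.wikipedia.org/w",
                     rvprop = c("content", "timestamp"))
    ## rendered HTML of the page, returned as parsed JSON
    main <- wiki_parse("Main_Page", baseurl = "http://en.wikipedia.org/w")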
21 R-tricks/rwiki/R/wiki_to_md.R
@@ -0,0 +1,21 @@
+
+## Dumb idea to write one's own parser
+wiki_to_md <- function(file){
+ pattern <- "\\b(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]"
+
+ txt <- readLines(file)
+ txt <- gsubfn("^={1}\\s(.+)\\s={1} *$", "# \\1", txt)
+ txt <- gsubfn("^={2}\\s(.+)\\s={2} *$", "## \\1", txt)
+ txt <- gsubfn("^={3}\\s(.+)\\s={3} *$", "### \\1", txt)
+ txt <- gsubfn("^={4}\\s(.+)\\s={4} *$", "### \\1", txt)
+ txt <- gsubfn("^={5}\\s(.+)\\s={5} *$", "### \\1", txt)
+ txt <- gsubfn("^={6}\\s(.+)\\s={6} *$", "### \\1", txt)
+
+ txt <- gsubfn(paste("\\[(", pattern, ") (.+)\\]", sep=""), "[\\2](\\1)", txt)
+
+  ## extract [[Category:...]] tags, allowing an optional piped label
+  categories <- strapply(txt, "\\[\\[Category:(\\w+_*\\w*)( *\\|* (\\w+_*\\w*) *)*\\]\\]")
+
+  ## return both the converted text and any categories found
+  list(txt = txt, categories = categories)
+}
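A sketch of calling this stub on a saved page (the filename is hypothetical; the input is raw wiki markup read line by line):

    library(gsubfn)  ## provides gsubfn() and strapply()
    res <- wiki_to_md("2011-06-14-entry.wiki")
    res$txt         ## headings and external links converted to markdown
    res$categories  ## any [[Category:...]] tags found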
