Skip to content
Browse files

cleanup

  • Loading branch information...
1 parent 3e9e641 commit 5284cefdabcdb43b24ea76bf05a5de2afb0b2a24 @cboettig committed
Showing with 100 additions and 526 deletions.
  1. +6 −0 .gitignore
  2. +94 −0 fixtags.R
  3. +0 −12 minimal.md
  4. +0 −54 minimal.tex
  5. +0 −88 parallel.Rmd
  6. +0 −60 parallel.Rnw
  7. +0 −312 parallel.md
View
6 .gitignore
@@ -0,0 +1,6 @@
+*.Rproj
+.Rproj.user
+.Rhistory
+.RData
+cache/
+figure/
View
94 fixtags.R
@@ -0,0 +1,94 @@
## need to support other yaml lists too

#' Rewrite the "tags:" line of each file's YAML front matter.
#'
#' Strips any existing square brackets, converts hyphens to spaces,
#' lowercases the first letter of each word, and re-emits the tags as a
#' bracketed, comma-separated list: `tags: [a,b,c]`.
#'
#' @param files Character vector of file paths, edited in place.
#' @return Invisibly, a list with one element per file (from lapply).
fix_tags <- function(files) {
  library(gsubfn)  # gsubfn() is needed: the replacement here is a function
  lapply(files, function(file) {

    # Receives the text captured by "tags: (.*)" and returns the full
    # replacement line.
    fix <- function(tags) {
      # BUG FIX: strsplit() returns a list; without [[1]] the gsub() calls
      # below deparse it into the literal string 'c("a", " b")'.
      tags <- strsplit(tags, ",")[[1]]
      tags <- gsub("\\[", "", tags)  # drop brackets if present; re-added below
      tags <- gsub("\\]", "", tags)
      tags <- gsub("-", " ", tags)
      # \L lowercases the captured first letter of each word (perl regex).
      tags <- gsub("\\b(\\w)", "\\L\\1", tags, perl = TRUE)
      # gsubfn inserts a function's return value literally, so the brackets
      # must NOT be escaped: the original's "\\[" wrote a stray backslash
      # ("tags: \[...\]") into the file.
      paste0("tags: [", paste0(tags, collapse = ","), "]")
    }

    content <- readLines(file)
    content <- gsubfn("tags: (.*)", fix, content)

    writeLines(content, file)
  })
}
+
+
+files <- system("ls *.markdown *.md", intern=TRUE)
#' First tag-normalisation pass over post YAML front matter.
#'
#' Lowercases category names and maps legacy tags onto the current set.
#' Only lines between the first two "---" fences are touched; each file
#' is rewritten in place.
#'
#' NOTE(review): this function is redefined later in the script, so this
#' version is shadowed unless it is called before the second definition.
#'
#' @param files Character vector of file paths, edited in place.
#' @return Invisibly, a list with one element per file (from lapply).
easytags <- function(files) {
  # Ordered (pattern -> replacement) rules, applied top to bottom so a
  # later rule sees the output of an earlier one (same order as before).
  # Plain-string replacements need only base gsub(), not gsubfn.
  rules <- c(
    "Seminar"                        = "seminar",
    "Teaching"                       = "teaching",
    "Stochastic Population Dynamics" = "ecology",
    "Phylogenetics"                  = "evolution",
    "Computation"                    = "computation",
    "Logistics"                      = "logistics",
    "ecology/evolution,*"            = "",
    "progress report,*"              = "",
    "site configuration"             = "site-configuration",
    "conferences"                    = "conference"
  )
  lapply(files, function(file) {
    content <- readLines(file)
    fences <- grep("^---$", content)
    # Guard: the original crashed with an NA-subscript error when a file
    # had no YAML front matter; skip such files with a warning instead.
    if (length(fences) < 2) {
      warning("no YAML front matter in ", file, call. = FALSE)
      return(invisible(NULL))
    }
    yaml <- fences[1]:fences[2]
    for (i in seq_along(rules)) {
      content[yaml] <- gsub(names(rules)[i], rules[[i]], content[yaml])
    }
    writeLines(content, file)
  })
}
+
+
#' Second tag-normalisation pass over post YAML front matter.
#'
#' Maps legacy category/tag names onto the current controlled vocabulary.
#' Only lines between the first two "---" fences are touched; each file
#' is rewritten in place.
#'
#' NOTE(review): this redefines easytags(), silently replacing the first
#' definition above — confirm both passes are still wanted.
#'
#' @param files Character vector of file paths, edited in place.
#' @return Invisibly, a list with one element per file (from lapply).
easytags <- function(files) {
  # Ordered (pattern -> replacement) rules, applied top to bottom in the
  # original order. Plain-string replacements need only base gsub().
  # NOTE(review): "api" also matches inside longer words (e.g. "rapid");
  # confirm that is intended before widening the file set.
  rules <- c(
    "categories: Stochastic Population Dynamics" = "categories: [Stochastic Population Dynamics]",
    "OpenScience"             = "open-science",
    "open notebook thoughts"  = "open-science",
    "Beetles"                 = "tribolium",
    "Regimes_model,*"         = "",
    "Paper_Outlines,*"        = "",
    "prosecutor-fallacy,*"    = "",
    "codepost,*"              = "",
    "seminar,*"               = "",
    "abstract,*"              = "",
    "Model_Choice"            = "model-choice",
    "api"                     = "hpc",
    "computing"               = "hpc",
    "ABC"                     = "algorithms",
    "Adaptive Dynamics"       = "adaptive-dynamics",
    "Adaptive_Dynamics"       = "adaptive-dynamics",
    "Warning_signals"         = "warning-signals",
    "early-warning"           = "warning-signals",
    "Code_tricks"             = "code-tricks",
    "code tricks"             = "code-tricks",
    "Web2.0_Tools"            = "code-tricks",
    "Science2.0"              = "open-science",
    "Open Notebook Thoughts"  = "open-science",
    "Carl_meetings"           = "conferences",
    "Carl_talks"              = "conferences",
    "Conferences"             = "conferences",
    "conference[,$]+"         = "conferences"
  )
  lapply(files, function(file) {
    content <- readLines(file)
    fences <- grep("^---$", content)
    # Guard: the original crashed with an NA-subscript error when a file
    # had no YAML front matter; skip such files with a warning instead.
    if (length(fences) < 2) {
      warning("no YAML front matter in ", file, call. = FALSE)
      return(invisible(NULL))
    }
    yaml <- fences[1]:fences[2]
    for (i in seq_along(rules)) {
      content[yaml] <- gsub(names(rules)[i], rules[[i]], content[yaml])
    }
    writeLines(content, file)
  })
}
+
+
#' Normalise legacy "categories:" front-matter lines.
#'
#' Rewrites underscore-style category names to their current spellings.
#' Only lines between the first two "---" fences are touched; each file
#' is rewritten in place.
#'
#' @param files Character vector of file paths, edited in place.
#' @return Invisibly, a list with one element per file (from lapply).
easycategories <- function(files) {
  # Plain-string replacements, so base gsub() suffices (the original
  # pulled in the gsubfn package via require() for no benefit).
  rules <- c(
    "categories: Stochastic_Population_Dynamics" = "categories: Stochastic Population Dynamics",
    "categories: Comparative_Phylogenetics"      = "categories: Phylogenetics"
  )
  lapply(files, function(file) {
    content <- readLines(file)
    fences <- grep("^---$", content)
    # Guard: skip files without YAML front matter instead of erroring on
    # an NA subscript as the original did.
    if (length(fences) < 2) {
      warning("no YAML front matter in ", file, call. = FALSE)
      return(invisible(NULL))
    }
    yaml <- fences[1]:fences[2]
    for (i in seq_along(rules)) {
      content[yaml] <- gsub(names(rules)[i], rules[[i]], content[yaml])
    }
    writeLines(content, file)
  })
}
+
+
+
View
12 minimal.md
@@ -1,12 +0,0 @@
-
-A simple table
-
-
- aa bb
- -- --
- x y
- z w
-
- Table: a caption, with a comma
-
-come on, please work
View
54 minimal.tex
@@ -1,54 +0,0 @@
-\documentclass[]{article}
-\usepackage{amssymb,amsmath}
-\usepackage{ifxetex,ifluatex}
-\ifxetex
- \usepackage{fontspec,xltxtra,xunicode}
- \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase}
-\else
- \ifluatex
- \usepackage{fontspec}
- \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase}
- \else
- \usepackage[utf8]{inputenc}
- \fi
-\fi
-\usepackage{ctable}
-\usepackage{float} % provides the H option for float placement
-\ifxetex
- \usepackage[setpagesize=false, % page size defined by xetex
- unicode=false, % unicode breaks when used with xetex
- xetex,
- colorlinks=true,
- linkcolor=blue]{hyperref}
-\else
- \usepackage[unicode=true,
- colorlinks=true,
- linkcolor=blue]{hyperref}
-\fi
-\hypersetup{breaklinks=true, pdfborder={0 0 0}}
-\setlength{\parindent}{0pt}
-\setlength{\parskip}{6pt plus 2pt minus 1pt}
-\setlength{\emergencystretch}{3em} % prevent overfull lines
-\setcounter{secnumdepth}{0}
-
-
-\begin{document}
-
-A simple table
-
-\ctable[caption = a caption, with a comma, pos = H, center, botcap]{ll}
-{% notes
-}
-{% rows
-\FL
-aa & bb
-\ML
-x & y
-\\\noalign{\medskip}
-z & w
-\LL
-}
-
-come on, please work
-
-\end{document}
View
88 parallel.Rmd
@@ -1,88 +0,0 @@
-# Parallelization on High-Performance Clusters in R
-`ro cache=FALSE or`
-
-# Parallel high performance computing environments for R
-
-This is a minimal tutorial on parallel computing environments for R. While an array of packages (and a lot of native R since the 2.14.0 release) supports multi-core parallelization, allowing R to take advantage of the multiple processors found on a single chip that are now standard in laptop and desktop machines, the focus here is on running R in larger clusters.
-
-Large clusters connect many "compute nodes" together in a way that allows them to share access to data through the hard disk, but each node has its own processor and its own memory. The challenge comes entirely from this latter situation -- unlike the multicore chips on laptops and desktops, the different processors each look at their own memory sitting next to them -- they cannot see the memory (or RAM) on another node. This requires data to be passed back and forth between the nodes explicitly. The Message Passing Interface (MPI) has been the standard protocol to do this on large supercomputers, and will be our focus here. You'll notice that most of these commands directly deal with this challenge of passing data to the different compute nodes.
-
-Note that the cluster or supercomputer architecture is designed essentially with this in mind. All the nodes can still access the same hard disk, and the approach still requires the nodes in the cluster to all be directly connected. This is distinct from more distributed computing or cloud computing architectures where the nodes are not as tightly coupled, and passing data between nodes becomes even more challenging. Also note that this approach still assumes data can be loaded into memory. While that might be several gigabytes per node on most compute clusters, due to the rapid increase in the number of cores or processors we can put on a single chip, the amount of memory per processor has actually been flat or decreasing. GPU architectures are a prime example -- providing 100s of processors on a single node, but very little memory per processor. This trend further exacerbates the basic problem of big data.
-
-## Onto the Code
-
-We will ignore these issues for the time being and simply introduce the syntax for running a command across a series of processors in these high-performance computing environments. The code is usually submitted to a cluster using a queue, which allows multiple users to efficiently share access to the computational resource. The syntax of these queues differs with different software and hardware. In this example, I use [this script](https://github.com/cboettig/sandbox/blob/master/mpi.sh) to request my run on the cluster. The job is then submitted to the cluster using the command
-
-```
-qsub mpi.sh
-```
-## RMPI
-
-The direct Rmpi way:
-
-``` {r }
- library(Rmpi)
- mpi.spawn.Rslaves(nslaves=3)
- slavefn <- function() { print(paste("Hello from", foldNumber)) }
- mpi.bcast.cmd(foldNumber <- mpi.comm.rank())
- mpi.bcast.Robj2slave(slavefn)
- result <- mpi.remote.exec(slavefn())
- print(result)
- mpi.close.Rslaves()
-````
-
-# Benchmark
-
-``` {r }
- A <- matrix(rnorm(1e6), 1e3)
- system.time(A %*% A)
-````
-
-
-
-## SNOW
-
-``` {r }
- library(snow)
- cluster <- makeCluster(4, type="MPI")
- clusterEvalQ(cluster, library(utils)) # load a library
- clusterExport(cluster, ls()) # export everything
- out <- parSapply(cluster, 1:4, function(x) print(paste("snow hello from ", x)))
- print(out)
- system.time(parMM(cluster, A, A))
- stopCluster(cluster)
-````
-
-## SNOWFALL
-(default "SOCK" type, for multicore machines).
-
-``` {r }
- library(snowfall)
- sfInit( parallel=TRUE, cpus=4)
- sfExportAll()
- sfLibrary(utils)
- out <- sfSapply(1:4, function(x) print(paste("snow hello from ", x)))
- print(out)
- system.time(sfMM(A, A))
- sfStop()
-````
-
-Snowfall using MPI mode, for distributing across nodes in a cluster (that use a shared hard disk but don't share memory).
-
-``` {r eval=FALSE }
- library(snowfall)
- sfInit( parallel=TRUE, cpus=4, type="MPI" )
- sfExportAll()
- sfLibrary(utils)
- out <- sfSapply(1:4, function(x) print(paste("snow hello from ", x)))
- print(out)
- system.time(sfMM(A, A))
- sfStop()
-````
-For reasons unknown to me, this last command does not work on farm, though it works fine on NERSC cluster.
-
-snow's close command, which shuts down and quits from script.
-
-``` {r eval=FALSE}
-mpi.quit(save = "no")
-```
View
60 parallel.Rnw
@@ -1,60 +0,0 @@
-\documentclass{article}
-\begin{document}
-
-<<cache=FALSE>>=
-require(doSNOW)
-library(foreach)
-registerDoSNOW(makeCluster(4, type = "SOCK"))
-getDoParWorkers()
-@
-
-When cache=TRUE, function defined in the same chunk can be
-successfully exported automatically by foreach:
-
-<<test, cache=TRUE>>=
-set.seed(100)
-m <- matrix(rnorm(400), 200, 2)
-myMean <- function(x) mean(x)
-
-time <- system.time(res <- foreach(i=1:nrow(m), .combine='c') %dopar%
-{
- myMean(m[i,])
-})
-res[1:3]
-apply(m[1:3, ], 1, mean)
-@
-
-However, if the user defined function is not within the same chunk,
-foreach cannot export this function:
-<<test2, cache=TRUE>>=
-time <- system.time(res2 <- foreach(i=1:nrow(m), .combine='c') %dopar%
-{
- myMean(m[i,])
-})
-res2[1:3]
-apply(m[1:3, ], 1, mean)
-@
-
-Manually exporting the user-defined function and data works fine:
-<<test3, cache=TRUE>>=
-time <- system.time(res3 <-
-foreach(i=1:nrow(m), .combine='c', .export=c('myMean','m')) %dopar% {
- myMean(m[i,])
-})
-res3[1:3]
-apply(m[1:3, ], 1, mean)
-@
-
-If cache=FALSE, everything is fine:
-
-<<test4>>=
-time <- system.time(res4 <- foreach(i=1:nrow(m), .combine='c') %dopar%
-{
- myMean(m[i,])
-})
-res4[1:3]
-apply(m[1:3, ], 1, mean)
-@
-
-
-\end{document}
View
312 parallel.md
@@ -1,312 +0,0 @@
-# Parallelization on High-Performance Clusters in R
-
-
-# Parallel high performance computing environments for R
-
-This is a minimal tutorial on parallel computing environments for R. While an array of packages (and a lot of native R since the 2.14.0 release) supports multi-core parallelization, allowing R to take advantage of the multiple processors found on a single chip that are now standard in laptop and desktop machines, the focus here is on running R in larger clusters.
-
-Large clusters connect many "compute nodes" together in a way that allows them to share access to data through the hard disk, but each node has its own processor and its own memory. The challenge comes entirely from this latter situation -- unlike the multicore chips on laptops and desktops, the different processors each look at their own memory sitting next to them -- they cannot see the memory (or RAM) on another node. This requires data to be passed back and forth between the nodes explicitly. The Message Passing Interface (MPI) has been the standard protocol to do this on large supercomputers, and will be our focus here. You'll notice that most of these commands directly deal with this challenge of passing data to the different compute nodes.
-
-Note that the cluster or supercomputer architecture is designed essentially with this in mind. All the nodes can still access the same hard disk, and the approach still requires the nodes in the cluster to all be directly connected. This is distinct from more distributed computing or cloud computing architectures where the nodes are not as tightly coupled, and passing data between nodes becomes even more challenging. Also note that this approach still assumes data can be loaded into memory. While that might be several gigabytes per node on most compute clusters, due to the rapid increase in the number of cores or processors we can put on a single chip, the amount of memory per processor has actually been flat or decreasing. GPU architectures are a prime example -- providing 100s of processors on a single node, but very little memory per processor. This trend further exacerbates the basic problem of big data.
-
-## Onto the Code
-
-We will ignore these issues for the time being and simply introduce the syntax for running a command across a series of processors in these high-performance computing environments. The code is usually submitted to a cluster using a queue, which allows multiple users to efficiently share access to the computational resource. The syntax of these queues differs with different software and hardware. In this example, I use [this script](https://github.com/cboettig/sandbox/blob/master/mpi.sh) to request my run on the cluster. The job is then submitted to the cluster using the command
-
-```
-qsub mpi.sh
-```
-## RMPI
-
-The direct Rmpi way:
-
-
-
-```r
-library(Rmpi)
-mpi.spawn.Rslaves(nslaves = 3)
-```
-
-
-
-```
-## 3 slaves are spawned successfully. 0 failed.
-## master (rank 0, comm 1) of size 4 is running on: c0-6
-## slave1 (rank 1, comm 1) of size 4 is running on: c0-6
-## slave2 (rank 2, comm 1) of size 4 is running on: c0-6
-## slave3 (rank 3, comm 1) of size 4 is running on: c0-6
-```
-
-
-
-```r
-slavefn <- function() {
- print(paste("Hello from", foldNumber))
-}
-mpi.bcast.cmd(foldNumber <- mpi.comm.rank())
-mpi.bcast.Robj2slave(slavefn)
-result <- mpi.remote.exec(slavefn())
-print(result)
-```
-
-
-
-```
-## $slave1
-## [1] "Hello from 1"
-##
-## $slave2
-## [1] "Hello from 2"
-##
-## $slave3
-## [1] "Hello from 3"
-##
-```
-
-
-
-```r
-mpi.close.Rslaves()
-```
-
-
-
-```
-## [1] 1
-```
-
-
-
-
-# Benchmark
-
-
-
-```r
-A <- matrix(rnorm(1e+06), 1000)
-system.time(A %*% A)
-```
-
-
-
-```
-## user system elapsed
-## 1.475 0.004 1.480
-```
-
-
-
-
-
-
-## SNOW
-
-
-
-```r
-library(snow)
-cluster <- makeCluster(4, type = "MPI")
-```
-
-
-
-```
-## 4 slaves are spawned successfully. 0 failed.
-```
-
-
-
-```r
-clusterEvalQ(cluster, library(utils)) # load a library
-```
-
-
-
-```
-## [[1]]
-## [1] "snow" "Rmpi" "methods" "stats" "graphics" "grDevices"
-## [7] "utils" "datasets" "base"
-##
-## [[2]]
-## [1] "snow" "Rmpi" "methods" "stats" "graphics" "grDevices"
-## [7] "utils" "datasets" "base"
-##
-## [[3]]
-## [1] "snow" "Rmpi" "methods" "stats" "graphics" "grDevices"
-## [7] "utils" "datasets" "base"
-##
-## [[4]]
-## [1] "snow" "Rmpi" "methods" "stats" "graphics" "grDevices"
-## [7] "utils" "datasets" "base"
-##
-```
-
-
-
-```r
-clusterExport(cluster, ls()) # export everything
-out <- parSapply(cluster, 1:4, function(x) print(paste("snow hello from ",
- x)))
-print(out)
-```
-
-
-
-```
-## [1] "snow hello from 1" "snow hello from 2" "snow hello from 3"
-## [4] "snow hello from 4"
-```
-
-
-
-```r
-system.time(parMM(cluster, A, A))
-```
-
-
-
-```
-## user system elapsed
-## 0.741 0.310 1.051
-```
-
-
-
-```r
-stopCluster(cluster)
-```
-
-
-
-```
-## [1] 1
-```
-
-
-
-
-## SNOWFALL
-(default "SOCK" type, for multicore machines).
-
-
-
-```r
-library(snowfall)
-sfInit(parallel = TRUE, cpus = 4)
-```
-
-
-
-```
-## R Version: R version 2.15.0 (2012-03-30)
-##
-```
-
-
-
-```
-## snowfall 1.84 initialized (using snow 0.3-8): parallel execution on 4 CPUs.
-##
-```
-
-
-
-```r
-sfExportAll()
-sfLibrary(utils)
-```
-
-
-
-```
-## Library utils loaded.
-```
-
-
-
-```
-## Library utils loaded in cluster.
-##
-```
-
-
-
-```
-## Warning message: 'keep.source' is deprecated and will be ignored
-```
-
-
-
-```r
-out <- sfSapply(1:4, function(x) print(paste("snow hello from ",
- x)))
-print(out)
-```
-
-
-
-```
-## [1] "snow hello from 1" "snow hello from 2" "snow hello from 3"
-## [4] "snow hello from 4"
-```
-
-
-
-```r
-system.time(sfMM(A, A))
-```
-
-
-
-```
-## user system elapsed
-## 0.277 0.030 0.733
-```
-
-
-
-```r
-sfStop()
-```
-
-
-
-```
-##
-## Stopping cluster
-##
-```
-
-
-
-
-Snowfall using MPI mode, for distributing across nodes in a cluster (that use a shared hard disk but don't share memory).
-
-
-
-```r
-library(snowfall)
-sfInit(parallel = TRUE, cpus = 4, type = "MPI")
-sfExportAll()
-sfLibrary(utils)
-out <- sfSapply(1:4, function(x) print(paste("snow hello from ",
- x)))
-print(out)
-system.time(sfMM(A, A))
-sfStop()
-```
-
-
-
-For reasons unknown to me, this last command does not work on farm, though it works fine on NERSC cluster.
-
-snow's close command, which shuts down and quits from script.
-
-
-
-```r
-mpi.quit(save = "no")
-```
-
-
-

0 comments on commit 5284cef

Please sign in to comment.
Something went wrong with that request. Please try again.