From d39797a04af8f41aeb3afbe1ba93f5804ff51765 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 19:59:21 -0500 Subject: [PATCH 1/8] first pass at aws scripts --- README.md | 53 +++++++++++++++++++++++++++++++--------- Vagrantfile | 19 +++++++++++++- coursework/lessonplan.md | 52 +++++++++++++++++++-------------------- scripts/warcbase.sh | 24 +++++++++--------- 4 files changed, 97 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index ce6e96c..ce604e8 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ To install this virtual machine, you have two options. [You can download it from this link and "import the appliance" using VirtualBox](http://alpha.library.yorku.ca/releases/warcbase_workshop/Warcbase_workshop_VM.ova). Note that this is a 6.4GB download. If you do this, [skip to "Spark Notebook" below](https://github.com/web-archive-group/warcbase_workshop_vagrant#spark-notebook). -Or you can use vagrant to build it yourself. +Or you can use vagrant to build it yourself, or provision it using `aws`. ## Use @@ -38,6 +38,35 @@ From a working directory, please run the following commands. Once you run these three commands, you will have a running virtual machine with the latest version of warcbase installed. +## Cloud Deployment + +You can also deploy this as an AWS machine. To do so, install [vagrant-aws](https://github.com/mitchellh/vagrant-aws). + +`vagrant plugin install vagrant-aws` + +And then modify the `VagrantFile` to point to your AWS information. The following block will need to be changed: + +``` + config.vm.provider :aws do |aws, override| + aws.access_key_id = "KEYHERE" + aws.secret_access_key = "SECRETKEYHERE" + aws.region = "us-west-2" + + aws.region_config "us-west-2" do |region| + region.ami = "ami-01f05461" + region.keypair_name = "KEYPAIRNAME" + end + + override.ssh.username = "ubuntu" + override.ssh.private_key_path = "PATHTOPRIVATEKEY" +``` + +You can then load it by typing: + +`vagrant up --provider aws` + +Note, you will need to change your AWS Security Group to allow for incoming connections on port 22 (SSH) and 9000 (for Spark Notebook). + ## Connect Now you need to connect to the machine. This will be done through your command line, but also through your browser through Spark Notebook. @@ -47,14 +76,14 @@ We use three commands to connect to this virtual machine. `ssh` to connect to it To get started, type `vagrant ssh` in the directory where you installed the VM. When prompted: - - username: `vagrant` - - password: `vagrant` + - username: `ubuntu` + - password: `ubuntu` Here are some other example commands: -* `ssh -p 2222 vagrant@localhost` - will connect to the machine using `ssh`; -* `scp -P 2222 somefile.txt vagrant@localhost:/destination/path` - will copy `somefile.txt` to your vagrant machine. - - You'll need to specify the destination. For example, `scp -P 2222 WARC.warc.gz vagrant@localhost:/home/vagrant` will copy WARC.warc.gz to the home directory of the vagrant machine. -* `rsync --rsh='ssh -p2222' -av somedir vagrant@localhost:/home/vagrant` - will sync `somedir` to your home directory of the vagrant machine. +* `ssh -p 2222 ubuntu@localhost` - will connect to the machine using `ssh`; +* `scp -P 2222 somefile.txt ubuntu@localhost:/destination/path` - will copy `somefile.txt` to your vagrant machine. + - You'll need to specify the destination. For example, `scp -P 2222 WARC.warc.gz ubuntu@localhost:/home/ubuntu` will copy WARC.warc.gz to the home directory of the vagrant machine. 
+* `rsync --rsh='ssh -p2222' -av somedir ubuntu@localhost:/home/ubuntu` - will sync `somedir` to your home directory of the vagrant machine. ## Environment @@ -72,7 +101,7 @@ Here are some other example commands: To run spark notebook, type the following: * `vagrant ssh` (if on vagrant; if you downloaded the ova file and are running with VirtualBox you do not need to do this) -* `cd project/spark-notebook-0.6.2-SNAPSHOT-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2/bin` +* `cd /home/ubuntu/project/spark-notebook-0.6.3-scala-2.11.7-spark-1.6.2-hadoop-2.7.2/bin` * `./spark-notebook -Dhttp.port=9000 -J-Xms1024m` * Visit http://127.0.0.1:9000/ in your web browser. @@ -84,11 +113,11 @@ To run spark shell: * `vagrant ssh` (if you did not run that in the previous step) * `cd project/spark-1.5.1-bin-hadoop2.6/bin` -* `./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar` +* `./spark-shell --jars /home/ubuntu/project/warcbase/warcbase-core/target/warcbase-core-0.1.0-SNAPSHOT-fatjar.jar` Example: ```bash -vagrant@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +ubuntu@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/ubuntu/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Welcome to ____ __ @@ -116,7 +145,7 @@ scala> :paste import org.warcbase.spark.matchbox._ import org.warcbase.spark.rdd.RecordRDD._ -val r = RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +val r = RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .map(r => ExtractDomain(r.getUrl)) .countItems() @@ -134,7 +163,7 @@ To quit Spark Shell, you can exit using Ctrl+C. ## Resources -This build also includes the [warcbase resources](https://github.com/lintool/warcbase-resources) repository, which contains NER libraries as well as sample data from the University of Toronto (located in `/home/vagrant/project/warcbase-resources/Sample-Data/`). +This build also includes the [warcbase resources](https://github.com/lintool/warcbase-resources) repository, which contains NER libraries as well as sample data from the University of Toronto (located in `/home/ubuntu/project/warcbase-resources/Sample-Data/`). The ARC and WARC file are drawn from the [Canadian Political Parties & Political Interest Groups Archive-It Collection](https://archive-it.org/collections/227), collected by the University of Toronto. We are grateful that they've provided this material to us. diff --git a/Vagrantfile b/Vagrantfile index b5085db..f2e1233 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -14,10 +14,27 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.hostname = "warcbase" # Every Vagrant virtual environment requires a box to build off of. 
- config.vm.box = "ubuntu/trusty64" + config.vm.box = "dummy" config.vm.network :forwarded_port, guest: 9000, host: 9000 # Spark Notebook + config.vm.provider :aws do |aws, override| + aws.access_key_id = "KEY" + aws.secret_access_key = "SECRETKEY" + #aws.security_groups = "sg-eaf78b93" + + #aws.session_token = "" + aws.region = "us-west-2" + + aws.region_config "us-west-2" do |region| + region.ami = "ami-01f05461" + region.keypair_name = "KEYPAIRNAME" + end + + override.ssh.username = "ubuntu" + override.ssh.private_key_path = "/PATH/TO/PRIVATE/key.pem" + end + config.vm.provider "virtualbox" do |vb| vb.customize ["modifyvm", :id, "--memory", '2056'] vb.customize ["modifyvm", :id, "--cpus", "2"] diff --git a/coursework/lessonplan.md b/coursework/lessonplan.md index 225bb11..afe63ef 100644 --- a/coursework/lessonplan.md +++ b/coursework/lessonplan.md @@ -44,8 +44,8 @@ Go to your File menu, select "Import Appliance," and select the OVA file. Press Then press "Start." If you're lucky, the terminal window will appear. If you're asked for a username or password, it is: - - username: `vagrant` - - password: `vagrant` + - username: `ubuntu` + - password: `ubuntu` ### Option Two: Vagrant @@ -78,33 +78,33 @@ When prompted: - password: `vagrant` Here are some other example commands: -* `ssh -p 2222 vagrant@localhost` - will connect to the machine using `ssh`; -* `scp -P 2222 somefile.txt vagrant@localhost:/destination/path` - will copy `somefile.txt` to your vagrant machine. - - You'll need to specify the destination. For example, `scp -P 2222 WARC.warc.gz vagrant@localhost:/home/vagrant` will copy WARC.warc.gz to the home directory of the vagrant machine. -* `rsync --rsh='ssh -p2222' -av somedir vagrant@localhost:/home/vagrant` - will sync `somedir` to your home directory of the vagrant machine. +* `ssh -p 2222 ubuntu@localhost` - will connect to the machine using `ssh`; +* `scp -P 2222 somefile.txt ubuntu@localhost:/destination/path` - will copy `somefile.txt` to your vagrant machine. + - You'll need to specify the destination. For example, `scp -P 2222 WARC.warc.gz ubuntu@localhost:/home/ubuntu` will copy WARC.warc.gz to the home directory of the vagrant machine. +* `rsync --rsh='ssh -p2222' -av somedir ubuntu@localhost:/home/ubuntu` - will sync `somedir` to your home directory of the vagrant machine. ## Testing Let's make sure we can get spark notebook running. On vagrant, connect using `vagrant ssh`. -If you used VirtualBox, you have two options. On OS X or Linux, you can minimize your window, open your terminal, and connect to it using: `ssh -p 2222 vagrant@localhost`. +If you used VirtualBox, you have two options. On OS X or Linux, you can minimize your window, open your terminal, and connect to it using: `ssh -p 2222 ubuntu@localhost`. On Windows, you'll have to use your VirtualBox terminal. 
Either way, you should be at a prompt that looks like: ``` -vagrant@warcbase:~$ +ubuntu@warcbase:~$ ``` ### Testing Spark Shell * `cd project/spark-1.5.1-bin-hadoop2.6/bin` -* `./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar` +* `./spark-shell --jars /home/ubuntu/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar` Example: ```bash -vagrant@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +ubuntu@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/ubuntu/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Welcome to ____ __ @@ -158,7 +158,7 @@ Let's start a new notebook. Click the "new" button in the upper right, and then First, you need to load the warcbase jar. Paste this into the first command and press the play button. ```bash -:cp /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +:cp /home/ubuntu/project/warcbase/warcbase-core/target/warcbase-core-0.1.0-SNAPSHOT-fatjar.jar ``` Second, you need to import the classes. @@ -172,7 +172,7 @@ Third, let's run a test script. The following will load one of the ARC files fro ```scala val r = - RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", + RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .map(r => ExtractDomain(r.getUrl)) @@ -192,7 +192,7 @@ Let's give it a try by adapting some of the scripts that we might run in the She ```scala val r = - RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", + RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .map(r => { @@ -209,7 +209,7 @@ Again, change a variable. Right now, we see 100 characters of each webpage. Let' Sometimes it can get boring typing out the same thing over and over again. We can set variables to make our life easier, such as: ```scala -val warc="/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz" +val warc="/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz" ``` Now instead of typing the path, we can just use `warc`. Try running that cell and replacing it in the script above. 
For the lazy, it looks like: @@ -260,13 +260,13 @@ For example, to grab the plain text from the collection and **save it to a file* import org.warcbase.spark.rdd.RecordRDD._ import org.warcbase.spark.matchbox.{RemoveHTML, RecordLoader} -RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .map(r => (r.getCrawldate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))) - .saveAsTextFile("/home/vagrant/WARC-plain-text") + .saveAsTextFile("/home/ubuntu/WARC-plain-text") ``` -You should now have a directory in `/home/vagrant/` with the plain text. I will show you it. +You should now have a directory in `/home/ubuntu/` with the plain text. I will show you it. ##### Text by Domain @@ -276,11 +276,11 @@ Above, we saw that there were 34 pages belonging to `davidsuzuki.org`. Imagine w import org.warcbase.spark.matchbox.{RemoveHTML, RecordLoader} import org.warcbase.spark.rdd.RecordRDD._ -RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .keepDomains(Set("www.davidsuzuki.org")) .map(r => (r.getCrawldate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))) - .saveAsTextFile("/home/vagrant/WARC-plain-text-David-Suzuki") + .saveAsTextFile("/home/ubuntu/WARC-plain-text-David-Suzuki") ``` It should work as well. Note that your command `keepDomains(Set("www.davidsuzuki.org"))` needs to match the string you found above. @@ -300,7 +300,7 @@ import org.warcbase.spark.matchbox._ import org.warcbase.spark.rdd.RecordRDD._ import StringUtils._ -val links = RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +val links = RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)) .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW())) @@ -308,7 +308,7 @@ val links = RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/ .countItems() .filter(r => r._2 > 5) -links.saveAsTextFile("/home/vagrant/WARC-links-all/") +links.saveAsTextFile("/home/ubuntu/WARC-links-all/") ``` By now this should be seeming pretty straightforward. In your other window, visit the resulting file (the `part-00000` file in your `WARC-links-all` direcrory) and type: @@ -340,7 +340,7 @@ You may want to do work with images. 
The following script finds all the image UR import org.warcbase.spark.matchbox._ import org.warcbase.spark.rdd.RecordRDD._ -val links = RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +val links = RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)) .countItems() @@ -364,13 +364,13 @@ We won't have much time for Spark Shell today, but we wanted to briefly show it. To run, navigate to the spark-shell directory by ```bash -cd /home/vagrant/project/spark-1.5.1-bin-hadoop2.6/bin +cd /home/ubuntu/project/spark-1.5.1-bin-hadoop2.6/bin ``` Then run with: ```bash -./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +./spark-shell --jars /home/ubuntu/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar ``` >On your own system, you might want to pass different variables to allocate more memory and the such (i.e. on our server, we often use `/home/i2millig/spark-1.5.1/bin/spark-shell --driver-memory 60G --jars ~/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar` to give it 60GB of memory; or on the cluster, we use `spark-shell --jars ~/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar --num-executors 75 --executor-cores 5 --executor-memory 20G --driver-memory 26G`). @@ -386,7 +386,7 @@ Then you can paste the following script. When it's looking right, press `Ctrl` a ```scala import org.warcbase.spark.matchbox._ import org.warcbase.spark.rdd.RecordRDD._ -val r = RecordLoader.loadArchives("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) +val r = RecordLoader.loadArchives("/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) .keepValidPages() .map(r => ExtractDomain(r.getUrl)) .countItems() @@ -405,7 +405,7 @@ Let's try setting it up on your own servers, or in a real production environment # Acknowledgements and Final Notes -This build also includes the [warcbase resources](https://github.com/lintool/warcbase-resources) repository, which contains NER libraries as well as sample data from the University of Toronto (located in `/home/vagrant/project/warcbase-resources/Sample-Data/`). +This build also includes the [warcbase resources](https://github.com/lintool/warcbase-resources) repository, which contains NER libraries as well as sample data from the University of Toronto (located in `/home/ubuntu/project/warcbase-resources/Sample-Data/`). The ARC and WARC file are drawn from the [Canadian Political Parties & Political Interest Groups Archive-It Collection](https://archive-it.org/collections/227), collected by the University of Toronto. We are grateful that they've provided this material to us. 
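As a possible companion exercise to the lesson-plan scripts above — a sketch only, assuming the same matchbox calls already shown (`keepValidPages`, `keepDomains`, `ExtractLinks`, `ExtractDomain`, `countItems`) and the sample ARC file shipped in `warcbase-resources`; the output path is purely illustrative — the domain filter can be combined with link extraction to pull the outbound link graph for a single site:

```scala
import org.warcbase.spark.matchbox._
import org.warcbase.spark.rdd.RecordRDD._
import StringUtils._

// Sample ARC file from warcbase-resources, as used throughout the lesson plan.
val arc = "/home/ubuntu/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz"

// Outbound links from pages on www.davidsuzuki.org, aggregated by (source, target) domain pair.
val suzukiLinks = RecordLoader.loadArchives(arc, sc)
  .keepValidPages()
  .keepDomains(Set("www.davidsuzuki.org"))
  .flatMap(r => ExtractLinks(r.getUrl, r.getContentString))
  .map(r => (ExtractDomain(r._1).removePrefixWWW(), ExtractDomain(r._2).removePrefixWWW()))
  .countItems()
  .filter(r => r._2 > 5)

// Hypothetical output directory; as with the other save-to-file examples,
// results land as part-* files inside it.
suzukiLinks.saveAsTextFile("/home/ubuntu/WARC-links-david-suzuki/")
```

As with the earlier link-structure script, the results can be inspected in the `part-00000` file of the output directory.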
diff --git a/scripts/warcbase.sh b/scripts/warcbase.sh index 9f969f0..20b80cd 100644 --- a/scripts/warcbase.sh +++ b/scripts/warcbase.sh @@ -1,20 +1,20 @@ #/bin/bash # warcbase -cd /home/vagrant +cd /home/ubuntu mkdir project # Apache Spark -cd /home/vagrant/project +cd /home/ubuntu/project wget http://d3kbcqa49mib13.cloudfront.net/spark-1.5.1-bin-hadoop2.6.tgz tar -xvf spark-1.5.1-bin-hadoop2.6.tgz rm spark-1.5.1-bin-hadoop2.6.tgz # Spark Notebook -cd /home/vagrant/project -wget https://s3.eu-central-1.amazonaws.com/spark-notebook/tgz/spark-notebook-master-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2.tgz -tar -xvf spark-notebook-master-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2.tgz -rm spark-notebook-master-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2.tgz +cd /home/ubuntu/project +wget https://s3.eu-central-1.amazonaws.com/spark-notebook/tgz/spark-notebook-0.6.3-scala-2.11.7-spark-1.6.2-hadoop-2.7.2.tgz +tar -xvf spark-notebook-0.6.3-scala-2.11.7-spark-1.6.2-hadoop-2.7.2.tgz +rm spark-notebook-0.6.3-scala-2.11.7-spark-1.6.2-hadoop-2.7.2.tgz # warcbase dependencies (vagrant isn't playing nice with maven, or I don't have paths setup right) cd /tmp @@ -34,9 +34,9 @@ wget http://central.maven.org/maven2/org/apache/commons/commons-compress/1.9/com wget http://central.maven.org/maven2/org/apache/commons/commons-compress/1.9/commons-compress-1.9.pom # warcbase -cd /home/vagrant/project +cd /home/ubuntu/project git clone http://github.com/lintool/warcbase.git -cd /home/vagrant/project/warcbase +cd /home/ubuntu/project/warcbase mvn install:install-file -Dfile=/usr/share/java/bsh-2.0b4.jar -DpomFile=/usr/share/maven-repo/org/beanshell/bsh/2.0b4/bsh-2.0b4.pom mvn install:install-file -Dfile=/usr/share/java/commons-cli-1.2.jar -DpomFile=/usr/share/maven-repo/commons-cli/commons-cli/1.2/commons-cli-1.2.pom mvn install:install-file -Dfile=/tmp/commons-logging-api-1.1.jar -DpomFile=/tmp/commons-logging-api-1.1.pom @@ -46,12 +46,12 @@ mvn install:install-file -Dfile=/tmp/commons-lang-2.6.jar -DpomFile=/tmp/commons mvn install:install-file -Dfile=/tmp/commons-collections-3.2.1.jar -DpomFile=/tmp/commons-collections-3.2.1.pom mvn install:install-file -Dfile=/tmp/hamcrest-core-1.3.jar -DpomFile=/tmp/hamcrest-core-1.3.pom mvn install:install-file -Dfile=/tmp/commons-compress-1.9.jar -DpomFile=/tmp/commons-compress-1.9.pom -mvn clean package appassembler:assemble -DskipTests +mvn clean package -pl warcbase-core -DskipTests # sample files -cd /home/vagrant/project +cd /home/ubuntu/project git clone https://github.com/lintool/warcbase-resources.git # make sure permissions are fine -cd /home/vagrant -chown -hR vagrant:vagrant * +cd /home/ubuntu +chown -hR ubuntu:ubuntu * From ac5c7069404748c3fbc1470f35d910b7b6c6e260 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 20:07:48 -0500 Subject: [PATCH 2/8] tweaking aws instance to be loaded --- Vagrantfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Vagrantfile b/Vagrantfile index f2e1233..78d02c2 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -28,6 +28,8 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| aws.region_config "us-west-2" do |region| region.ami = "ami-01f05461" + # by default, spins up lightweight m3.medium. If want powerful, uncomment below. 
+ # region.instance_type = "c3.4xlarge" region.keypair_name = "KEYPAIRNAME" end From a01133332ebf9ee0b4c1547b85b0b0c9ff41e4c4 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 20:11:06 -0500 Subject: [PATCH 3/8] updating readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ce604e8..2ec8ad0 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,9 @@ And then modify the `VagrantFile` to point to your AWS information. The followin aws.region_config "us-west-2" do |region| region.ami = "ami-01f05461" + # by default, spins up lightweight m3.medium. If want powerful, uncomment below. + # region.instance_type = "c3.4xlarge" + region.keypair_name = "KEYPAIRNAME" end @@ -65,7 +68,7 @@ You can then load it by typing: `vagrant up --provider aws` -Note, you will need to change your AWS Security Group to allow for incoming connections on port 22 (SSH) and 9000 (for Spark Notebook). +Note, you will need to change your AWS Security Group to allow for incoming connections on port 22 (SSH) and 9000 (for Spark Notebook). By default, it launches a lightweight m3.medium. To do real work, you will need a larger (and sadly more expensive instance). ## Connect From 7a37201831c044fa47321345d2942078f3a74cc2 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 20:13:35 -0500 Subject: [PATCH 4/8] tiny tweak on example Spark Shell --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ec8ad0..d2ad168 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ To run spark shell: Example: ```bash -ubuntu@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/ubuntu/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +ubuntu@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/ubuntu/project/warcbase/warcbase-core/target/warcbase-core-0.1.0-SNAPSHOT-fatjar.jar WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Welcome to ____ __ From 2d09fc13f3c72d56b60dac13372538b26ced8908 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 20:14:50 -0500 Subject: [PATCH 5/8] final commit of evening; am tired so making too many little onesGoogle --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d2ad168..8ec5776 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,9 @@ To run spark notebook, type the following: * `vagrant ssh` (if on vagrant; if you downloaded the ova file and are running with VirtualBox you do not need to do this) * `cd /home/ubuntu/project/spark-notebook-0.6.3-scala-2.11.7-spark-1.6.2-hadoop-2.7.2/bin` * `./spark-notebook -Dhttp.port=9000 -J-Xms1024m` -* Visit http://127.0.0.1:9000/ in your web browser. +* Visit http://127.0.0.1:9000/ in your web browser. + +If you are connecting via AWS, visit the IP address of your instance (found on EC2 dashboard), port 9000 (i.e. `35.162.32.51:9000`). 
![Spark Notebook](https://cloud.githubusercontent.com/assets/218561/14062458/f8c6a842-f375-11e5-991b-c5d6a80c6f1a.png) From 379d9678b31c4550cd0a0a14e591e72d24833851 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Mon, 14 Nov 2016 20:28:54 -0500 Subject: [PATCH 6/8] shoutout to docnow team for inspiration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ec5776..30727d5 100644 --- a/README.md +++ b/README.md @@ -185,4 +185,4 @@ You can find more information about this collection at [WebArchives.ca](http://w ## Acknowlegements -This research has been supported by the Social Sciences and Humanities Research Council with Insight Grant 435-2015-0011. Additional funding for student labour on this project comes from an Ontario Ministry of Research and Innovation Early Researcher Award. \ No newline at end of file +This research has been supported by the Social Sciences and Humanities Research Council with Insight Grant 435-2015-0011. Additional funding for student labour on this project comes from an Ontario Ministry of Research and Innovation Early Researcher Award. The idea for the AWS deployment came from the DocNow team and their [repository here](https://github.com/web-archive-group/warcbase_workshop_vagrant/tree/aws). \ No newline at end of file From d319124dce627335da97c27d804d880cdf059358 Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Tue, 15 Nov 2016 08:58:54 -0500 Subject: [PATCH 7/8] changed vm box provider to aws-supported one --- Vagrantfile | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Vagrantfile b/Vagrantfile index 78d02c2..76338e5 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -14,7 +14,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.hostname = "warcbase" # Every Vagrant virtual environment requires a box to build off of. - config.vm.box = "dummy" + config.vm.box = "lattice/ubuntu-trusty-64" config.vm.network :forwarded_port, guest: 9000, host: 9000 # Spark Notebook @@ -33,6 +33,20 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| region.keypair_name = "KEYPAIRNAME" end + # This should work fine out of the box if environment variables are declared + config.vm.provider :digital_ocean do |provider, override| + provider.ssh_key_name = ENV['DIGITALOCEAN_KEYNAME'] + override.ssh.private_key_path = ENV['DIGITALOCEAN_KEYPATH'] + override.ssh.username = "vagrant" + override.vm.box = 'digital_ocean' + override.vm.box_url = "https://github.com/smdahlen/vagrant-digitalocean/raw/master/box/digital_ocean.box" + provider.token = ENV['DIGITALOCEAN_TOKEN'] + provider.image = 'ubuntu-14-04-x64' + provider.region = 'tor1' + provider.size = '4gb' + override.vm.network :forwarded_port, guest: 80, host: 80 + end + override.ssh.username = "ubuntu" override.ssh.private_key_path = "/PATH/TO/PRIVATE/key.pem" end From dfb693de84929eccca6c16d275ce2a26abde9c5b Mon Sep 17 00:00:00 2001 From: ianmilligan1 Date: Tue, 15 Nov 2016 10:40:44 -0500 Subject: [PATCH 8/8] tweaking box provision --- Vagrantfile | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/Vagrantfile b/Vagrantfile index 76338e5..491391f 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -11,26 +11,27 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.provider "virtualbox" do |v| v.name = "Warcbase workshop VM" end - config.vm.hostname = "warcbase" + config.vm.hostname = "warcbase" + # Every Vagrant virtual environment requires a box to build off of. 
- config.vm.box = "lattice/ubuntu-trusty-64" + config.vm.box = "ubuntu/xenial64" config.vm.network :forwarded_port, guest: 9000, host: 9000 # Spark Notebook config.vm.provider :aws do |aws, override| - aws.access_key_id = "KEY" - aws.secret_access_key = "SECRETKEY" - #aws.security_groups = "sg-eaf78b93" - - #aws.session_token = "" - aws.region = "us-west-2" - - aws.region_config "us-west-2" do |region| + aws.access_key_id = "KEY" + aws.secret_access_key = "SECRETKEY" + override.vm.box = "lattice/ubuntu-trusty-64" + override.ssh.username = "ubuntu" + override.ssh.private_key_path = "/PATH/TO/KEY" + aws.region = "us-west-2" + aws.region_config "us-west-2" do |region| region.ami = "ami-01f05461" # by default, spins up lightweight m3.medium. If want powerful, uncomment below. # region.instance_type = "c3.4xlarge" region.keypair_name = "KEYPAIRNAME" + end end # This should work fine out of the box if environment variables are declared @@ -47,10 +48,6 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| override.vm.network :forwarded_port, guest: 80, host: 80 end - override.ssh.username = "ubuntu" - override.ssh.private_key_path = "/PATH/TO/PRIVATE/key.pem" - end - config.vm.provider "virtualbox" do |vb| vb.customize ["modifyvm", :id, "--memory", '2056'] vb.customize ["modifyvm", :id, "--cpus", "2"]