From a0934c8de384e0a5fbeb0cb3b7844e171e8044d8 Mon Sep 17 00:00:00 2001 From: nruest Date: Sat, 19 Mar 2016 23:15:48 -0400 Subject: [PATCH] Resolve #1; warcbase builds, update README. --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++++ Vagrantfile | 1 - scripts/post.sh | 4 ---- scripts/warcbase.sh | 30 ++++++++++++++++++++++++- 4 files changed, 83 insertions(+), 6 deletions(-) delete mode 100644 scripts/post.sh diff --git a/README.md b/README.md index ff780a6..647cf8f 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,60 @@ ssh, scp, rsync: ## Environment - Ubuntu 14.04 +- warcbase HEAD +- +- + +## Spark Shell + +To run spark shell: + +* `cd project/spark-1.5.1-bin-hadoop2.6/bin` +* `./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar` + +Example: +```bash +vagrant@warcbase:~/project/spark-1.5.1-bin-hadoop2.6/bin$ ./spark-shell --jars /home/vagrant/project/warcbase/target/warcbase-0.1.0-SNAPSHOT-fatjar.jar +WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version 1.5.1 + /_/ + +Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_74) +Type in expressions to have them evaluated. +Type :help for more information. +WARN Utils - Your hostname, warcbase resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface eth0) +WARN Utils - Set SPARK_LOCAL_IP if you need to bind to another address +WARN MetricsSystem - Using default name DAGScheduler for source because spark.app.id is not set. +Spark context available as sc. +WARN ObjectStore - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0 +WARN ObjectStore - Failed to get database default, returning NoSuchObjectException +WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +WARN ObjectStore - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0 +WARN ObjectStore - Failed to get database default, returning NoSuchObjectException +SQL context available as sqlContext. + +scala> :paste +// Entering paste mode (ctrl-D to finish) + +import org.warcbase.spark.matchbox._ +import org.warcbase.spark.rdd.RecordRDD._ +val r = RecordLoader.loadArc("/home/vagrant/project/warcbase-resources/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz", sc) + .keepValidPages() + .map(r => ExtractTopLevelDomain(r.getUrl)) + .countItems() + .take(10) + +// Exiting paste mode, now interpreting. + +ERROR ArcRecordUtils - Read 1235 bytes but expected 1311 bytes. Continuing... +import org.warcbase.spark.matchbox._ +import org.warcbase.spark.rdd.RecordRDD._ +r: Array[(String, Int)] = Array((communist-party.ca,39), (www.gca.ca,39), (greenparty.ca,39), (www.davidsuzuki.org,34), (westernblockparty.com,26), (www.nosharia.com,24), (partimarijuana.org,22), (www.ccsd.ca,22), (canadianactionparty.ca,22), (www.nawl.ca,19)) +``` ## Authors diff --git a/Vagrantfile b/Vagrantfile index 8faafe3..25656f5 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -26,6 +26,5 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.provision :shell, inline: "sudo sed -i '/tty/!s/mesg n/tty -s \\&\\& mesg n/' /root/.profile", :privileged =>false config.vm.provision :shell, path: "./scripts/bootstrap.sh" config.vm.provision :shell, path: "./scripts/warcbase.sh", :privileged =>false - config.vm.provision :shell, path: "./scripts/post.sh", :privileged =>true end diff --git a/scripts/post.sh b/scripts/post.sh deleted file mode 100644 index 4d1004a..0000000 --- a/scripts/post.sh +++ /dev/null @@ -1,4 +0,0 @@ -#/bin/bash - -cd /home/vagrant/project/warcbase -sudo mvn clean package appassembler:assemble -DskipTests diff --git a/scripts/warcbase.sh b/scripts/warcbase.sh index 5a3e692..9f969f0 100644 --- a/scripts/warcbase.sh +++ b/scripts/warcbase.sh @@ -16,14 +16,42 @@ wget https://s3.eu-central-1.amazonaws.com/spark-notebook/tgz/spark-notebook-mas tar -xvf spark-notebook-master-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2.tgz rm spark-notebook-master-scala-2.10.4-spark-1.5.1-hadoop-2.6.0-cdh5.4.2.tgz +# warcbase dependencies (vagrant isn't playing nice with maven, or I don't have paths setup right) +cd /tmp +wget http://central.maven.org/maven2/commons-logging/commons-logging-api/1.1/commons-logging-api-1.1.pom +wget http://central.maven.org/maven2/commons-logging/commons-logging-api/1.1/commons-logging-api-1.1.jar +wget http://central.maven.org/maven2/com/google/code/findbugs/jsr305/1.3.9/jsr305-1.3.9.jar +wget http://central.maven.org/maven2/com/google/code/findbugs/jsr305/1.3.9/jsr305-1.3.9.pom +wget http://central.maven.org/maven2/oro/oro/2.0.8/oro-2.0.8.jar +wget http://central.maven.org/maven2/oro/oro/2.0.8/oro-2.0.8.pom +wget http://central.maven.org/maven2/commons-lang/commons-lang/2.6/commons-lang-2.6.jar +wget http://central.maven.org/maven2/commons-lang/commons-lang/2.6/commons-lang-2.6.pom +wget http://central.maven.org/maven2/commons-collections/commons-collections/3.2.1/commons-collections-3.2.1.jar +wget http://central.maven.org/maven2/commons-collections/commons-collections/3.2.1/commons-collections-3.2.1.pom +wget http://central.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar +wget http://central.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.pom +wget http://central.maven.org/maven2/org/apache/commons/commons-compress/1.9/commons-compress-1.9.jar +wget http://central.maven.org/maven2/org/apache/commons/commons-compress/1.9/commons-compress-1.9.pom + # warcbase cd /home/vagrant/project git clone http://github.com/lintool/warcbase.git -#su vagrant -l -c 'cd /home/vagrant/project/warcbase && sudo mvn clean package appassembler:assemble -DskipTests' cd /home/vagrant/project/warcbase +mvn install:install-file -Dfile=/usr/share/java/bsh-2.0b4.jar -DpomFile=/usr/share/maven-repo/org/beanshell/bsh/2.0b4/bsh-2.0b4.pom +mvn install:install-file -Dfile=/usr/share/java/commons-cli-1.2.jar -DpomFile=/usr/share/maven-repo/commons-cli/commons-cli/1.2/commons-cli-1.2.pom +mvn install:install-file -Dfile=/tmp/commons-logging-api-1.1.jar -DpomFile=/tmp/commons-logging-api-1.1.pom +mvn install:install-file -Dfile=/tmp/jsr305-1.3.9.jar -DpomFile=/tmp/jsr305-1.3.9.pom +mvn install:install-file -Dfile=/tmp/oro-2.0.8.jar -DpomFile=/tmp/oro-2.0.8.pom +mvn install:install-file -Dfile=/tmp/commons-lang-2.6.jar -DpomFile=/tmp/commons-lang-2.6.pom +mvn install:install-file -Dfile=/tmp/commons-collections-3.2.1.jar -DpomFile=/tmp/commons-collections-3.2.1.pom +mvn install:install-file -Dfile=/tmp/hamcrest-core-1.3.jar -DpomFile=/tmp/hamcrest-core-1.3.pom +mvn install:install-file -Dfile=/tmp/commons-compress-1.9.jar -DpomFile=/tmp/commons-compress-1.9.pom +mvn clean package appassembler:assemble -DskipTests +# sample files cd /home/vagrant/project git clone https://github.com/lintool/warcbase-resources.git +# make sure permissions are fine cd /home/vagrant chown -hR vagrant:vagrant *