From 6045e41984686a543e42ba18ce0018844839953d Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 21 May 2019 19:15:24 -0500 Subject: [PATCH] [ADAM-2145] Add Spark 2.4.3 and Scala 2.12 to Jenkins build * Add Spark 2.4.3 and Scala 2.12 to Jenkins build. * Fix for Jenkins failures. * remove move_to_spark_2.sh check * move back to scala 2.11 * move before git test * still not in correct place * Spark 2.4.3 needs special case for Scala 2.12 * export SPARK_HOME correctly * download hadoop and add to classpath * try adding explicit dependency on paranamer 2.8 * try specifying spark.executor.memory * try PYSPARK_SUBMIT_ARGS in Makefile * try again * fix conf syntax * move conf to jenkins-test * remove references to avro 1.7.x * install avro 1.8.x --- .../org/bdgenomics/adam/cli/FlagStat.scala | 6 +- adam-python/bdgenomics/adam/test/__init__.py | 5 +- pom.xml | 5 + scripts/jenkins-test | 334 +++++++++--------- 4 files changed, 185 insertions(+), 165 deletions(-) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 67ba2884d8..899aaa3693 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -36,11 +36,11 @@ object FlagStat extends BDGCommandCompanion { class FlagStatArgs extends Args4jBase { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM data to return stats for", index = 0) - val inputPath: String = null + var inputPath: String = null @Args4jOption(required = false, name = "-o", usage = "Optionally write the stats to this file.") - val outputPath: String = null + var outputPath: String = null @Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.") - val stringency: String = "SILENT" + var stringency: String = "SILENT" } class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagStatArgs] { diff --git a/adam-python/bdgenomics/adam/test/__init__.py b/adam-python/bdgenomics/adam/test/__init__.py index 96be45979d..ab0297186b 100644 --- a/adam-python/bdgenomics/adam/test/__init__.py +++ b/adam-python/bdgenomics/adam/test/__init__.py @@ -57,7 +57,10 @@ def checkFiles(self, file1, file2): def setUp(self): self._old_sys_path = list(sys.path) class_name = self.__class__.__name__ - self.ss = SparkSession.builder.master('local[4]').appName(class_name).getOrCreate() + self.ss = SparkSession.builder \ + .master('local[4]') \ + .appName(class_name) \ + .getOrCreate() self.sc = self.ss.sparkContext diff --git a/pom.xml b/pom.xml index 920ff578ce..25b5f29cac 100644 --- a/pom.xml +++ b/pom.xml @@ -588,6 +588,11 @@ adam-codegen-spark2_${scala.version.prefix} ${project.version} + + com.thoughtworks.paranamer + paranamer + 2.8 + diff --git a/scripts/jenkins-test b/scripts/jenkins-test index 1c49fe55aa..3dc56aeb10 100755 --- a/scripts/jenkins-test +++ b/scripts/jenkins-test @@ -39,29 +39,25 @@ fi set -e -if [ ${SPARK_VERSION} == 2.3.3 ]; +# build defaults to Scala 2.11 +if [ ${SCALAVER} == 2.11 ]; then - # shouldn't be able to move to spark 2 twice + # shouldn't be able to move to scala 2.11 twice set +e - ./scripts/move_to_spark_2.sh + ./scripts/move_to_scala_2.11.sh if [[ $? == 0 ]]; then - echo "We have already moved to Spark 2, so running move_to_spark_2.sh a second time should fail, but error code was 0 (success)." 
+ echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)." exit 1 fi set -e fi -if [ ${SCALAVER} == 2.11 ]; +# move to Scala 2.12 if requested +if [ ${SCALAVER} == 2.12 ]; then - # shouldn't be able to move to scala 2.11 twice set +e - ./scripts/move_to_scala_2.11.sh - if [[ $? == 0 ]]; - then - echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)." - exit 1 - fi + ./scripts/move_to_scala_2.12.sh set -e fi @@ -70,8 +66,7 @@ echo "Testing ADAM version ${VERSION} on Spark ${SPARK_VERSION} and Hadoop ${HAD # first, build the sources, run the unit tests, and generate a coverage report mvn clean \ - -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} + -Dhadoop.version=${HADOOP_VERSION} # if this is a pull request, we need to set the coveralls pr id if [[ ! -z $ghprbPullId ]]; @@ -103,7 +98,6 @@ mvn -U \ package \ -DskipTests \ -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} \ -DargLine=${ADAM_MVN_TMP_DIR} # make sure that the distribution package contains an assembly jar @@ -123,181 +117,199 @@ find . -name pom.xml \ {} \; find . -name "*.bak" -exec rm -f {} \; -if test -n "$(git status --porcelain)" -then - echo "Applying move_to_xyz script marred a pom.xml file." - echo "Exiting..." - exit 1 -fi - # run integration tests -# prebuilt spark distributions are scala 2.11 for spark 2.x -if [[ ( ${SPARK_VERSION} == 2.3.3 && ${SCALAVER} == 2.11 ) ]]; -then - # make a temp directory - ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX) +# make a temp directory +ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX) - # Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR - ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded - mkdir $ADAM_TMP_DIR +# Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR +ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded +mkdir $ADAM_TMP_DIR - # set the TMPDIR envar, which is used by python to choose where to make temp directories - export TMPDIR=${ADAM_TMP_DIR} +# set the TMPDIR envar, which is used by python to choose where to make temp directories +export TMPDIR=${ADAM_TMP_DIR} - pushd $PROJECT_ROOT +pushd $PROJECT_ROOT - # Copy the jar into our temp space for testing - cp -r . $ADAM_TMP_DIR - popd +# Copy the jar into our temp space for testing +cp -r . $ADAM_TMP_DIR +popd - pushd $ADAM_TMP_DIR +pushd $ADAM_TMP_DIR - # what hadoop version are we on? format string for downloading spark assembly - if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then - HADOOP=hadoop2.6 - elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then - HADOOP=hadoop2.7 - else - echo "Unknown Hadoop version." - exit 1 - fi +# what hadoop version are we on? format string for downloading spark assembly +if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then + HADOOP=hadoop2.6 +elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then + HADOOP=hadoop2.7 +else + echo "Unknown Hadoop version." 
+ exit 1 +fi - # set spark artifact string for downloading assembly - SPARK=spark-${SPARK_VERSION} +# set spark artifact string for downloading assembly +SPARK=spark-${SPARK_VERSION} - # download prepackaged spark assembly +# download prepackaged spark assembly + +# Spark 2.4.3 needs special case for Scala 2.12 +if [ ${SCALAVER} == 2.12 ]; +then + curl \ + -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-without-hadoop-scala-2.12.tgz" \ + -o ${SPARK}-bin-without-hadoop-scala-2.12.tgz + + tar xzvf ${SPARK}-bin-without-hadoop-scala-2.12.tgz + export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-without-hadoop-scala-2.12 + + curl \ + -L "http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz" \ + -o hadoop-2.7.7.tar.gz + + tar xzvf hadoop-2.7.7.tar.gz + + # remove references to avro 1.7.x + find hadoop-2.7.7 -name *.jar | grep avro | xargs rm + + # download avro 1.8.x + curl \ + -L "http://repo1.maven.org/maven2/org/apache/avro/avro/1.8.2/avro-1.8.2.jar" \ + -o hadoop-2.7.7/share/hadoop/common/avro-1.8.2.jar + + export SPARK_DIST_CLASSPATH=$(hadoop-2.7.7/bin/hadoop classpath) +else curl \ -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-${HADOOP}.tgz" \ -o ${SPARK}-bin-${HADOOP}.tgz tar xzvf ${SPARK}-bin-${HADOOP}.tgz export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-${HADOOP} - - # set the path to the adam submit script - ADAM=./bin/adam-submit +fi - # test running adam-shell - ./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala +# set the path to the adam submit script +ADAM=./bin/adam-submit + +# test running adam-shell +./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala - # add pyspark to the python path - PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)" - export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH} - - # put adam jar on the pyspark path - ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target" - ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)" - export PYSPARK_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell" - - # create a conda environment for python build, if necessary - pythons=( 2.7 3.6 ) - - for python in ${pythons[*]} - do - uuid=$(uuidgen) - conda create -y -q -n adam-build-${uuid} python=${python} - source activate adam-build-${uuid} - - # prepare adam python - pushd adam-python - make prepare - popd +# add pyspark to the python path +PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)" +export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH} + +# put adam jar on the pyspark path +ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target" +ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)" +export PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=4g --jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell" + +# create a conda environment for python build, if necessary +pythons=( 2.7 3.6 ) + +for python in ${pythons[*]} +do + uuid=$(uuidgen) + conda create -y -q -n adam-build-${uuid} python=${python} + source activate adam-build-${uuid} + + # prepare adam python + pushd adam-python + make prepare + popd - # we only support SparkR on Spark 2.x - if [ ${SPARK_VERSION} == 2.3.3 ]; 
- then + # make a directory to install SparkR into, and set the R user libs path + export R_LIBS_USER=${SPARK_HOME}/local_R_libs + mkdir -p ${R_LIBS_USER} + R CMD INSTALL \ + -l ${R_LIBS_USER} \ + ${SPARK_HOME}/R/lib/SparkR/ - # make a directory to install SparkR into, and set the R user libs path - export R_LIBS_USER=${SPARK_HOME}/local_R_libs - mkdir -p ${R_LIBS_USER} - R CMD INSTALL \ - -l ${R_LIBS_USER} \ - ${SPARK_HOME}/R/lib/SparkR/ + export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell" - export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell" - - # we can run the python build, now that we have a spark executable - # -DskipTests appears to skip the python/r tests? - mvn -U \ - -P python,r,distribution \ - package \ - -Dsuites=select.no.suites\* \ - -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} - - # make sure that the distribution package contains an egg - # if no assembly jar is found, this will exit with code 1 and fail the build - tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \ - grep bdgenomics.adam | \ - grep egg - fi + # we can run the python build, now that we have a spark executable + # -DskipTests appears to skip the python/r tests? + mvn -U \ + -P python,r,distribution \ + package \ + -Dsuites=select.no.suites\* \ + -Dhadoop.version=${HADOOP_VERSION} + + # make sure that the distribution package contains an egg + # if no assembly jar is found, this will exit with code 1 and fail the build + tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \ + grep bdgenomics.adam | \ + grep egg - # run pyadam test - ./bin/pyadam < scripts/jenkins-test-pyadam.py + # run pyadam test + ./bin/pyadam < scripts/jenkins-test-pyadam.py - # deactivate and remove the conda env - source deactivate - conda remove -y -n adam-build-${uuid} --all + # deactivate and remove the conda env + source deactivate + conda remove -y -n adam-build-${uuid} --all - # copy python targets back - cp -r adam-python/target ${PROJECT_ROOT}/adam-python/ + # copy python targets back + cp -r adam-python/target ${PROJECT_ROOT}/adam-python/ - # clean after each python version - pushd adam-python - make clean - make clean_sdist - popd + # clean after each python version + pushd adam-python + make clean + make clean_sdist + popd - done +done - # define filenames - BAM=mouse_chrM.bam - READS=${BAM}.reads.adam - SORTED_READS=${BAM}.reads.sorted.adam - FRAGMENTS=${BAM}.fragments.adam +# define filenames +BAM=mouse_chrM.bam +READS=${BAM}.reads.adam +SORTED_READS=${BAM}.reads.sorted.adam +FRAGMENTS=${BAM}.fragments.adam - # fetch our input dataset - echo "Fetching BAM file" - rm -rf ${BAM} - wget -q https://s3.amazonaws.com/bdgenomics-test/${BAM} - - # once fetched, convert BAM to ADAM - echo "Converting BAM to ADAM read format" - rm -rf ${READS} - ${ADAM} transformAlignments ${BAM} ${READS} - - # then, sort the BAM - echo "Converting BAM to ADAM read format with sorting" - rm -rf ${SORTED_READS} - ${ADAM} transformAlignments -sort_reads ${READS} ${SORTED_READS} - - # convert the reads to fragments to re-pair the reads - echo "Converting read file to fragments" - rm -rf ${FRAGMENTS} - ${ADAM} transformFragments -load_as_reads ${READS} ${FRAGMENTS} - - # test that printing works - echo "Printing reads and fragments" - ${ADAM} print ${READS} 1>/dev/null 2>/dev/null - ${ADAM} print ${FRAGMENTS} 1>/dev/null 2>/dev/null - - # 
# run flagstat to verify that flagstat runs OK
echo "Printing read statistics"
${ADAM} flagstat -print_metrics ${READS}
rm -rf ${ADAM_TMP_DIR}
popd

pushd ${PROJECT_ROOT}
# move back to Scala 2.11 as default
if [ ${SCALAVER} == 2.12 ];
then
    set +e
    ./scripts/move_to_scala_2.11.sh
    set -e
fi

# test that the source is formatted correctly
./scripts/format-source
if test -n "$(git status --porcelain)"
then
    echo "Please run './scripts/format-source'"
    exit 1
fi
popd

echo
echo "All the tests passed"