From 6045e41984686a543e42ba18ce0018844839953d Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 21 May 2019 19:15:24 -0500 Subject: [PATCH] [ADAM-2145] Add Spark 2.4.3 and Scala 2.12 to Jenkins build * Add Spark 2.4.3 and Scala 2.12 to Jenkins build. * Fix for Jenkins failures. * remove move_to_spark_2.sh check * move back to scala 2.11 * move before git test * still not in correct place * Spark 2.4.3 needs special case for Scala 2.12 * export SPARK_HOME correctly * download hadoop and add to classpath * try adding explicit dependency on paranamer 2.8 * try specifying spark.executor.memory * try PYSPARK_SUBMIT_ARGS in Makefile * try again * fix conf syntax * move conf to jenkins-test * remove references to avro 1.7.x * install avro 1.8.x --- .../org/bdgenomics/adam/cli/FlagStat.scala | 6 +- adam-python/bdgenomics/adam/test/__init__.py | 5 +- pom.xml | 5 + scripts/jenkins-test | 334 +++++++++--------- 4 files changed, 185 insertions(+), 165 deletions(-) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala index 67ba2884d8..899aaa3693 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala @@ -36,11 +36,11 @@ object FlagStat extends BDGCommandCompanion { class FlagStatArgs extends Args4jBase { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM data to return stats for", index = 0) - val inputPath: String = null + var inputPath: String = null @Args4jOption(required = false, name = "-o", usage = "Optionally write the stats to this file.") - val outputPath: String = null + var outputPath: String = null @Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.") - val stringency: String = "SILENT" + var stringency: String = "SILENT" } class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagStatArgs] { diff --git a/adam-python/bdgenomics/adam/test/__init__.py b/adam-python/bdgenomics/adam/test/__init__.py index 96be45979d..ab0297186b 100644 --- a/adam-python/bdgenomics/adam/test/__init__.py +++ b/adam-python/bdgenomics/adam/test/__init__.py @@ -57,7 +57,10 @@ def checkFiles(self, file1, file2): def setUp(self): self._old_sys_path = list(sys.path) class_name = self.__class__.__name__ - self.ss = SparkSession.builder.master('local[4]').appName(class_name).getOrCreate() + self.ss = SparkSession.builder \ + .master('local[4]') \ + .appName(class_name) \ + .getOrCreate() self.sc = self.ss.sparkContext diff --git a/pom.xml b/pom.xml index 920ff578ce..25b5f29cac 100644 --- a/pom.xml +++ b/pom.xml @@ -588,6 +588,11 @@ adam-codegen-spark2_${scala.version.prefix} ${project.version} + + com.thoughtworks.paranamer + paranamer + 2.8 + diff --git a/scripts/jenkins-test b/scripts/jenkins-test index 1c49fe55aa..3dc56aeb10 100755 --- a/scripts/jenkins-test +++ b/scripts/jenkins-test @@ -39,29 +39,25 @@ fi set -e -if [ ${SPARK_VERSION} == 2.3.3 ]; +# build defaults to Scala 2.11 +if [ ${SCALAVER} == 2.11 ]; then - # shouldn't be able to move to spark 2 twice + # shouldn't be able to move to scala 2.11 twice set +e - ./scripts/move_to_spark_2.sh + ./scripts/move_to_scala_2.11.sh if [[ $? == 0 ]]; then - echo "We have already moved to Spark 2, so running move_to_spark_2.sh a second time should fail, but error code was 0 (success)." 
+ echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)." exit 1 fi set -e fi -if [ ${SCALAVER} == 2.11 ]; +# move to Scala 2.12 if requested +if [ ${SCALAVER} == 2.12 ]; then - # shouldn't be able to move to scala 2.11 twice set +e - ./scripts/move_to_scala_2.11.sh - if [[ $? == 0 ]]; - then - echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)." - exit 1 - fi + ./scripts/move_to_scala_2.12.sh set -e fi @@ -70,8 +66,7 @@ echo "Testing ADAM version ${VERSION} on Spark ${SPARK_VERSION} and Hadoop ${HAD # first, build the sources, run the unit tests, and generate a coverage report mvn clean \ - -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} + -Dhadoop.version=${HADOOP_VERSION} # if this is a pull request, we need to set the coveralls pr id if [[ ! -z $ghprbPullId ]]; @@ -103,7 +98,6 @@ mvn -U \ package \ -DskipTests \ -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} \ -DargLine=${ADAM_MVN_TMP_DIR} # make sure that the distribution package contains an assembly jar @@ -123,181 +117,199 @@ find . -name pom.xml \ {} \; find . -name "*.bak" -exec rm -f {} \; -if test -n "$(git status --porcelain)" -then - echo "Applying move_to_xyz script marred a pom.xml file." - echo "Exiting..." - exit 1 -fi - # run integration tests -# prebuilt spark distributions are scala 2.11 for spark 2.x -if [[ ( ${SPARK_VERSION} == 2.3.3 && ${SCALAVER} == 2.11 ) ]]; -then - # make a temp directory - ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX) +# make a temp directory +ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX) - # Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR - ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded - mkdir $ADAM_TMP_DIR +# Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR +ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded +mkdir $ADAM_TMP_DIR - # set the TMPDIR envar, which is used by python to choose where to make temp directories - export TMPDIR=${ADAM_TMP_DIR} +# set the TMPDIR envar, which is used by python to choose where to make temp directories +export TMPDIR=${ADAM_TMP_DIR} - pushd $PROJECT_ROOT +pushd $PROJECT_ROOT - # Copy the jar into our temp space for testing - cp -r . $ADAM_TMP_DIR - popd +# Copy the jar into our temp space for testing +cp -r . $ADAM_TMP_DIR +popd - pushd $ADAM_TMP_DIR +pushd $ADAM_TMP_DIR - # what hadoop version are we on? format string for downloading spark assembly - if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then - HADOOP=hadoop2.6 - elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then - HADOOP=hadoop2.7 - else - echo "Unknown Hadoop version." - exit 1 - fi +# what hadoop version are we on? format string for downloading spark assembly +if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then + HADOOP=hadoop2.6 +elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then + HADOOP=hadoop2.7 +else + echo "Unknown Hadoop version." 
+ exit 1 +fi - # set spark artifact string for downloading assembly - SPARK=spark-${SPARK_VERSION} +# set spark artifact string for downloading assembly +SPARK=spark-${SPARK_VERSION} - # download prepackaged spark assembly +# download prepackaged spark assembly + +# Spark 2.4.3 needs special case for Scala 2.12 +if [ ${SCALAVER} == 2.12 ]; +then + curl \ + -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-without-hadoop-scala-2.12.tgz" \ + -o ${SPARK}-bin-without-hadoop-scala-2.12.tgz + + tar xzvf ${SPARK}-bin-without-hadoop-scala-2.12.tgz + export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-without-hadoop-scala-2.12 + + curl \ + -L "http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz" \ + -o hadoop-2.7.7.tar.gz + + tar xzvf hadoop-2.7.7.tar.gz + + # remove references to avro 1.7.x + find hadoop-2.7.7 -name *.jar | grep avro | xargs rm + + # download avro 1.8.x + curl \ + -L "http://repo1.maven.org/maven2/org/apache/avro/avro/1.8.2/avro-1.8.2.jar" \ + -o hadoop-2.7.7/share/hadoop/common/avro-1.8.2.jar + + export SPARK_DIST_CLASSPATH=$(hadoop-2.7.7/bin/hadoop classpath) +else curl \ -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-${HADOOP}.tgz" \ -o ${SPARK}-bin-${HADOOP}.tgz tar xzvf ${SPARK}-bin-${HADOOP}.tgz export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-${HADOOP} - - # set the path to the adam submit script - ADAM=./bin/adam-submit +fi - # test running adam-shell - ./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala +# set the path to the adam submit script +ADAM=./bin/adam-submit + +# test running adam-shell +./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala - # add pyspark to the python path - PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)" - export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH} - - # put adam jar on the pyspark path - ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target" - ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)" - export PYSPARK_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell" - - # create a conda environment for python build, if necessary - pythons=( 2.7 3.6 ) - - for python in ${pythons[*]} - do - uuid=$(uuidgen) - conda create -y -q -n adam-build-${uuid} python=${python} - source activate adam-build-${uuid} - - # prepare adam python - pushd adam-python - make prepare - popd +# add pyspark to the python path +PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)" +export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH} + +# put adam jar on the pyspark path +ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target" +ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)" +export PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=4g --jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell" + +# create a conda environment for python build, if necessary +pythons=( 2.7 3.6 ) + +for python in ${pythons[*]} +do + uuid=$(uuidgen) + conda create -y -q -n adam-build-${uuid} python=${python} + source activate adam-build-${uuid} + + # prepare adam python + pushd adam-python + make prepare + popd - # we only support SparkR on Spark 2.x - if [ ${SPARK_VERSION} == 2.3.3 ]; 
- then + # make a directory to install SparkR into, and set the R user libs path + export R_LIBS_USER=${SPARK_HOME}/local_R_libs + mkdir -p ${R_LIBS_USER} + R CMD INSTALL \ + -l ${R_LIBS_USER} \ + ${SPARK_HOME}/R/lib/SparkR/ - # make a directory to install SparkR into, and set the R user libs path - export R_LIBS_USER=${SPARK_HOME}/local_R_libs - mkdir -p ${R_LIBS_USER} - R CMD INSTALL \ - -l ${R_LIBS_USER} \ - ${SPARK_HOME}/R/lib/SparkR/ + export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell" - export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell" - - # we can run the python build, now that we have a spark executable - # -DskipTests appears to skip the python/r tests? - mvn -U \ - -P python,r,distribution \ - package \ - -Dsuites=select.no.suites\* \ - -Dhadoop.version=${HADOOP_VERSION} \ - -Dspark.version=${SPARK_VERSION} - - # make sure that the distribution package contains an egg - # if no assembly jar is found, this will exit with code 1 and fail the build - tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \ - grep bdgenomics.adam | \ - grep egg - fi + # we can run the python build, now that we have a spark executable + # -DskipTests appears to skip the python/r tests? + mvn -U \ + -P python,r,distribution \ + package \ + -Dsuites=select.no.suites\* \ + -Dhadoop.version=${HADOOP_VERSION} + + # make sure that the distribution package contains an egg + # if no assembly jar is found, this will exit with code 1 and fail the build + tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \ + grep bdgenomics.adam | \ + grep egg - # run pyadam test - ./bin/pyadam < scripts/jenkins-test-pyadam.py + # run pyadam test + ./bin/pyadam < scripts/jenkins-test-pyadam.py - # deactivate and remove the conda env - source deactivate - conda remove -y -n adam-build-${uuid} --all + # deactivate and remove the conda env + source deactivate + conda remove -y -n adam-build-${uuid} --all - # copy python targets back - cp -r adam-python/target ${PROJECT_ROOT}/adam-python/ + # copy python targets back + cp -r adam-python/target ${PROJECT_ROOT}/adam-python/ - # clean after each python version - pushd adam-python - make clean - make clean_sdist - popd + # clean after each python version + pushd adam-python + make clean + make clean_sdist + popd - done +done - # define filenames - BAM=mouse_chrM.bam - READS=${BAM}.reads.adam - SORTED_READS=${BAM}.reads.sorted.adam - FRAGMENTS=${BAM}.fragments.adam +# define filenames +BAM=mouse_chrM.bam +READS=${BAM}.reads.adam +SORTED_READS=${BAM}.reads.sorted.adam +FRAGMENTS=${BAM}.fragments.adam - # fetch our input dataset - echo "Fetching BAM file" - rm -rf ${BAM} - wget -q https://s3.amazonaws.com/bdgenomics-test/${BAM} - - # once fetched, convert BAM to ADAM - echo "Converting BAM to ADAM read format" - rm -rf ${READS} - ${ADAM} transformAlignments ${BAM} ${READS} - - # then, sort the BAM - echo "Converting BAM to ADAM read format with sorting" - rm -rf ${SORTED_READS} - ${ADAM} transformAlignments -sort_reads ${READS} ${SORTED_READS} - - # convert the reads to fragments to re-pair the reads - echo "Converting read file to fragments" - rm -rf ${FRAGMENTS} - ${ADAM} transformFragments -load_as_reads ${READS} ${FRAGMENTS} - - # test that printing works - echo "Printing reads and fragments" - ${ADAM} print ${READS} 1>/dev/null 2>/dev/null - ${ADAM} print ${FRAGMENTS} 1>/dev/null 2>/dev/null - - # 
# run flagstat to verify that flagstat runs OK
echo "Printing read statistics"
${ADAM} flagstat -print_metrics ${READS}
rm -rf ${ADAM_TMP_DIR}
popd

pushd ${PROJECT_ROOT}
# move back to Scala 2.11 as default
if [ ${SCALAVER} == 2.12 ];
then
    set +e
    ./scripts/move_to_scala_2.11.sh
    set -e
fi

# test that the source is formatted correctly
./scripts/format-source
if test -n "$(git status --porcelain)"
then
    echo "Please run './scripts/format-source'"
    exit 1
fi
popd

echo
echo "All the tests passed"