
Merge 0babacd into 9f17a0f

heuermh committed May 14, 2019
2 parents 9f17a0f + 0babacd commit 3718c8d21eb71b332d4cb15a65a62054d27f210f
Showing with 153 additions and 161 deletions.
  1. +3 −3 adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala
  2. +150 −158 scripts/jenkins-test
adam-cli/src/main/scala/org/bdgenomics/adam/cli/FlagStat.scala
@@ -36,11 +36,11 @@ object FlagStat extends BDGCommandCompanion {

class FlagStatArgs extends Args4jBase {
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM data to return stats for", index = 0)
-val inputPath: String = null
+var inputPath: String = null
@Args4jOption(required = false, name = "-o", usage = "Optionally write the stats to this file.")
-val outputPath: String = null
+var outputPath: String = null
@Args4jOption(required = false, name = "-stringency", usage = "Set the parsing stringency: SILENT, LENIENT, STRICT.")
val stringency: String = "SILENT"
var stringency: String = "SILENT"
}

class FlagStat(protected val args: FlagStatArgs) extends BDGSparkCommand[FlagStatArgs] {
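
The val-to-var switch above is presumably needed because args4j assigns parsed option
values into the annotated fields by reflection after the args object is constructed,
which requires mutable (non-final) fields. A minimal sketch, outside this diff and using
a hypothetical ExampleArgs class rather than ADAM's Args4jBase, of how such a class is
parsed:

import org.kohsuke.args4j.{ Argument, CmdLineParser, Option => Args4jOption }

class ExampleArgs {
  @Argument(required = true, metaVar = "INPUT", usage = "Input path", index = 0)
  var inputPath: String = null

  @Args4jOption(required = false, name = "-o", usage = "Optionally write output to this file.")
  var outputPath: String = null
}

object ParseExample {
  def main(args: Array[String]): Unit = {
    val parsed = new ExampleArgs()
    // parseArgument fills the var fields in place; a val would compile to a final
    // field that the parser could not set
    new CmdLineParser(parsed).parseArgument(args: _*)
    println(parsed.inputPath)
  }
}
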
scripts/jenkins-test
@@ -39,29 +39,25 @@ fi

set -e

-if [ ${SPARK_VERSION} == 2.3.3 ];
+# build defaults to Scala 2.11
+if [ ${SCALAVER} == 2.11 ];
then
-# shouldn't be able to move to spark 2 twice
+# shouldn't be able to move to scala 2.11 twice
set +e
-./scripts/move_to_spark_2.sh
+./scripts/move_to_scala_2.11.sh
if [[ $? == 0 ]];
then
echo "We have already moved to Spark 2, so running move_to_spark_2.sh a second time should fail, but error code was 0 (success)."
echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)."
exit 1
fi
set -e
fi

-if [ ${SCALAVER} == 2.11 ];
+# move to Scala 2.12 if requested
+if [ ${SCALAVER} == 2.12 ];
then
-# shouldn't be able to move to scala 2.11 twice
set +e
-./scripts/move_to_scala_2.11.sh
-if [[ $? == 0 ]];
-then
-echo "We have already moved to Scala 2.11, so running move_to_scala_2.11.sh a second time should fail, but error code was 0 (success)."
-exit 1
-fi
+./scripts/move_to_scala_2.12.sh
set -e
fi

@@ -70,8 +66,7 @@ echo "Testing ADAM version ${VERSION} on Spark ${SPARK_VERSION} and Hadoop ${HAD

# first, build the sources, run the unit tests, and generate a coverage report
mvn clean \
- -Dhadoop.version=${HADOOP_VERSION} \
- -Dspark.version=${SPARK_VERSION}
+ -Dhadoop.version=${HADOOP_VERSION}

# if this is a pull request, we need to set the coveralls pr id
if [[ ! -z $ghprbPullId ]];
@@ -103,7 +98,6 @@ mvn -U \
package \
-DskipTests \
-Dhadoop.version=${HADOOP_VERSION} \
- -Dspark.version=${SPARK_VERSION} \
-DargLine=${ADAM_MVN_TMP_DIR}

# make sure that the distribution package contains an assembly jar
@@ -131,173 +125,171 @@ then
fi

# run integration tests
-# prebuilt spark distributions are scala 2.11 for spark 2.x
-if [[ ( ${SPARK_VERSION} == 2.3.3 && ${SCALAVER} == 2.11 ) ]];
-then

-# make a temp directory
-ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX)
+# make a temp directory
+ADAM_TMP_DIR=$(mktemp -d -t adamTestXXXXXXX)

-# Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR
-ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded
-mkdir $ADAM_TMP_DIR
+# Just to be paranoid.. use a directory internal to the ADAM_TMP_DIR
+ADAM_TMP_DIR=$ADAM_TMP_DIR/deleteMePleaseThisIsNoLongerNeeded
+mkdir $ADAM_TMP_DIR

-# set the TMPDIR envar, which is used by python to choose where to make temp directories
-export TMPDIR=${ADAM_TMP_DIR}
+# set the TMPDIR envar, which is used by python to choose where to make temp directories
+export TMPDIR=${ADAM_TMP_DIR}

-pushd $PROJECT_ROOT
+pushd $PROJECT_ROOT

-# Copy the jar into our temp space for testing
-cp -r . $ADAM_TMP_DIR
-popd
+# Copy the jar into our temp space for testing
+cp -r . $ADAM_TMP_DIR
+popd

-pushd $ADAM_TMP_DIR
+pushd $ADAM_TMP_DIR

-# what hadoop version are we on? format string for downloading spark assembly
-if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then
-HADOOP=hadoop2.6
-elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then
-HADOOP=hadoop2.7
-else
-echo "Unknown Hadoop version."
-exit 1
-fi
+# what hadoop version are we on? format string for downloading spark assembly
+if [[ $HADOOP_VERSION =~ ^2\.6 ]]; then
+HADOOP=hadoop2.6
+elif [[ $HADOOP_VERSION =~ ^2\.7 ]]; then
+HADOOP=hadoop2.7
+else
+echo "Unknown Hadoop version."
+exit 1
+fi

-# set spark artifact string for downloading assembly
-SPARK=spark-${SPARK_VERSION}
+# set spark artifact string for downloading assembly
+SPARK=spark-${SPARK_VERSION}

-# download prepackaged spark assembly
-curl \
- -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-${HADOOP}.tgz" \
- -o ${SPARK}-bin-${HADOOP}.tgz
+# download prepackaged spark assembly
+curl \
+ -L "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/${SPARK}/${SPARK}-bin-${HADOOP}.tgz" \
+ -o ${SPARK}-bin-${HADOOP}.tgz

-tar xzvf ${SPARK}-bin-${HADOOP}.tgz
-export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-${HADOOP}
+tar xzvf ${SPARK}-bin-${HADOOP}.tgz
+export SPARK_HOME=${ADAM_TMP_DIR}/${SPARK}-bin-${HADOOP}

-# set the path to the adam submit script
-ADAM=./bin/adam-submit
+# set the path to the adam submit script
+ADAM=./bin/adam-submit

-# test running adam-shell
-./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala
+# test running adam-shell
+./bin/adam-shell -i scripts/jenkins-test-adam-shell.scala

-# add pyspark to the python path
-PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)"
-export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH}
-
-# put adam jar on the pyspark path
-ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target"
-ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)"
-export PYSPARK_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell"
-
-# create a conda environment for python build, if necessary
-pythons=( 2.7 3.6 )
-
-for python in ${pythons[*]}
-do
-uuid=$(uuidgen)
-conda create -y -q -n adam-build-${uuid} python=${python}
-source activate adam-build-${uuid}
-
-# prepare adam python
-pushd adam-python
-make prepare
-popd
+# add pyspark to the python path
+PY4J_ZIP="$(ls -1 "${SPARK_HOME}/python/lib" | grep py4j)"
+export PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/${PY4J_ZIP}:${PYTHONPATH}
+
+# put adam jar on the pyspark path
+ASSEMBLY_DIR="${ADAM_TMP_DIR}/adam-assembly/target"
+ASSEMBLY_JAR="$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\_\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)"
+export PYSPARK_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} pyspark-shell"
+
+# create a conda environment for python build, if necessary
+pythons=( 2.7 3.6 )
+
+for python in ${pythons[*]}
+do
+uuid=$(uuidgen)
+conda create -y -q -n adam-build-${uuid} python=${python}
+source activate adam-build-${uuid}
+
+# prepare adam python
+pushd adam-python
+make prepare
+popd

-# we only support SparkR on Spark 2.x
-if [ ${SPARK_VERSION} == 2.3.3 ];
-then
-
-# make a directory to install SparkR into, and set the R user libs path
-export R_LIBS_USER=${SPARK_HOME}/local_R_libs
-mkdir -p ${R_LIBS_USER}
-R CMD INSTALL \
- -l ${R_LIBS_USER} \
-${SPARK_HOME}/R/lib/SparkR/
+# make a directory to install SparkR into, and set the R user libs path
+export R_LIBS_USER=${SPARK_HOME}/local_R_libs
+mkdir -p ${R_LIBS_USER}
+R CMD INSTALL \
+ -l ${R_LIBS_USER} \
+${SPARK_HOME}/R/lib/SparkR/

-export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell"
+export SPARKR_SUBMIT_ARGS="--jars ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} --driver-class-path ${ASSEMBLY_DIR}/${ASSEMBLY_JAR} sparkr-shell"

-# we can run the python build, now that we have a spark executable
-# -DskipTests appears to skip the python/r tests?
-mvn -U \
- -P python,r,distribution \
-package \
- -Dsuites=select.no.suites\* \
- -Dhadoop.version=${HADOOP_VERSION} \
- -Dspark.version=${SPARK_VERSION}
-
-# make sure that the distribution package contains an egg
-# if no assembly jar is found, this will exit with code 1 and fail the build
-tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \
-grep bdgenomics.adam | \
-grep egg
-fi
+# we can run the python build, now that we have a spark executable
+# -DskipTests appears to skip the python/r tests?
+mvn -U \
+ -P python,r,distribution \
+package \
+ -Dsuites=select.no.suites\* \
+ -Dhadoop.version=${HADOOP_VERSION}
+
+# make sure that the distribution package contains an egg
+# if no assembly jar is found, this will exit with code 1 and fail the build
+tar tzvf adam-distribution/target/adam-distribution*-bin.tar.gz | \
+grep bdgenomics.adam | \
+grep egg

-# run pyadam test
-./bin/pyadam < scripts/jenkins-test-pyadam.py
+# run pyadam test
+./bin/pyadam < scripts/jenkins-test-pyadam.py

-# deactivate and remove the conda env
-source deactivate
-conda remove -y -n adam-build-${uuid} --all
+# deactivate and remove the conda env
+source deactivate
+conda remove -y -n adam-build-${uuid} --all

-# copy python targets back
-cp -r adam-python/target ${PROJECT_ROOT}/adam-python/
+# copy python targets back
+cp -r adam-python/target ${PROJECT_ROOT}/adam-python/

-# clean after each python version
-pushd adam-python
-make clean
-make clean_sdist
-popd
+# clean after each python version
+pushd adam-python
+make clean
+make clean_sdist
+popd

-done
+done

-# define filenames
-BAM=mouse_chrM.bam
-READS=${BAM}.reads.adam
-SORTED_READS=${BAM}.reads.sorted.adam
-FRAGMENTS=${BAM}.fragments.adam
+# define filenames
+BAM=mouse_chrM.bam
+READS=${BAM}.reads.adam
+SORTED_READS=${BAM}.reads.sorted.adam
+FRAGMENTS=${BAM}.fragments.adam

-# fetch our input dataset
-echo "Fetching BAM file"
-rm -rf ${BAM}
-wget -q https://s3.amazonaws.com/bdgenomics-test/${BAM}
-
-# once fetched, convert BAM to ADAM
-echo "Converting BAM to ADAM read format"
-rm -rf ${READS}
-${ADAM} transformAlignments ${BAM} ${READS}
-
-# then, sort the BAM
-echo "Converting BAM to ADAM read format with sorting"
-rm -rf ${SORTED_READS}
-${ADAM} transformAlignments -sort_reads ${READS} ${SORTED_READS}
-
-# convert the reads to fragments to re-pair the reads
-echo "Converting read file to fragments"
-rm -rf ${FRAGMENTS}
-${ADAM} transformFragments -load_as_reads ${READS} ${FRAGMENTS}
-
-# test that printing works
-echo "Printing reads and fragments"
-${ADAM} print ${READS} 1>/dev/null 2>/dev/null
-${ADAM} print ${FRAGMENTS} 1>/dev/null 2>/dev/null
-
-# run flagstat to verify that flagstat runs OK
-echo "Printing read statistics"
-${ADAM} flagstat -print_metrics ${READS}
-rm -rf ${ADAM_TMP_DIR}
-popd
-
-# test that the source is formatted correctly
-# we had modified the poms to add a temp dir, so back out that modification first
-pushd ${PROJECT_ROOT}
-./scripts/format-source
-if test -n "$(git status --porcelain)"
-then
-echo "Please run './scripts/format-source'"
-exit 1
-fi
-popd
+# fetch our input dataset
+echo "Fetching BAM file"
+rm -rf ${BAM}
+wget -q https://s3.amazonaws.com/bdgenomics-test/${BAM}
+
+# once fetched, convert BAM to ADAM
+echo "Converting BAM to ADAM read format"
+rm -rf ${READS}
+${ADAM} transformAlignments ${BAM} ${READS}
+
+# then, sort the BAM
+echo "Converting BAM to ADAM read format with sorting"
+rm -rf ${SORTED_READS}
+${ADAM} transformAlignments -sort_reads ${READS} ${SORTED_READS}
+
+# convert the reads to fragments to re-pair the reads
+echo "Converting read file to fragments"
+rm -rf ${FRAGMENTS}
+${ADAM} transformFragments -load_as_reads ${READS} ${FRAGMENTS}
+
+# test that printing works
+echo "Printing reads and fragments"
+${ADAM} print ${READS} 1>/dev/null 2>/dev/null
+${ADAM} print ${FRAGMENTS} 1>/dev/null 2>/dev/null
+
+# run flagstat to verify that flagstat runs OK
+echo "Printing read statistics"
+${ADAM} flagstat -print_metrics ${READS}
+rm -rf ${ADAM_TMP_DIR}
+popd
+
+# move back to Scala 2.11 as default
+if [ ${SCALAVER} == 2.12 ];
+then
+set +e
+./scripts/move_to_scala_2.11.sh
+set -e
+fi
+
+# test that the source is formatted correctly
+# we had modified the poms to add a temp dir, so back out that modification first
+pushd ${PROJECT_ROOT}
+./scripts/format-source
+if test -n "$(git status --porcelain)"
+then
+echo "Please run './scripts/format-source'"
+exit 1
+fi
+popd

echo
echo "All the tests passed"
