scripts/lumpyexpress

#!/bin/bash -e

############################################################
#  Program: lumpyexpress
#  Author: Colby Chiang (cc2qe@virginia.edu)
############################################################

# Load the paths to the executables used in the script.
#
# Arguments:
#   $1 - path to a config file (shell syntax) that assigns the executable
#        variables (LUMPY, SAMBAMBA, ...)
#
# If $1 exists it is sourced into the current shell. Otherwise every
# executable variable is filled from a PATH lookup and is left empty when the
# tool is not found, so the caller can report which tool is missing.
function source_binaries() {
    local speedseq_bin

    if [[ -e "$1" ]]
    then
        echo "Sourcing executables from $1 ..."
        # `source` searches PATH for a name without a slash, so anchor
        # relative paths to the current directory
        if [[ "$1" == /* ]]
        then
            source "$1"
        else
            source "./$1"
        fi
    else
        echo "Config file $1 not found. Attempting to auto-source executables"
        # general
        # speedseq may be absent: look it up first so that `dirname` is never
        # called without an operand (which would abort the script under -e)
        speedseq_bin=$(command -v speedseq || true)
        SPEEDSEQ_HOME=""
        if [[ -n "$speedseq_bin" ]]
        then
            SPEEDSEQ_HOME=$(dirname "$speedseq_bin")
        fi
        SAMBAMBA=$(command -v sambamba || true)
        BEDTOOLS=$(command -v bedtools || true)
        BGZIP=$(command -v bgzip || true)
        TABIX=$(command -v tabix || true)
        VAWK=$(command -v vawk || true)
        PARALLEL=$(command -v parallel || true)
        PYTHON=$(command -v python2.7 || true)

        # align
        BWA=$(command -v bwa || true)
        SAMBLASTER=$(command -v samblaster || true)

        # var/somatic
        FREEBAYES=$(command -v freebayes || true)
        VEP=$(command -v variant_effect_predictor.pl || true)
        VEP_CACHE_DIR=$SPEEDSEQ_HOME/annotations/vep_cache

        # sv
        LUMPY=$(command -v lumpy || true)
        PAIREND_DISTRO=$(command -v pairend_distro.py || true)
        BEDPETOVCF=$(command -v bedpeToVcf || true)
        LUMPYTOBEDPE=$(command -v lumpyToBedpe || true)
        SVTYPER=$(command -v svtyper || true)
        BAMGROUPREADS=$(command -v bamgroupreads.py || true)
        BAMFILTERRG=$(command -v bamfilterrg.py || true)
        # $BAMLIBS is invoked by the main script to list the read groups of
        # each library. NOTE(review): the executable name bamlibs.py is
        # assumed from the sibling bam*.py tools - confirm.
        BAMLIBS=$(command -v bamlibs.py || true)

        # CNVnator
        CNVNATOR_WRAPPER=$(command -v cnvnator_wrapper.py || true)
        CNVNATOR_MULTI=$(command -v cnvnator-multi || true)
        ANNOTATE_RD=$(command -v annotate_rd.py || true)
        CNVNATOR_CHROMS_DIR=~/genomes/GRCh37/chroms

        # re-align
        BAMTOFASTQ=$(command -v bamtofastq.py || true)
        MBUFFER=$(command -v mbuffer || true)
        BAMHEADRG=$(command -v bamheadrg.py || true)
    fi
}

# Ensure that the required python modules are installed before
# beginning analysis.
#
# Arguments:
#   $1 - python interpreter used to probe for the modules
# Globals:
#   PYTHON_TEST (written) - copy of $1
# Returns:
#   0 when pysam, numpy and scipy can all be located by the interpreter;
#   1 (with an error on stderr) when no interpreter was given or a module
#   could not be located.
function check_python_modules() {
    local module

    PYTHON_TEST=$1
    # an empty interpreter would make the shell try to run "-c" as a command
    if [[ -z "$PYTHON_TEST" ]]
    then
        echo "Error: python interpreter not found; cannot check for required python modules" >&2
        return 1
    fi
    echo -e "\nChecking for required python modules ($PYTHON_TEST)..."

    for module in pysam numpy scipy
    do
        # $PYTHON_TEST is left unquoted, as in the rest of the script, so a
        # configured interpreter value may carry extra words
        if ! $PYTHON_TEST -c "import imp; imp.find_module('$module')"
        then
            echo "Error: required python module '$module' not found for $PYTHON_TEST" >&2
            return 1
        fi
    done
}

## usage
# Print the command-line help text to stdout. The text is framed by one
# empty line before and one after; the here-doc delimiter is quoted so the
# body is emitted literally.
usage() {
    cat <<'EOF'

usage:   lumpyexpress [options]

options:
     -B FILE  full BAM file(s) (comma separated) (required)
     -S FILE  split reads BAM file(s) (comma separated) (required)
     -D FILE  discordant reads BAM files(s) (comma separated) (required)
     -R FILE  indexed reference genome fasta file (required)
     -o STR   output prefix [fullBam.bam]
#     -t INT   threads [1] 
     -x FILE  BED file to exclude
     -P       output probability curves for each variant
#     -g       genotype SV breakends with svtyper
#     -d       calculate read-depth with CNVnator
#     -A       annotate the vcf with VEP
#     -a       re-align with SpeedSeq BWA
     -m INT   minimum sample weight for a call [4]
     -r FLOAT trim threshold [0]
     -T DIR   temp directory [./output_prefix.XXXXXXXXXXXX]
     -k       keep temporary files

     -K FILE  path to lumpyexpress.config file
                (default: same directory as lumpyexpress)
     -v       verbose
     -h       show this message

EOF
}

# set defaults
# directory holding this script; quoted so an install path containing
# whitespace is handled
LUMPY_DIR=$(dirname "$0")
CONFIG="$LUMPY_DIR/lumpyexpress.config"
THREADS=1
ANNOTATE=0
MIN_WEIGHT=4
TRIM_THRES=0
EXCLUDE_BED=
TEMP_DIR=""
GENOTYPE=0
READDEPTH=0
VERBOSE=0
KEEP=0
OUTPUT=""
MAX_SPLIT_COUNT=2
MIN_NON_OVERLAP=20
REALIGN=0
PROB_CURVE=""

# Values that are otherwise only assigned while parsing options or appended
# to later on. Start them empty so that same-named variables inherited from
# the caller's environment cannot leak into the run.
EXCLUDE_BED_FMT=""
FULL_BAM_STRING=""
SPL_BAM_STRING=""
DISC_BAM_STRING=""
REF=""
LUMPY_DISC_STRING=""
LUMPY_SPL_STRING=""

# Parse the command-line options into the script's global settings.
#
# Arguments:
#   the script's command-line arguments ("$@")
# Globals written:
#   FULL_BAM_STRING, SPL_BAM_STRING, DISC_BAM_STRING, REF, OUTPUT,
#   MIN_WEIGHT, TRIM_THRES, EXCLUDE_BED, EXCLUDE_BED_FMT, TEMP_DIR, THREADS,
#   PROB_CURVE, REALIGN, ANNOTATE, READDEPTH, GENOTYPE, VERBOSE, KEEP, CONFIG,
#   FULL_BAM_LIST, SPL_BAM_LIST, DISC_BAM_LIST
# Exits 1 after printing the usage text for -h, for an unknown option, and
# for an option that is missing its argument.
function parse_options() {
    local OPTION
    local OPTARG
    local OPTIND=1

    # the leading ":" selects getopts' silent mode: problems are reported
    # through the ":" and "?" arms below instead of by getopts itself
    while getopts ":hB:S:D:R:o:m:r:x:T:t:PaAdgkvK:" OPTION
    do
        case "${OPTION}" in
            h)
                usage
                exit 1
                ;;
            B)
                FULL_BAM_STRING="$OPTARG"
                ;;
            S)
                SPL_BAM_STRING="$OPTARG"
                ;;
            D)
                DISC_BAM_STRING="$OPTARG"
                ;;
            R)
                REF="$OPTARG"
                ;;
            o)
                OUTPUT="$OPTARG"
                ;;
            m)
                MIN_WEIGHT="$OPTARG"
                ;;
            r)
                TRIM_THRES="$OPTARG"
                ;;
            x)
                EXCLUDE_BED="$OPTARG"
                EXCLUDE_BED_FMT="-x $EXCLUDE_BED"
                ;;
            T)
                TEMP_DIR="$OPTARG"
                ;;
            t)
                THREADS="$OPTARG"
                ;;
            P)
                PROB_CURVE="-P"
                ;;
            a)
                REALIGN=1
                ;;
            A)
                ANNOTATE=1
                ;;
            d)
                READDEPTH=1
                ;;
            g)
                GENOTYPE=1
                ;;
            v)
                VERBOSE=1
                ;;
            k)
                KEEP=1
                ;;
            K)
                CONFIG="$OPTARG"
                ;;
            :)
                # silent mode: OPTARG holds the option that lacks its argument
                usage
                echo -e "Error: option -$OPTARG requires an argument\n" >&2
                exit 1
                ;;
            \?)
                # silent mode: OPTARG holds the unrecognised option character
                usage
                echo -e "Error: unknown option -$OPTARG\n" >&2
                exit 1
                ;;
        esac
    done

    # parse the BAM strings
    # (comma separated; the expansions are deliberately unquoted so the shell
    # splits each list into one array element per file)
    FULL_BAM_LIST=( ${FULL_BAM_STRING//,/ } )
    SPL_BAM_LIST=( ${SPL_BAM_STRING//,/ } )
    DISC_BAM_LIST=( ${DISC_BAM_STRING//,/ } )
}

parse_options "$@"

OPTIND=0

# Check for the relevant binaries
source_binaries "$CONFIG"

# Print the usage text and a "not found" error naming $1, then exit 1.
# The message points at $CONFIG, the file source_binaries was asked to load.
function exit_not_found() {
    usage
    echo -e "Error: $1 not found. Please set path in $CONFIG file\n"
    exit 1
}

# Tools that are always needed come first; the remaining ones are only
# required when the matching mode was requested on the command line.
if [[ -z "$LUMPY" ]]
then
    exit_not_found "lumpy executable"
elif [[ -z "$PAIREND_DISTRO" ]]
then
    exit_not_found "pairend_distro.py executable"
elif [[ -z "$SAMBAMBA" ]]
then
    exit_not_found "sambamba executable"
elif [[ ! -f "$VEP" ]] && [[ "$ANNOTATE" -eq 1 ]]
then
    exit_not_found "VEP"
elif [[ ! -d "$VEP_CACHE_DIR" ]] && [[ "$ANNOTATE" -eq 1 ]]
then
    exit_not_found "VEP cache directory"
elif [[ -z "$VAWK" ]] && [[ "$ANNOTATE" -eq 1 ]]
then
    exit_not_found "vawk executable"
elif [[ -z "$SAMBLASTER" ]] && [[ -z "${DISC_BAM_STRING}${SPL_BAM_STRING}" ]]
then
    # samblaster is only run when the splitter/discordant BAMs must be generated
    exit_not_found "samblaster executable"
elif [[ -z "$BWA" ]] && [[ "$REALIGN" -eq 1 ]]
then
    exit_not_found "BWA executable"
elif [[ -z "$BAMTOFASTQ" ]] && [[ "$REALIGN" -eq 1 ]]
then
    exit_not_found "bamtofastq.py executable"
elif [[ -z "$BAMHEADRG" ]] && [[ "$REALIGN" -eq 1 ]]
then
    exit_not_found "bamheadrg.py executable"
elif [[ -z "$MBUFFER" ]] && [[ "$REALIGN" -eq 1 ]]
then
    exit_not_found "mbuffer executable"
elif [[ -z "$BAMFILTERRG" ]]
then
    exit_not_found "bamfilterrg.py executable"
elif [[ -z "$BAMLIBS" ]]
then
    # $BAMLIBS is run on every full BAM file to list its libraries
    exit_not_found "bamlibs.py executable"
elif [[ -z "$BAMGROUPREADS" ]] && [[ -z "${DISC_BAM_STRING}${SPL_BAM_STRING}" ]]
then
    # like samblaster, only used when generating splitters/discordants
    exit_not_found "bamgroupreads.py executable"
elif [[ -z "$PARALLEL" ]] && [[ -z "${DISC_BAM_STRING}${SPL_BAM_STRING}" ]]
then
    exit_not_found "parallel executable"
fi

# if genotyping requested, look for svtyper
if [[ "$GENOTYPE" -eq 1 ]] && [[ -z "$SVTYPER" ]]
then
    exit_not_found "svtyper executable"
fi

# if CNV read-depth requested, look for cnvnator executables
if [[ "$READDEPTH" -eq 1 ]]
then
    if [[ -z "$CNVNATOR_MULTI" ]]
    then
        exit_not_found "cnvnator executable"
    elif [[ -z "$CNVNATOR_WRAPPER" ]]
    then
        exit_not_found "cnvnator_wrapper.py executable"
    elif [[ -z "$ANNOTATE_RD" ]]
    then
        exit_not_found "annotate_rd.py executable"
    fi
fi

# check for required python modules (pysam, numpy, scipy, etc)
check_python_modules $PYTHON

# Check that the required files exist
if [[ ! -f "$REF" ]]
then
    usage
    echo -e "Error: reference fasta file $REF not found\n"
    exit 1
fi
# the index is accepted either next to the fasta (ref.fa.fai) or with the
# fasta's last extension replaced (ref.fai)
if [[ ! -f "$REF.fai" && ! -f "${REF%.*}.fai" ]]
then
    usage
    echo -e "Error: reference fasta file $REF not indexed\n"
    exit 1
fi
if [[ ${#FULL_BAM_LIST[@]} -eq 0 ]]
then
    usage
    echo -e "Error: -B is required\n"
    exit 1
fi

# When splitter/discordant BAMs are supplied, they are later looked up by the
# position of the matching -B file, so each list needs one entry per -B file.
if [[ ${#SPL_BAM_LIST[@]} -ne 0 || ${#DISC_BAM_LIST[@]} -ne 0 ]]
then
    if [[ ${#SPL_BAM_LIST[@]} -ne ${#FULL_BAM_LIST[@]} || ${#DISC_BAM_LIST[@]} -ne ${#FULL_BAM_LIST[@]} ]]
    then
        usage
        echo -e "Error: -S and -D must both be given, each with one BAM file per -B BAM file\n"
        exit 1
    fi
fi

for TEST_BAM in "${FULL_BAM_LIST[@]}" "${SPL_BAM_LIST[@]}" "${DISC_BAM_LIST[@]}"
do
    if [[ ! -f "$TEST_BAM" ]]
    then
        usage
        echo -e "Error: BAM file $TEST_BAM not found.\n"
        exit 1
    fi
done

# default OUTPUT if not provided: the file name of the first full BAM
if [[ -z "$OUTPUT" ]]
then
    OUTPUT=$(basename "${FULL_BAM_LIST[0]}")
fi
# OUTBASE is OUTPUT without its directory part; it prefixes the temp files
OUTBASE=$(basename "$OUTPUT")

# make temporary directory
if [[ "$VERBOSE" -eq 1 ]]
then
    echo "
    create temporary directory"
fi
if [[ -z "$TEMP_DIR" ]]
then
    # no -T given: create a uniquely named directory in the current directory
    TEMP_DIR=$(mktemp -d "${OUTBASE}.XXXXXXXXXXXX")
else
    mkdir -p "$TEMP_DIR"
fi

# If splitter and discordant BAMs not provided, generate them
# (LUMPY express)
#
# Both branches below build the two option strings used by the final LUMPY
# call and write per-library insert size files into $TEMP_DIR:
#   LUMPY_DISC_STRING - "-pe ..." entries (one per library)
#   LUMPY_SPL_STRING  - "-sr ..." entries
#
# An unsubscripted array name expands to its first element, so this tests
# whether the first -S entry and the first -D entry are both empty.
if [[ -z "${SPL_BAM_LIST}${DISC_BAM_LIST}" ]]
then
    # initialize split and discordant bam lists
    SPL_BAM_LIST=()
    DISC_BAM_LIST=()

    # create temp files and pipes
    mkdir -p $TEMP_DIR/spl $TEMP_DIR/disc
    if [[ ! -e $TEMP_DIR/spl_pipe ]]
    then
	mkfifo $TEMP_DIR/spl_pipe
    fi
    if [[ ! -e $TEMP_DIR/disc_pipe ]]
    then
	mkfifo $TEMP_DIR/disc_pipe
    fi
    # NOTE(review): fq_pipe / FQ are not referenced anywhere else in this script
    if [[ ! -e $TEMP_DIR/fq_pipe ]]
    then
	mkfifo $TEMP_DIR/fq_pipe
    fi
    FQ="$TEMP_DIR/fq_pipe"

    # generate histo files and construct the strings for LUMPY
    for i in $( seq 0 $(( ${#FULL_BAM_LIST[@]}-1 )) )
    do
	FULL_BAM=${FULL_BAM_LIST[$i]}

	# calc readlength if not provided
	# (longest sequence among the first 10000 records of the BAM)
	READ_LENGTH=`$SAMBAMBA view $FULL_BAM | head -n 10000 | awk 'BEGIN { MAX_LEN=0 } { LEN=length($10); if (LEN>MAX_LEN) MAX_LEN=LEN } END { print MAX_LEN }'`

	# parse the libraries in the BAM header to extract readgroups from the same library
	LIB_RG_LIST=(`$BAMLIBS $FULL_BAM`)

	for j in $( seq 0 $(( ${#LIB_RG_LIST[@]}-1 )) )
	do
	    # verbose: print the command lines that are run just below
	    if [[ "$VERBOSE" -eq 1 ]]
	    then
		echo -e "
	    $PYTHON $BAMGROUPREADS --fix_flags -i $FULL_BAM -r ${LIB_RG_LIST[$j]} \\
		| $SAMBLASTER --acceptDupMarks --excludeDups --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \\
		--splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe \\
		| $SAMBAMBA view -S -F 'paired and mate_is_reverse_strand and not (unmapped or mate_is_unmapped or reverse_strand or secondary_alignment or duplicate)' /dev/stdin \\
		| awk '{ if (NR<=1000000) print > \"/dev/stdout\" ; else print > \"/dev/null\" }' \\
		| $PYTHON $PAIREND_DISTRO -r $READ_LENGTH -X 4 -N 1000000 -o ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).x4.histo \\
		> ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats

	    $SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe \\
		| $SAMBAMBA sort -m 1G --tmpdir=$TEMP_DIR/spl -o $TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).splitters.bam /dev/stdin
	    $SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/disc_pipe \\
		| $SAMBAMBA sort -m 1G --tmpdir=$TEMP_DIR/disc -o $TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).discordants.bam /dev/stdin"
	    fi

	    # Three command lines are handed to $PARALLEL so they run at the
	    # same time: the first writes splitters and discordants into the
	    # two fifos while computing the insert size histogram, the other
	    # two read the fifos and sort their content into BAM files.
	    # (a backslash-newline inside the double quotes joins the lines,
	    # so each pipeline reaches $PARALLEL as a single line)
	    echo -e "
	    $PYTHON $BAMGROUPREADS --fix_flags -i $FULL_BAM -r ${LIB_RG_LIST[$j]} \
		| $SAMBLASTER --acceptDupMarks --excludeDups --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \
		    --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe \
		| $SAMBAMBA view -S -F 'paired and mate_is_reverse_strand and not (unmapped or mate_is_unmapped or reverse_strand or secondary_alignment or duplicate)' /dev/stdin \
		| awk '{ if (NR<=1000000) print > \"/dev/stdout\" ; else print > \"/dev/null\" }' \
		| $PYTHON $PAIREND_DISTRO -r $READ_LENGTH -X 4 -N 1000000 -o ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).x4.histo \
		> ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats

	    $SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/spl_pipe \
		| $SAMBAMBA sort -m 1G --tmpdir=$TEMP_DIR/spl -o $TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).splitters.bam /dev/stdin
	    $SAMBAMBA view -S -f bam -l 0 $TEMP_DIR/disc_pipe \
		| $SAMBAMBA sort -m 1G --tmpdir=$TEMP_DIR/disc -o $TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).discordants.bam /dev/stdin" \
		| $PARALLEL -j 3

	    # generate discordant pair string for LUMPY
	    # (turn "rg1,rg2" into "read_group:rg1,read_group:rg2"; mean and
	    # stdev are read from the tab separated insert.stats file)
	    RG_STRING=`echo "${LIB_RG_LIST[$j]}" | sed 's/,/,read_group:/g' | sed 's/^/read_group:/g'`
	    MEAN=`cat ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats | tr '\t' '\n' | grep "^mean" | sed 's/mean\://g'`
	    STDEV=`cat ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats | tr '\t' '\n' | grep "^stdev" | sed 's/stdev\://g'`
	    # the id is the sample number, the library number and a trailing 0
	    # (paired-end) or 1 (split-read); bam_file names the per-sample
	    # file produced by the merge further down
	    LUMPY_DISC_STRING="$LUMPY_DISC_STRING -pe bam_file:$TEMP_DIR/$OUTBASE.sample$(($i+1)).discordants.bam,histo_file:${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).x4.histo,mean:${MEAN},stdev:${STDEV},read_length:${READ_LENGTH},min_non_overlap:${READ_LENGTH},discordant_z:5,back_distance:10,weight:1,id:$(($i+1))$(($j+1))0,min_mapping_threshold:20,${RG_STRING}"

	    # generate split-read string for LUMPY
	    LUMPY_SPL_STRING="$LUMPY_SPL_STRING -sr bam_file:${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam,back_distance:10,min_mapping_threshold:20,weight:1,id:$(($i+1))$(($j+1))1,min_clip:20,${RG_STRING}"

	    # # generate LUMPY sample config file
	    # DISC_BAM=$TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).discordants.bam
	    # SPL_BAM=${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).splitters.bam
	    # DISC_SAMPLE=`$SAMBAMBA view -H $DISC_BAM | grep -m 1 "^@RG" | awk -v i=$i '{ for (j=1;j<=NF;++j) {if ($j~"^SM:") { gsub("^SM:","",$j); print $j } } }'`
	    # SPL_SAMPLE=`$SAMBAMBA view -H $SPL_BAM | grep -m 1 "^@RG" | awk -v i=$i '{ for (j=1;j<=NF;++j) {if ($j~"^SM:") { gsub("^SM:","",$j); print $j } } }'`
	    # echo -e "$DISC_SAMPLE\t$(($i+1))$(($j+1))0\tPE\t$DISC_BAM" >> $TEMP_DIR/$OUTBASE.sample.config
	    # echo -e "$SPL_SAMPLE\t$(($i+1))$(($j+1))1\tSR\t$SPL_BAM" >> $TEMP_DIR/$OUTBASE.sample.config
	done

	# merge the splitters and discordants files
	# (collect the per-library files of this sample, merge them into one
	# file per sample, then delete the per-library files)
	MERGE_DISCORDANTS=""
	MERGE_SPLITTERS=""
	for j in $( seq 0 $(( ${#LIB_RG_LIST[@]}-1 )) )
	do
	    MERGE_DISCORDANTS="$MERGE_DISCORDANTS $TEMP_DIR/$OUTBASE.sample$(($i+1)).lib$(($j+1)).discordants.bam"
	    MERGE_SPLITTERS="$MERGE_SPLITTERS ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).splitters.bam"
	done

	if [[ $VERBOSE -eq 1 ]]
	then
	    echo "
	$SAMBAMBA merge -t $THREADS ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).discordants.bam $MERGE_DISCORDANTS
	$SAMBAMBA merge -t $THREADS ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam $MERGE_SPLITTERS
	rm $MERGE_DISCORDANTS $MERGE_SPLITTERS"
	fi
	$SAMBAMBA merge -t $THREADS ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).discordants.bam $MERGE_DISCORDANTS
	$SAMBAMBA merge -t $THREADS ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam $MERGE_SPLITTERS
	rm $MERGE_DISCORDANTS $MERGE_SPLITTERS

	# index the files
	if [[ $VERBOSE -eq 1 ]]
	then
	    echo -e "
	    $SAMBAMBA index ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).discordants.bam
	    $SAMBAMBA index ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam"
	fi
	echo "
	$SAMBAMBA index ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).discordants.bam
	$SAMBAMBA index ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam
	" | $PARALLEL -j 2

	# update the splitters and discordant BAM lists
	SPL_BAM_LIST+=(${TEMP_DIR}/$OUTBASE.sample$(($i+1)).splitters.bam)
	DISC_BAM_LIST+=(${TEMP_DIR}/$OUTBASE.sample$(($i+1)).discordants.bam)
    done

# else (user provided a splitter and discordants file)
else
    # # initialize LUMPY sample config file for generating the VCF
    # > $TEMP_DIR/$OUTBASE.sample.config

    # parse the libraries in the BAM header to extract readgroups from the same library
    for i in $( seq 0 $(( ${#FULL_BAM_LIST[@]}-1 )) )
    do
	FULL_BAM=${FULL_BAM_LIST[$i]}
	DISC_BAM=${DISC_BAM_LIST[$i]}
	SPL_BAM=${SPL_BAM_LIST[$i]}

	# The read length list is indexed by library number j, so it has to
	# start empty for every sample; otherwise the second and later samples
	# would pick up the read lengths recorded for the first sample.
	LIB_READ_LENGTH_LIST=()

	# LIB_RG_LIST contains an element for each library in the BAM file.
	# These elements are comma delimited strings for the readgroups for each library.
	LIB_RG_LIST=(`$BAMLIBS ${FULL_BAM_LIST[$i]}`)

	# generate the histo, stats, and config files
	echo "Calculating insert distributions... "
	for j in $( seq 0 $(( ${#LIB_RG_LIST[@]}-1 )) )
	do
	    # calculate read length if not provided
	    # (longest sequence among the first 10000 records of the BAM)
	    LIB_READ_LENGTH_LIST+=(`$SAMBAMBA view ${FULL_BAM_LIST[$i]} | head -n 10000 | awk 'BEGIN { MAX_LEN=0 } { LEN=length($10); if (LEN>MAX_LEN) MAX_LEN=LEN } END { print MAX_LEN }'`)
	    echo "Library read groups: ${LIB_RG_LIST[$j]}"
	    echo "Library read length: ${LIB_READ_LENGTH_LIST[$j]}"
	    # the sed stage passes on lines 3000001-4000000 of the header-less
	    # stream and quits at line 4000000
	    $SAMBAMBA view -h -F 'paired and mate_is_reverse_strand and not (unmapped or mate_is_unmapped or reverse_strand or secondary_alignment or duplicate)' ${FULL_BAM_LIST[$i]} \
		| $PYTHON $BAMFILTERRG -S -n 4000000 --readgroup ${LIB_RG_LIST[$j]} \
		| grep -v '^@' \
		| sed -n '3000001,4000000p;4000000q' \
		| $PYTHON $PAIREND_DISTRO -r ${LIB_READ_LENGTH_LIST[$j]} -X 4 -N 1000000 -o ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).x4.histo \
		> ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats
	done
	echo "done"

	# construct LUMPY_SPL_STRING
	# (the id is the SM: value of the first @RG header line of the BAM)
	SPL_SAMPLE=`$SAMBAMBA view -H $SPL_BAM | grep -m 1 "^@RG" | awk -v i=$i '{ for (j=1;j<=NF;++j) {if ($j~"^SM:") { gsub("^SM:","",$j); print $j } } }'`
	LUMPY_SPL_STRING="$LUMPY_SPL_STRING -sr bam_file:${SPL_BAM},back_distance:10,min_mapping_threshold:20,weight:1,id:${SPL_SAMPLE},min_clip:20"
	# # append to the sample config file
	# echo -e "$SPL_SAMPLE\t$(($i+1))01\tSR\t$SPL_BAM" >> $TEMP_DIR/$OUTBASE.sample.config

	# construct LUMPY_DISC_STRING
	for j in $( seq 0 $(( ${#LIB_RG_LIST[@]}-1 )) )
	do
	    DISC_SAMPLE=`$SAMBAMBA view -H $DISC_BAM | grep -m 1 "^@RG" | awk -v i=$i '{ for (j=1;j<=NF;++j) {if ($j~"^SM:") { gsub("^SM:","",$j); print $j } } }'`
	    MEAN=`cat ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats | tr '\t' '\n' | grep "^mean" | sed 's/mean\://g'`
	    STDEV=`cat ${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).insert.stats | tr '\t' '\n' | grep "^stdev" | sed 's/stdev\://g'`
	    RG_STRING=`echo "${LIB_RG_LIST[$j]}" | sed 's/,/,read_group:/g' | sed 's/^/read_group:/g'`

	    LUMPY_DISC_STRING="$LUMPY_DISC_STRING -pe bam_file:${DISC_BAM},histo_file:${TEMP_DIR}/$OUTBASE.sample$(($i+1)).lib$(($j+1)).x4.histo,mean:${MEAN},stdev:${STDEV},read_length:${LIB_READ_LENGTH_LIST[$j]},min_non_overlap:${LIB_READ_LENGTH_LIST[$j]},discordant_z:5,back_distance:10,weight:1,id:${DISC_SAMPLE},min_mapping_threshold:20,${RG_STRING}"

	    # # append to the sample config file
	    # echo -e "$DISC_SAMPLE\t$(($i+1))$(($j+1))0\tPE\t$DISC_BAM" >> $TEMP_DIR/$OUTBASE.sample.config
	done
    done
fi

echo "Running LUMPY... "
# verbose: print the command that is run just below, including the file the
# calls are actually written to
if [[ "$VERBOSE" -eq 1 ]]
then
    echo "
$LUMPY ${PROB_CURVE} \\
    -t ${TEMP_DIR}/${OUTBASE} \\
    -mw $MIN_WEIGHT \\
    -tt $TRIM_THRES \\
    $EXCLUDE_BED_FMT \\
    $LUMPY_DISC_STRING \\
    $LUMPY_SPL_STRING \\
    > $OUTPUT.sv.vcf"
fi
# call lumpy
# ($PROB_CURVE, $EXCLUDE_BED_FMT and the two evidence strings are deliberately
# unquoted: each holds zero or more separate command-line words)
$LUMPY $PROB_CURVE -t "${TEMP_DIR}/${OUTBASE}" -mw "$MIN_WEIGHT" -tt "$TRIM_THRES" \
    $EXCLUDE_BED_FMT \
    $LUMPY_DISC_STRING \
    $LUMPY_SPL_STRING \
    > "$OUTPUT.sv.vcf"

# clean up
if [[ "$KEEP" -eq 0 ]]
then
    # ":?" aborts instead of running rm with an empty path
    rm -r -- "${TEMP_DIR:?}"
fi

echo "done"

# exit cleanly
exit 0


## END SCRIPT