# Experiment: HowDeSBT Indexing

Now let's run HowDeSBT to index on our data.

First, let's setup some directories.

In [1]:
fastq_data_dir=data-downsampled
data_dir=kmer-counts-jellyfish
howdesbt_dir=howdesbt
kmer_size="17"

threads=1

PROJECT_DIR=`git rev-parse --show-toplevel`
cd $PROJECT_DIR

The code given below assumes you have the following [conda](https://docs.conda.io/en/latest/) environments setup to install [howdesbt](https://github.com/medvedevgroup/HowDeSBT).

```bash
conda create --name howdesbt howdesbt
```

Let's verify these commands exist (and verify versions).

In [2]:
conda run --name howdesbt howdesbt --version

version 2.00.02 20191014


## Bloom filter sizes

From step **4-kmer-cardinality-downsampled** we determined some Bloom filter sizes using the estimated kmers in union of all datasets.

In [3]:
(echo -e "data_type\tkmer_size\tunion_kmers"
for data_type in microbial metagenomics human
do
    count=`sed -e 's/Estimated number of unique exact matches: //' ${data_type}/${fastq_data_dir}/total-unique-kmers-${kmer_size}.txt | \
    awk '{print int($1+0.5)}'`
    
    echo -e "${data_type}\t${kmer_size}\t${count}"
done) | sort -k3,3n | column -s$'\t' -t

data_type     kmer_size  union_kmers
microbial     17         73106902
human         17         85719995
metagenomics  17         337057219


In [4]:
# For kmer size 9
#microbial_bits=140000
#human_bits=140000
#metagenomics_bits=140000
# For kmer size 11
#microbial_bits=2100000
#human_bits=2100000
#metagenomics_bits=2100000
# For kmer size 13
#microbial_bits=24000000
#human_bits=24000000
#metagenomics_bits=33000000
# For kmer size 15
#microbial_bits=61000000
#human_bits=72000000
#metagenomics_bits=210000000
# For kmer size 17
microbial_bits=74000000
human_bits=86000000
metagenomics_bits=340000000

## HowDeSBT bash function

In [5]:
run_howdesbt() {
    type_dir=$1
    output_dir=$2
    nkmers=$3
    
    cd ${PROJECT_DIR} # Reset ourselves
    pwd
    
    mkdir -p ${output_dir}
    rm ${output_dir}/howdesbt.*
    rm ${output_dir}/*.kmer
    
    # HowDeSBT appears to write files to the working directory
    #  So, we have to change to the proper directory.
    cd ${output_dir}
    pwd
    
    jobs=24
    
    input_dir=${type_dir}/${data_dir}/${kmer_size}
            
    before=`date +%s`

    commands_file=`mktemp`
    
    for file in ${PROJECT_DIR}/${input_dir}/*.kmer.gz
    do
        accession=`basename ${file} .kmer.gz`
        
        uncompressed_input=${accession}.kmer
        
        howdesbt_gzip_log=gzip.${accession}.log
        howdesbt_makebf_out=howdesbt.${accession}.bf
        howdesbt_makebf_log=howdesbt.${accession}.makebf.log

        command="/usr/bin/time -v gzip -d --stdout ${file} 2> ${howdesbt_gzip_log}.err 1> ${uncompressed_input} && \
            /usr/bin/time -v howdesbt makebf ${uncompressed_input} --kmersin --k=${kmer_size} \
            --threads=${threads} --hashes=1 --bits=${nkmers} --out=${howdesbt_makebf_out} \
            2> ${howdesbt_makebf_log}.err 1> ${howdesbt_makebf_log}"
        echo ${command} >> ${commands_file}
    done
    
    echo "Will run commands (howdesbt makebf) from [${commands_file}] like:"
    head -n 1 ${commands_file}
    command="parallel -j ${jobs} -a ${commands_file}"
    echo -e "\n${command}"
    conda run --name howdesbt ${command}
    
    nkmers_10_per=`echo "${nkmers}/10" | bc`
    
    howdesbt_cluster_out=howdesbt.cluster.sbt
    howdesbt_cluster_log=howdesbt.cluster.log
    
    howdesbt_build_out=howdesbt.build.sbt
    howdesbt_build_log=howdesbt.build.log
    
    echo "Now, let's run howdesbt cluster and build"
    ls howdesbt.*.bf > leafnames
    command="/usr/bin/time -v howdesbt cluster --list=leafnames --bits=${nkmers_10_per} --tree=${howdesbt_cluster_out} \
                --nodename=node{number} --keepallnodes 2> ${howdesbt_cluster_log}.err 1> ${howdesbt_cluster_log} && \
            /usr/bin/time -v howdesbt build --HowDe --tree=${howdesbt_cluster_out} --outtree=${howdesbt_build_out} \
                2> ${howdesbt_build_log}.err 1> ${howdesbt_build_log}"
            
    echo "${command}"
    conda run --name howdesbt ${command}
     
    cd ${PROJECT_DIR}
    pwd
        
    after=`date +%s`
    minutes=`echo "(${after}-${before})/60" | bc -l`
    printf "Done. Took %0.2f minutes.\n" ${minutes}
}

## Microbial HowDeSBT

In [6]:
input_dir_type="microbial"
run_howdesbt "${input_dir_type}" "${input_dir_type}/${howdesbt_dir}/${kmer_size}" "${microbial_bits}"
cd ${PROJECT_DIR}

/home/CSCScience.ca/apetkau/workspace/comp7934-project
rm: cannot remove 'microbial/howdesbt/17/howdesbt.*': No such file or directory
rm: cannot remove 'microbial/howdesbt/17/*.kmer': No such file or directory
/home/CSCScience.ca/apetkau/workspace/comp7934-project/microbial/howdesbt/17
Will run commands (howdesbt makebf) from [/tmp/tmp.1cAWVg9JRz] like:
/usr/bin/time -v gzip -d --stdout /home/CSCScience.ca/apetkau/workspace/comp7934-project/microbial/kmer-counts-jellyfish/17/ERR1144974.kmer.gz 2> gzip.ERR1144974.log.err 1> ERR1144974.kmer && /usr/bin/time -v howdesbt makebf ERR1144974.kmer --kmersin --k=17 --threads=1 --hashes=1 --bits=74000000 --out=howdesbt.ERR1144974.bf 2> howdesbt.ERR1144974.makebf.log.err 1> howdesbt.ERR1144974.makebf.log

parallel -j 24 -a /tmp/tmp.1cAWVg9JRz
Now, let's run howdesbt cluster and build
/usr/bin/time -v howdesbt cluster --list=leafnames --bits=7400000 --tree=howdesbt.cluster.sbt                 --nodename=node{number} --keepallnodes 2> howdesbt.clu

Awesome. We've build the HowDeSBT Sequence Bloom tree. Let's take a look at sizes.

In [7]:
echo ${input_dir_type}
du -kc `ls ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{howdesbt.*.bf,*.kmer,howdesbt.cluster.sbt} | grep -v 'detbrief'` | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total intermediate (KB)/' | 
    tee ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt
    
du -kc ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{*.detbrief.rrr.bf,howdesbt.build.sbt} | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total database (KB)/' | 
    tee -a ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt

microbial
3667576 total intermediate (KB)
100440 total database (KB)


## Metagenomics HowDeSBT

In [8]:
input_dir_type="metagenomics"
run_howdesbt "${input_dir_type}" "${input_dir_type}/${howdesbt_dir}/${kmer_size}" "${metagenomics_bits}"

/home/CSCScience.ca/apetkau/workspace/comp7934-project
rm: cannot remove 'metagenomics/howdesbt/17/howdesbt.*': No such file or directory
rm: cannot remove 'metagenomics/howdesbt/17/*.kmer': No such file or directory
/home/CSCScience.ca/apetkau/workspace/comp7934-project/metagenomics/howdesbt/17
Will run commands (howdesbt makebf) from [/tmp/tmp.Kx5hrVhtDH] like:
/usr/bin/time -v gzip -d --stdout /home/CSCScience.ca/apetkau/workspace/comp7934-project/metagenomics/kmer-counts-jellyfish/17/ERR1713331.kmer.gz 2> gzip.ERR1713331.log.err 1> ERR1713331.kmer && /usr/bin/time -v howdesbt makebf ERR1713331.kmer --kmersin --k=17 --threads=1 --hashes=1 --bits=340000000 --out=howdesbt.ERR1713331.bf 2> howdesbt.ERR1713331.makebf.log.err 1> howdesbt.ERR1713331.makebf.log

parallel -j 24 -a /tmp/tmp.Kx5hrVhtDH
Now, let's run howdesbt cluster and build
/usr/bin/time -v howdesbt cluster --list=leafnames --bits=34000000 --tree=howdesbt.cluster.sbt                 --nodename=node{number} --keepallnodes 2

In [9]:
echo ${input_dir_type}
du -kc `ls ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{howdesbt.*.bf,*.kmer,howdesbt.cluster.sbt} | grep -v 'detbrief'` | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total intermediate (KB)/' | 
    tee ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt
    
du -kc ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{*.detbrief.rrr.bf,howdesbt.build.sbt} | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total database (KB)/' | 
    tee -a ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt

metagenomics
9525964 total intermediate (KB)
510092 total database (KB)


## Human HowDeSBT

In [10]:
input_dir_type="human"
run_howdesbt "${input_dir_type}" "${input_dir_type}/${howdesbt_dir}/${kmer_size}" "${human_bits}"

/home/CSCScience.ca/apetkau/workspace/comp7934-project
rm: cannot remove 'human/howdesbt/17/howdesbt.*': No such file or directory
rm: cannot remove 'human/howdesbt/17/*.kmer': No such file or directory
/home/CSCScience.ca/apetkau/workspace/comp7934-project/human/howdesbt/17
Will run commands (howdesbt makebf) from [/tmp/tmp.OiUgdLPGfg] like:
/usr/bin/time -v gzip -d --stdout /home/CSCScience.ca/apetkau/workspace/comp7934-project/human/kmer-counts-jellyfish/17/SRR038300.kmer.gz 2> gzip.SRR038300.log.err 1> SRR038300.kmer && /usr/bin/time -v howdesbt makebf SRR038300.kmer --kmersin --k=17 --threads=1 --hashes=1 --bits=86000000 --out=howdesbt.SRR038300.bf 2> howdesbt.SRR038300.makebf.log.err 1> howdesbt.SRR038300.makebf.log

parallel -j 24 -a /tmp/tmp.OiUgdLPGfg
Now, let's run howdesbt cluster and build
/usr/bin/time -v howdesbt cluster --list=leafnames --bits=8600000 --tree=howdesbt.cluster.sbt                 --nodename=node{number} --keepallnodes 2> howdesbt.cluster.log.err 1> howdesb

In [11]:
echo ${input_dir_type}
du -kc `ls ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{howdesbt.*.bf,*.kmer,howdesbt.cluster.sbt} | grep -v 'detbrief'` | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total intermediate (KB)/' | 
    tee ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt
    
du -kc ${input_dir_type}/${howdesbt_dir}/${kmer_size}/{*.detbrief.rrr.bf,howdesbt.build.sbt} | 
    grep 'total' | 
    sed -e 's/\ttotal$/ total database (KB)/' | 
    tee -a ${input_dir_type}/${howdesbt_dir}/${kmer_size}/howdesbt-total-disk.txt

human
3115964 total intermediate (KB)
132584 total database (KB)
