# Index many genomes with sourmash

In [1]:
import os
from os import path

import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

reads_dir = 'output'
sourmash_dir = 'sourmash'
sourmash_out = f'{sourmash_dir}/sigs1'

if not path.exists(sourmash_dir):
    os.mkdir(sourmash_dir)
    
if not path.exists(sourmash_out):
    os.mkdir(sourmash_out)

def strip_end(text, suffix):
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text

sample_names = [strip_end(f, '_R1.fq.gz') for f in os.listdir(reads_dir) if f.endswith('_R1.fq.gz')]
sample_names[:4]

['SH14-013', 'SH10-015', 'SH12-009', 'SH14-006']

# Create all sourmash signatures

In [2]:
kmer_small = 21
kmer_medium = 31
kmer_large = 51
scaled=1000
sourmash_params = f'k={kmer_small},k={kmer_medium},k={kmer_large},scaled={scaled}'

!parallel -j 32 -I% 'sourmash sketch dna -p {sourmash_params} --merge % -o - \
    {reads_dir}/%_R1.fq.gz {reads_dir}/%_R2.fq.gz | gzip > {sourmash_out}/%.sig.gz' \
    ::: {' '.join(sample_names)} 2> {sourmash_dir}/sigs1.stderr

In [3]:
!head {sourmash_dir}/sigs1.stderr

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: output/SH11-001_R1.fq.gz, output/SH11-001_R2.fq.gz
[KComputing a total of 1 signature(s).
[K... reading sequences from output/SH11-001_R1.fq.gz
[K... output/SH11-001_R1.fq.gz 293397 sequences
[K... reading sequences from output/SH11-001_R2.fq.gz
[K... output/SH11-001_R2.fq.gz 293397 sequences


In [4]:
!du -sh {sourmash_out}

44M	sourmash/sigs1


In [5]:
!sourmash sig describe {sourmash_out}/SH14-013.sig.gz

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

---...reading from file 'sourmash/sigs1/SH14-013.sig.gz'
signature filename: sourmash/sigs1/SH14-013.sig.gz
signature: SH14-013
source file: output/SH14-013_R2.fq.gz
md5: 249c53a79fe0d1fa8fcc61fedcfb4c7d
k=21 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 25322
signature license: CC0

---
signature filename: sourmash/sigs1/SH14-013.sig.gz
signature: SH14-013
source file: output/SH14-013_R2.fq.gz
md5: 152628e3436e58f3fa4f9888bbce526d
k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 31987
signature license: CC0

---
signature filename: sourmash/sigs1/SH14-013.sig.gz
signature: SH14-013
source file: output/SH14-013_R2.fq.gz
md5: 740d31e5a4282cde203d17a75d7e4463
k=51 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 41123
signature license: CC0

[Kloaded 3 sigs from 'sourmash/sigs1/SH14-013.sig.gz'
[Kloaded 3 signatures total.


# Search through them for matches to a particular genome

In [6]:
query_genome = 'input/S_HeidelbergSL476.fasta.gz'
query_genome_name = 'S_HeidelbergSL476'
query_sketch = f'{sourmash_dir}/query/{query_genome_name}.sig.gz'

if not path.exists(f'{sourmash_dir}/query'):
    os.mkdir(f'{sourmash_dir}/query')

!sourmash sketch dna -p {sourmash_params} --merge {query_genome_name} \
    -o - {query_genome} | gzip > {query_sketch}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: input/S_HeidelbergSL476.fasta.gz
[KComputing a total of 1 signature(s).
[K... reading sequences from input/S_HeidelbergSL476.fasta.gz
[K... input/S_HeidelbergSL476.fasta.gz 1 sequences
[Kcalculated 1 signature for 1 sequences taken from 1 files
[Ksaved signature(s) to -. Note: signature license is CC0.


In [7]:
!time sourmash search -k {kmer_large} -o {sourmash_dir}/query1.csv {query_sketch} {sourmash_out}/*.sig.gz

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=51
[Kloaded query: S_HeidelbergSL476... (k=51, DNA)
[Kloaded 59 signatures.                                                          

59 matches; showing first 3:
similarity   match
----------   -----
 11.5%       SH12-013
 10.3%       SH12-014
 10.1%       SH14-008

real	0m6.489s
user	0m6.431s
sys	0m0.045s


In [8]:
!column -s',' -t {sourmash_dir}/query1.csv | head -n 5

similarity           name      filename                        md5
0.11508932706134685  SH12-013  sourmash/sigs1/SH12-013.sig.gz  a4c34ce5ccf57767d0a1d4392318b243
0.10308603037507885  SH12-014  sourmash/sigs1/SH12-014.sig.gz  abdd8a58c3cd446be16b0fca08f8679d
0.10140947752126367  SH14-008  sourmash/sigs1/SH14-008.sig.gz  5294517198a67fea1e2e92456ab31ff8
0.10103351685501787  SH10-30   sourmash/sigs1/SH10-30.sig.gz   324430a43b32ed760a89832e6c9c9239


# Create SBT index

In [9]:
sourmash_index1 = f'{sourmash_dir}/index1/index1.sbt.json'

!time sourmash index -k {kmer_small} {sourmash_index1} {sourmash_out}/*.sig.gz

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 59 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH08-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH09-29.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-30.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-001.sig.gz'10 sigs total
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-004.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-005.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-006.sig.

In [10]:
!du -sh {path.dirname(sourmash_index1)}

15M	sourmash/index1


# Search SBT index

In [11]:
!time sourmash search -k {kmer_small} -o {sourmash_dir}/query.index1.csv {query_sketch} {sourmash_index1}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

59 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.9%       SH14-021

real	0m3.576s
user	0m3.530s
sys	0m0.036s


# Test incrementally adding new genomes to SBT index

# Case: 10 genomes

In [34]:
index_update10 = f'{sourmash_dir}/index-update/index10.sbt.json'

inc_kmer = 21

sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[:10]]

!time sourmash index -k {inc_kmer} {index_update10} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-007.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-008.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH13-008.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-update/index10.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-upd

In [35]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.index10.csv {query_sketch} {index_update10}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

10 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.7%       SH14-006
 17.6%       SH14-013

real	0m0.788s
user	0m0.770s
sys	0m0.016s


# Case: Add 20 genomes

In [36]:
index_update20 = f'{sourmash_dir}/index-update/index20.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[10:20]]
!time sourmash index -k {inc_kmer} {index_update20} {index_update10} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 11 files into SBT
[Kloaded 10 sigs from 'sourmash/index-update/index10.sbt.json'0 sigs total
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-016.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-020.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-004.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-005.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-012.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-011.sig.gz'20 sigs total
[K
[Kloaded 20 sigs; saving SBT under "sourmash/index-update/index20.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apet

In [37]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.index20.csv {query_sketch} {index_update20}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

20 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.8%       SH12-014

real	0m1.379s
user	0m1.352s
sys	0m0.025s


In [46]:
!du -sh {sourmash_dir}/index-update/.[^.]*

2.4M	sourmash/index-update/.sbt.index10
4.8M	sourmash/index-update/.sbt.index20


# Case: 30 genomes

In [47]:
index_update30 = f'{sourmash_dir}/index-update/index30.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[20:30]]
!time sourmash index -k {inc_kmer} {index_update30} {index_update20} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 11 files into SBT
[Kloaded 20 sigs from 'sourmash/index-update/index20.sbt.json'0 sigs total
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-024.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-025.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-019.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-026.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-30.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-010.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-001.sig.gz'30 sigs total
[K
[Kloaded 30 sigs; saving SBT under "sourmash/index-update/index30.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetk

In [48]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.index30.csv {query_sketch} {index_update30}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

30 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.8%       SH14-025

real	0m1.959s
user	0m1.918s
sys	0m0.036s


In [49]:
!du -sh {sourmash_dir}/index-update/.[^.]*

2.4M	sourmash/index-update/.sbt.index10
4.8M	sourmash/index-update/.sbt.index20
7.2M	sourmash/index-update/.sbt.index30


# Appending to SBT

## Case: Initial 10 genomes

In [65]:
index_append = f'{sourmash_dir}/index-append/index.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[:10]]

!time sourmash index -k {inc_kmer} {index_append} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-007.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-008.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH13-008.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-append/index.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-appen

In [66]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.indexappend10.csv {query_sketch} {index_append}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

10 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.7%       SH14-006
 17.6%       SH14-013

real	0m0.798s
user	0m0.744s
sys	0m0.052s


In [67]:
!du -sh {sourmash_dir}/index-append/

2.4M	sourmash/index-append/


## Append: 20 genomes

In [68]:
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[10:20]]

!time sourmash index -k {inc_kmer} --append {index_append} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-016.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-020.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-004.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-005.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-012.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-011.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-append/index.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-appen

In [69]:
!sourmash sig describe {index_append} | tail

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 20 sigs from 'sourmash/index-append/index.sbt.json'0 sigs total
[Kloaded 20 signatures total.

---
signature filename: sourmash/index-append/index.sbt.json
signature: SH14-011
source file: output/SH14-011_R2.fq.gz
md5: 63f227b890cb046953c0e7c076dd07d0
k=21 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 25433
signature license: CC0



In [70]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.indexappend20.csv {query_sketch} {index_append}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

20 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.8%       SH12-014

real	0m1.367s
user	0m1.351s
sys	0m0.012s


In [71]:
!du -sh {sourmash_dir}/index-append/

5.3M	sourmash/index-append/


## Append: 30 genomes

In [72]:
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[20:30]]

!time sourmash index -k {inc_kmer} --append {index_append} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-024.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-025.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-019.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-026.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-30.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-010.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-001.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-append/index.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-append

In [73]:
!sourmash sig describe {index_append} | tail

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 30 sigs from 'sourmash/index-append/index.sbt.json'0 sigs total
[Kloaded 30 signatures total.

---
signature filename: sourmash/index-append/index.sbt.json
signature: SH10-001
source file: output/SH10-001_R2.fq.gz
md5: 2ff9ffcb3f0ef4658e7702e83951a3a0
k=21 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 25296
signature license: CC0



In [74]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.indexappend30.csv {query_sketch} {index_append}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

30 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.8%       SH14-025

real	0m1.942s
user	0m1.905s
sys	0m0.033s


In [75]:
!du -sh {sourmash_dir}/index-append/

8.3M	sourmash/index-append/


# Merging SBTs

In [76]:
index_merge1 = f'{sourmash_dir}/index-merge/index1.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[:10]]

!time sourmash index -k {inc_kmer} {index_merge1} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-013.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-007.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-008.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH13-008.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-merge/index1.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-merge

In [77]:
index_merge2 = f'{sourmash_dir}/index-merge/index2.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[10:20]]

!time sourmash index -k {inc_kmer} {index_merge2} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-016.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-006.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-020.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-004.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-005.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-012.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-011.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-merge/index2.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-merge

In [78]:
index_merge3 = f'{sourmash_dir}/index-merge/index3.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[20:30]]

!time sourmash index -k {inc_kmer} {index_merge3} {' '.join(sig_files)}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 10 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-024.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-025.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-019.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-009.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-026.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-30.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH14-010.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-001.sig.gz'10 sigs total
[K
[Kloaded 10 sigs; saving SBT under "sourmash/index-merge/index3.sbt.json"
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-merge/

## Merge SBTs

In [79]:
index_merge = f'{sourmash_dir}/index-merge/index.sbt.json'
sig_files = [f'{sourmash_out}/{n}.sig.gz' for n in sample_names[20:30]]

!time sourmash sbt_combine {index_merge} {index_merge1} {index_merge2} {index_merge3}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcombining 3 SBTs
[Ksaving SBT under "sourmash/index-merge/index.sbt.json".
[KFinished saving nodes, now saving SBT index file.
[KFinished saving SBT index, available at /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/sourmash/index-merge/index.sbt.json


real	0m1.354s
user	0m1.308s
sys	0m0.045s


In [80]:
!sourmash sig describe {index_merge} | tail

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloaded 30 sigs from 'sourmash/index-merge/index.sbt.json'0 sigs total
[Kloaded 30 signatures total.

---
signature filename: sourmash/index-merge/index.sbt.json
signature: SH14-011
source file: output/SH14-011_R2.fq.gz
md5: 63f227b890cb046953c0e7c076dd07d0
k=21 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0
size: 25433
signature license: CC0



In [81]:
!time sourmash search -k {inc_kmer} -o {sourmash_dir}/query.indexmerge.csv {query_sketch} {index_merge}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=21
[Kloaded query: S_HeidelbergSL476... (k=21, DNA)
[Kloaded 1 databases.                                                            

30 matches; showing first 3:
similarity   match
----------   -----
 18.6%       SH12-013
 17.9%       SH14-016
 17.8%       SH14-025

real	0m1.900s
user	0m1.857s
sys	0m0.040s


In [82]:
!du -sh {sourmash_dir}/index-merge/.[^.]*

7.2M	sourmash/index-merge/.sbt.index
2.4M	sourmash/index-merge/.sbt.index1
2.4M	sourmash/index-merge/.sbt.index2
2.4M	sourmash/index-merge/.sbt.index3
