# Index many genomes with sourmash

In [1]:
import os
from os import path

import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

# reads_dir is the directory listed for input read files; sourmash_dir is the
# root for everything this notebook writes, with signatures under sourmash_out.
reads_dir = 'output'
sourmash_dir = 'sourmash'
sourmash_out = f'{sourmash_dir}/sigs1'

# makedirs creates sourmash_dir as the parent of sourmash_out when needed, and
# exist_ok=True makes the cell safe to re-run without a check-then-create race.
os.makedirs(sourmash_out, exist_ok=True)

def strip_end(text, suffix):
    """Return `text` with a trailing `suffix` removed.

    `text` is returned unchanged when `suffix` is empty/falsy or when `text`
    does not end with it.
    """
    # Guard first: nothing to strip, so hand the input back untouched.
    if not suffix or not text.endswith(suffix):
        return text
    return text[:-len(suffix)]

# A sample is recognised by its *_R1.fq.gz file in reads_dir; the sample name
# is that file name with the suffix removed.
sample_names = [
    strip_end(fname, '_R1.fq.gz')
    for fname in os.listdir(reads_dir)
    if fname.endswith('_R1.fq.gz')
]
sample_names[:4]

['SH14-013', 'SH10-015', 'SH12-009', 'SH14-006']

# Create all sourmash signatures

In [2]:
# Parameter string handed to `sourmash sketch dna -p` for every signature.
sourmash_params = 'k=31,scaled=5000'

# Run one sketch job per sample name; `%` is the placeholder (-I%) that GNU
# parallel replaces with each name listed after `:::`.
# Each job sketches the sample's _R1 and _R2 read files into one signature
# named after the sample (--merge %), writes it to stdout (-o -) and gzips it
# into <sourmash_out>/<sample>.sig.gz.
# stderr of the whole run is redirected to <sourmash_dir>/sigs1.stderr.
# NOTE(review): -j 32 presumably caps concurrency at 32 jobs — confirm it suits the host.
!parallel -j 32 -I% 'sourmash sketch dna -p {sourmash_params} --merge % -o - \
    {reads_dir}/%_R1.fq.gz {reads_dir}/%_R2.fq.gz | gzip > {sourmash_out}/%.sig.gz' \
    ::: {' '.join(sample_names)} 2> {sourmash_dir}/sigs1.stderr

In [3]:
# Show the beginning of the log captured from the sketching run.
!head {sourmash_dir}/sigs1.stderr

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: output/SH14-014_R1.fq.gz, output/SH14-014_R2.fq.gz
[KComputing a total of 1 signature(s).
[K... reading sequences from output/SH14-014_R1.fq.gz
[K... output/SH14-014_R1.fq.gz 294099 sequences
[K... reading sequences from output/SH14-014_R2.fq.gz
[K... output/SH14-014_R2.fq.gz 294099 sequences


In [4]:
# Report the total on-disk size of the signature directory.
!du -sh {sourmash_out}

3.0M	sourmash/sigs1


# Search the signatures for matches to a particular genome

In [5]:
# Genome file to sketch as the search query, the name given to its signature,
# and the path where the gzip-compressed signature is written.
query_genome = 'input/S_HeidelbergSL476.fasta.gz'
query_genome_name = 'S_HeidelbergSL476'
query_sketch = f'{sourmash_dir}/query/{query_genome_name}.sig.gz'

# Create the directory that will hold the query signature. makedirs also
# creates missing parents, and exist_ok=True keeps the cell re-runnable
# without a separate existence check.
os.makedirs(path.dirname(query_sketch), exist_ok=True)

# Sketch the query genome with the same parameters as the sample signatures,
# name the signature after the genome (--merge), and gzip it to query_sketch.
!sourmash sketch dna -p {sourmash_params} --merge {query_genome_name} \
    -o - {query_genome} | gzip > {query_sketch}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kcomputing signatures for files: input/S_HeidelbergSL476.fasta.gz
[KComputing a total of 1 signature(s).
[K... reading sequences from input/S_HeidelbergSL476.fasta.gz
[K... input/S_HeidelbergSL476.fasta.gz 1 sequences
[Kcalculated 1 signature for 1 sequences taken from 1 files
[Ksaved signature(s) to -. Note: signature license is CC0.


In [6]:
# k-mer size selected at search/index time. It duplicates the k=31 in
# sourmash_params, so the two must be kept in sync by hand.
kmer = 31

# Search the query signature against every per-sample signature file and save
# the matches as CSV; `time` reports how long the search takes.
!time sourmash search -k {kmer} -o {sourmash_dir}/query1.csv {query_sketch} {sourmash_out}/*.sig.gz

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=31
[Kloaded query: S_HeidelbergSL476... (k=31, DNA)
[Kloaded 59 signatures.                                                          

59 matches; showing first 3:
similarity   match
----------   -----
 15.3%       SH12-013
 14.0%       SH10-30
 14.0%       SH14-022

real	0m0.618s
user	0m0.583s
sys	0m0.032s


In [7]:
# Pretty-print the first lines of the search CSV as an aligned table.
!column -s',' -t {sourmash_dir}/query1.csv | head -n 5

similarity           name      filename                        md5
0.15312450310065193  SH12-013  sourmash/sigs1/SH12-013.sig.gz  9471ee36977ea0c9736fe1a84ad65432
0.14016341923318668  SH10-30   sourmash/sigs1/SH10-30.sig.gz   9ca0bb8a485c56012ce141a125cb9ff5
0.1398786370001556   SH14-022  sourmash/sigs1/SH14-022.sig.gz  139b51886d7b90dd481c80f6ce0d1318
0.13976809777499216  SH14-008  sourmash/sigs1/SH14-008.sig.gz  532107c9b3996812aa0b98e8ed18a98c


# Create SBT index

In [8]:
# Path of the SBT index file to build; it lives in its own index1/ directory.
sourmash_index1 = f'{sourmash_dir}/index1/index1.sbt.json'

# Build the SBT index from all per-sample signatures at the chosen k-mer size;
# `time` reports how long indexing takes.
!time sourmash index -k {kmer} {sourmash_index1} {sourmash_out}/*.sig.gz

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kloading 59 files into SBT
[Kloaded 1 sigs from 'sourmash/sigs1/SH08-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH09-29.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-014.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-015.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH10-30.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-001.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH11-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-001.sig.gz'10 sigs total
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-002.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-003.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-004.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-005.sig.gz'z'
[Kloaded 1 sigs from 'sourmash/sigs1/SH12-006.sig.

In [9]:
# Report the total on-disk size of the directory containing the SBT index.
!du -sh {path.dirname(sourmash_index1)}

5.1M	sourmash/index1


# Search SBT index

In [10]:
# Run the same query against the SBT index instead of the individual signature
# files, saving matches to a separate CSV; `time` allows a cost comparison.
!time sourmash search -k {kmer} -o {sourmash_dir}/query.index1.csv {query_sketch} {sourmash_index1}

[K
== This is sourmash version 4.0.0. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting specified query k=31
[Kloaded query: S_HeidelbergSL476... (k=31, DNA)
[Kloaded 1 databases.                                                            

59 matches; showing first 3:
similarity   match
----------   -----
 15.3%       SH12-013
 14.0%       SH10-30
 14.0%       SH14-022

real	0m1.032s
user	0m1.019s
sys	0m0.012s
