# Test COBS index (Python interface)

Test creating and querying a COBS index using the Python interface (<https://bingmann.github.io/cobs-python-docs>).

In [8]:
import cobs_index as cobs
import time
from pathlib import Path
import os

# Print the installed cobs_index version so this run can be traced to a library release.
print(cobs.__version__)

0.1.2


In [13]:
# Build a COBS document list from the concatenated-reads directory and
# print the name and size COBS reports for every document it contains.
reads_path = Path('..', 'output', 'reads_cat')
doclist1 = cobs.DocumentList(str(reads_path))
for idx, doc in enumerate(doclist1):
    print("doc[{}] name {} size {}".format(idx, doc.name, doc.size))

doc[0] name SH08-001 size 156246044
doc[1] name SH09-29 size 156247419
doc[2] name SH10-001 size 156150631
doc[3] name SH10-002 size 156046700
doc[4] name SH10-014 size 155880939
doc[5] name SH10-015 size 155777535
doc[6] name SH10-30 size 155237675
doc[7] name SH11-001 size 156269496
doc[8] name SH11-002 size 155952361
doc[9] name SH12-001 size 156573304
doc[10] name SH12-002 size 156642596
doc[11] name SH12-003 size 156096800
doc[12] name SH12-004 size 155802051
doc[13] name SH12-005 size 156869121
doc[14] name SH12-006 size 156284420
doc[15] name SH12-007 size 155926240
doc[16] name SH12-008 size 156173556
doc[17] name SH12-009 size 156443250
doc[18] name SH12-010 size 156219394
doc[19] name SH12-011 size 156703358
doc[20] name SH12-012 size 156135713
doc[21] name SH12-013 size 156241245
doc[22] name SH12-014 size 155945965
doc[23] name SH13-001 size 156550918
doc[24] name SH13-002 size 156415005
doc[25] name SH13-003 size 156089873
doc[26] name SH13-004 size 156430995
doc[27] name 

In [21]:
import glob

# Build a compact COBS index from `doclist1` and report how long it took.
index_path = Path('salmonella.cobs_compact')
# Delete any index left over from a previous run so the build starts clean.
if index_path.exists():
    index_path.unlink()

# Remove cache files so that we re-construct a new index each time
for cache_file in glob.glob(f'{reads_path}/*.cobs_cache'):
    os.remove(cache_file)

params = cobs.CompactIndexParameters()
params.term_size = 31               # k-mer size
params.false_positive_rate = 0.1    # higher false positive rate -> smaller index
params.keep_temporary = False
params.num_threads = 8

# perf_counter() is monotonic, so the measured interval is not distorted by
# system clock adjustments during this multi-minute build (time.time() can be).
start_time = time.perf_counter()
cobs.compact_construct_list(doclist1, str(index_path), index_params=params)
end_time = time.perf_counter()

print(f'Took {(end_time - start_time)/60:0.2f} minutes')

Took 5.43 minutes


In [22]:
# Show the size of the built index file next to the total size of the input
# reads directory so the two can be compared.
!ls -lrth {index_path}
!du -sh {reads_path}

-rw-r--r-- 1 apetkau grp_apetkau 4.6G Mar  9 15:12 salmonella.cobs_compact
7.3G	../output/reads_cat


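## Find nucleotide sequence to search

Pick a query sequence and BLAST it against every assembly. The samples with an exact, full-length hit are the expected matches for the COBS search below.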

In [130]:
import tempfile
import pandas as pd

query = 'TCGGTGTACTGGGTTTCACCCAGACGGTCATGGAGACGCGTCATGAACAGCGTATTTGCCGCATAGTTGTTGGCAAGATAGCTGCCCACTTCCGGACGGAACTGATGTTCTGTTATCGGCGGTGTGTCAGATTGCCCTATGTCAGGAATTGTCGGATCAGGGAT'
# query = 'GAATTAGTTGAAAATAAATATATCGCCAGCAGCACATGAA'
blast_columns = 'qseqid qstart qend sseqid sstart send pident length'
blast_out_format = f'6 {blast_columns}'

assembly_dir = Path('..') / 'output' / 'assemblies'
assembly_files = glob.glob(f'{assembly_dir}/*.fasta')

tmp_assembly = tempfile.TemporaryDirectory()
query_file = f'{tmp_assembly.name}/query.fasta'

with open(query_file, 'w') as of:
    of.write('>query\n')
    of.write(query)

results_frames = []
for f in assembly_files:
    name = os.path.basename(f)
    blast_out = f'{tmp_assembly.name}/{name}.blast'
    out_path = f'{tmp_assembly.name}/{name}'
    os.symlink(os.path.abspath(f), out_path)
    
    !makeblastdb -in {out_path} -dbtype nucl -parse_seqids > /dev/null
    !blastn -db {out_path} -query {query_file} -outfmt '{blast_out_format}' > {blast_out}
    
    df = pd.read_csv(blast_out, sep='\t', names=blast_columns.split(' '))
    df['sample'] = name
    results_frames.append(df)
        
results_df = pd.concat(results_frames)
    
print(tmp_assembly.name)
results_df
tmp_assembly.cleanup()

/tmp/tmp2r1f0q7t


In [131]:
import re

# Keep only hits with 100% identity whose alignment length equals the query
# length, then reduce them to the set of sample names.
exact_hits = results_df[(results_df['pident'] == 100) & (results_df['length'] == len(query))]
# Anchor the pattern with `$` so only a trailing ".fasta" extension is removed;
# the unanchored pattern would also strip ".fasta" from the middle of a name.
matches_blast = {re.sub(r'\.fasta$', '', sample) for sample in exact_hits['sample']}
len(matches_blast)

45

## Search in COBS

In [132]:
# Query the COBS index file and compare the documents it reports with the
# BLAST-derived set of sample names.
# NOTE(review): threshold=1.0 presumably requires every query k-mer to be
# present; confirm against the cobs documentation.
search_index = cobs.Search('salmonella.cobs_compact')
results_cobs = search_index.search(query, threshold=1.0)
# Element [1] of each result is what gets compared with the BLAST sample names.
matches_cobs = set(hit[1] for hit in results_cobs)
print(f'COBs matches ({len(matches_cobs)}) and BLAST matches ({len(matches_blast)}) are equal: {matches_cobs == matches_blast}')

COBs matches (45) and BLAST matches (45) are equal: True


In [133]:
# Dump the raw search results; each entry appears to be a (score, document name)
# pair. TODO confirm the meaning of the first element against the cobs docs.
print(results_cobs)

[(134, 'SH09-29'), (134, 'SH10-002'), (134, 'SH10-014'), (134, 'SH10-015'), (134, 'SH10-30'), (134, 'SH11-002'), (134, 'SH12-012'), (134, 'SH12-013'), (134, 'SH12-014'), (134, 'SH13-001'), (134, 'SH13-002'), (134, 'SH13-003'), (134, 'SH13-004'), (134, 'SH13-005'), (134, 'SH13-006'), (134, 'SH13-007'), (134, 'SH13-008'), (134, 'SH14-001'), (134, 'SH14-002'), (134, 'SH14-003'), (134, 'SH14-004'), (134, 'SH14-005'), (134, 'SH14-006'), (134, 'SH14-007'), (134, 'SH14-008'), (134, 'SH14-009'), (134, 'SH14-010'), (134, 'SH14-011'), (134, 'SH14-012'), (134, 'SH14-013'), (134, 'SH14-014'), (134, 'SH14-015'), (134, 'SH14-016'), (134, 'SH14-017'), (134, 'SH14-018'), (134, 'SH14-019'), (134, 'SH14-020'), (134, 'SH14-021'), (134, 'SH14-022'), (134, 'SH14-023'), (134, 'SH14-024'), (134, 'SH14-025'), (134, 'SH14-026'), (134, 'SH14-027'), (134, 'SH14-028')]
