In [7]:
import sys
import json
from collections import defaultdict
import numpy as np

# vseek imports
sys.path.append("../")
import vseek.common.vseek_paths as vsp
from vseek.common.loader import load_genes_metadata, load_genome, load_viral_genes
from vseek.utils.sequence_io import SequenceIO
from vseek.utils.sra_callers import download_fasta
from vseek.common.io_files import get_viral_genome_fasta_paths, get_genome_genes_paths, get_meta_genomes_paths
from vseek.utils.vseek_analysis import dynamic_hamming


In [3]:
# Load the lookup tables used by the rest of the notebook.
# `viral_genome_paths` must support `.keys()`; its keys are kept as
# `all_accessions` (presumably viral accession IDs mapped to genome FASTA
# paths -- TODO confirm against get_viral_genome_fasta_paths).
# NOTE(review): `viral_genes_paths` is not referenced again in the cells
# visible here.
viral_genome_paths = get_viral_genome_fasta_paths()
viral_genes_paths = get_genome_genes_paths()
all_accessions = list(viral_genome_paths.keys())

In [4]:
# Download the metagenome FASTA file for a single SRA run accession.
# The bare call on the last line is what the cell displays: in the recorded
# run it skipped the download because the .fasta file already existed and
# returned the path of the fasta_files results directory.
srr_id = "SRR12464727"
download_fasta(srr_id)

Prefetching SRR12464727 data...
Downloading fasta files
SRR12464727.fasta already exists... skipping


'/Users/erikserrano/Development/prelim/prelim3/VSeek/results/fasta_files'

In [3]:
# Per the original author's note: get_meta_genomes_paths() returns a single
# str when only one metagenome file is present and a list when there are
# several. Only one file exists here, so a single path is handed to SequenceIO.
# NOTE(review): the parameters cell further down repeats these two
# assignments, so this cell is redundant when the notebook is run top to bottom.
metagenome_path = get_meta_genomes_paths()
reader = SequenceIO(metagenome_path)

In [9]:
# parameters
all_accessions = list(viral_genome_paths.keys())
metagenome_path = get_meta_genomes_paths()
reader = SequenceIO(metagenome_path)
reads = reader.lazy_load_fasta()
threshold = 0.33  # minimum dynamic_hamming score for a read to count as a hit
max_reads = 5  # only the first `max_reads` reads are scanned
max_accessions = 10  # only the first `max_accessions` accessions are screened

# The gene sequences of an accession do not depend on the read being scored,
# so they are loaded at most once per accession (on first use) instead of
# once for every (read, accession) pair.
genes_by_accession = {}

# counts[acc_id] = number of reads that matched at least one gene of acc_id
counts = defaultdict(int)
for idx, read in enumerate(reads):
    if idx == max_reads:
        break

    for acc_id in all_accessions[:max_accessions]:
        if acc_id not in genes_by_accession:
            genes_by_accession[acc_id] = list(load_viral_genes(acc_id))

        # best score of this read against any gene of the accession
        top_score = 0.0
        for gene in genes_by_accession[acc_id]:
            score = dynamic_hamming(read=read.sequence, reference=gene)
            if score >= threshold:
                if score == 1:
                    # perfect match: no other gene can score higher
                    top_score = score
                    break
                top_score = max(top_score, score)

        # the read counts towards this accession when its best gene score
        # reaches the threshold (a perfect match always does)
        if top_score >= threshold:
            counts[acc_id] += 1

        # a perfect match settles the read: skip the remaining accessions
        if top_score == 1:
            break

with open("./viral_composition_counts.json", "w") as outfile:
    json.dump(counts, outfile)