## Downloading FASTA files with sra-tools

In [6]:
import sys
from pathlib import Path
# vseek imports
sys.path.append("../")
from vseek.utils.sra_callers import download_fasta
from vseek.common.errors import FastaFileNotFound

In [42]:
# Run identifiers handed to download_fasta in a single call.
# NOTE(review): these look like SRA run accessions -- confirm against
# vseek.utils.sra_callers.download_fasta.
srr = [
    "SRR12432009",
    "SRR12464727",
]
download_fasta(srr)

	

Prefetching SRR data...


In [38]:
class FastaReadRecord:
    """Lightweight container describing a single sequence read.

    Instances only store the values they are given; nothing is validated
    or derived. ``__slots__`` restricts instances to the four attributes
    below and avoids a per-instance ``__dict__``.

    Attributes
    ----------
    srr_id
        Which SRR file the read came from.
    fragment_id
        Which fragment the read is.
    sequence
        The read's sequence.
    length
        Length of the sequence, as supplied by the caller.
    """

    __slots__ = ("srr_id", "fragment_id", "sequence", "length")

    def __init__(self, srr_id, fragment_id, sequence, length):
        # Plain storage, grouped as identifiers first and payload second.
        self.srr_id, self.fragment_id = srr_id, fragment_id
        self.sequence, self.length = sequence, length


class SequenceIO:
    """Reader that streams the records of a single FASTA file.

    Parameters
    ----------
    sequence_path : str or Path
        Location of the FASTA file to read.

    Raises
    ------
    FastaFileNotFound
        If ``sequence_path`` does not point to an existing file.
    """

    def __init__(self, sequence_path):
        # Fail early, before any attempt to open the file.
        if not Path(sequence_path).is_file():
            raise FastaFileNotFound(f"Unable to find {sequence_path}")

        self.sequence_path = sequence_path

    def lazy_load_fasta(self):
        """Lazily yield every FASTA record in the file as a FastaReadRecord.

        The file is read line by line, so only the lines of the record
        currently being assembled are held in memory. A record starts at
        a line beginning with ``>`` and runs until the next such line or
        the end of the file. Blank lines are ignored, and any text that
        appears before the first ``>`` header is skipped because it does
        not belong to a record.

        Yields
        ------
        FastaReadRecord
            One record per ``>`` header, in file order, including the
            final record of the file.
        """
        with open(self.sequence_path, "r") as fasta_file:
            entry = []
            for line in fasta_file:
                # strip() already removes the trailing newline
                cleaned_line = line.strip()

                # blank lines carry no header or sequence data
                if not cleaned_line:
                    continue

                if cleaned_line.startswith(">"):
                    # a new header closes the record collected so far
                    if entry:
                        yield self._convert(entry)
                    entry = [cleaned_line]
                elif entry:
                    entry.append(cleaned_line)
                # otherwise: sequence text before the first header; skipped

            # the last record has no following header to trigger its
            # emission, so it must be flushed explicitly
            if entry:
                yield self._convert(entry)

    def _convert(self, entry: list[str]) -> "FastaReadRecord":
        """Convert the lines of one FASTA entry into a FastaReadRecord.

        Parameters
        ----------
        entry : list[str]
            The header line followed by the record's sequence lines.

        Returns
        -------
        FastaReadRecord
            ``srr_id`` is the first whitespace-separated token of the
            header, stored as-is (so it keeps the leading ``>``).
            ``fragment_id`` is the second token, or ``None`` when the
            header has only one token. ``sequence`` is the sequence lines
            joined together and ``length`` is its length.
        """
        header_fields = entry[0].split()
        header_id = header_fields[0]
        # headers consisting of a single token have no fragment id
        frag_id = header_fields[1] if len(header_fields) > 1 else None
        sequence = "".join(entry[1:])

        return FastaReadRecord(
            srr_id=header_id,
            fragment_id=frag_id,
            sequence=sequence,
            length=len(sequence),
        )

In [41]:
# Stream the reads of one FASTA file instead of loading them all at once.
sio = SequenceIO(sequence_path="../results/fasta_files/SRR12464727.fasta")

# Keep an explicit counter: the enumerate index of the last record is one
# less than the number of records, and it would be undefined if the file
# yielded no records at all.
total_reads = 0
for idx, r in enumerate(sio.lazy_load_fasta()):
    # print a sample record every million reads
    if idx % 1000000 == 0:
        print(r.fragment_id, r.sequence)
    total_reads = idx + 1

print(f"total amount of reads: {total_reads}")

1 NGTGGGGGTGGGGGGGAGGGCGCGCGACCCCGGTCGGCGCGCCCCGCTTCTTCGGTTCCCGCCTCCTCCCCGTTCACCGCCGGGGCGGCTCGTCCGCTCCGGGCCGGGACGGGGTCCGGGGAGCGTGGTTTGGGAGCCGCGGAGGCGGCC
1000001 ATTCTCATTGGCGTGAGTGTCAGTAAGTTTGGCTGGTGCAGGGGTACTATATGTAGCAATAACGCTACAACGACGAGCCACATCCAAAAATCCATCAGCCGCACTGGCTGTACAAAAGATTTTTATATGTGGAATATAATATCTAATAAC
2000001 CCCGTCCGTCCTTCCGTTCGTCTTCCTCCCTCCCGGCCTCTCCCGCCGACCGCGGGCGTGGTGGTGGGGGTGGGGGGGAGGGCGCGCGACCCCGGTCGGCGCGCCCCGCTTCTTCGGTTCCCGCCTCCTCCCCGTTCACCGCCGGGGCGG
3000001 TTCTGGATACATGTGTATGTTCTGCATCTTAATTTATCTTCCAGATCTTGGAGTGATGTAGTTGCTGGTGGAACTTCACTGCTAGTAACACTTGTAAAATCTTCATTCTGTGAGGAAGTCCACATCTGTCTTGTGCTATTGTCTGCTCTC
4000001 CAGGGAAAATAAGTTGCTCTTTAACACCTAATGTGTAGTTCCCACGACCATCAAATGATTTCTTAGAAACACCACGGAAATCACGTACACGTGGTAAAGAAACTGATACTAATTTGTCGAAGAACTCATACATTTTCTCGCCACGTAAAG
5000001 GGTGGGGGTGGGGGGGAGGGCGCGCGACCCCGGTCGGCGCGCCCCGCTTCTTCGGTTCCCGCCTCCTCCCCGTTCACCGCCGGGGCGGCTCGTCCGCTCCGGGCCGGGGCGGGGTCCGGGGAGCGTGGTTTGGGAGCCGCGGAGGCGGCC
6000001 CCGGGACGGGGACCGGCGGGCCACGGGCCCGGCTCGGCGCGGCC