In [1]:
from collections import defaultdict
from Bio import SeqIO
import dask
from dask import delayed, compute

In [2]:
def extract_kmers(sequence, k):
    """Tabulate every length-``k`` window of ``sequence`` with its flanking symbols.

    Args:
        sequence: The read to scan (a string in this notebook).
        k: Window (k-mer) length.

    Returns:
        dict mapping each distinct k-mer to a record with three keys:
        ``'count'`` (number of occurrences), ``'predecessors'`` (set of the
        symbols seen immediately before an occurrence) and ``'successors'``
        (set of the symbols seen immediately after one). A sequence shorter
        than ``k`` produces an empty dict.
    """
    seq_len = len(sequence)
    table = {}

    for start in range(seq_len - k + 1):
        stop = start + k
        window = sequence[start:stop]
        # Flanking symbols; None marks a window that touches an end of the read.
        before = None if start <= 0 else sequence[start - 1]
        after = None if stop >= seq_len else sequence[stop]

        record = table.setdefault(
            window, {'count': 0, 'predecessors': set(), 'successors': set()}
        )
        record['count'] += 1

        if before:
            record['predecessors'].add(before)
        if after:
            record['successors'].add(after)

    return table

In [3]:
def parse_dataset_and_get_kmers(filepath, k):
    """Read a FASTQ file and accumulate k-mer statistics over all of its reads.

    The file is consumed four lines at a time (header, sequence, separator,
    quality); only the sequence line is used. Each read is passed to
    ``extract_kmers`` and the per-read results are merged into one table.

    Args:
        filepath: Path of the FASTQ file to read (opened in text mode).
        k: k-mer length forwarded to ``extract_kmers``.

    Returns:
        dict mapping each k-mer to ``{'count': int, 'predecessors': set,
        'successors': set}``, summed/unioned across every read in the file.

    Raises:
        OSError: If ``filepath`` cannot be opened.
    """
    kmer_frequencies = {}
    with open(filepath, 'r') as file:
        while True:
            raw_header = file.readline()
            if not raw_header:
                # readline() returns '' only at end of file.
                break
            if not raw_header.strip():
                # A blank line is not EOF: skip it rather than silently
                # discarding every read that follows it.
                continue

            sequence = file.readline().strip()
            # Consume the separator and quality lines to stay aligned on
            # record boundaries; their contents are not used.
            file.readline()
            file.readline()

            for kmer, data in extract_kmers(sequence, k).items():
                entry = kmer_frequencies.setdefault(
                    kmer, {'count': 0, 'predecessors': set(), 'successors': set()}
                )
                entry['count'] += data['count']
                entry['predecessors'].update(data['predecessors'])
                entry['successors'].update(data['successors'])

    return kmer_frequencies

In [4]:
def remove_tips(graph, min_len=26):
    """Delete dead-end k-mers from ``graph`` in place.

    An entry is removed when it has an empty ``'successors'`` set or an empty
    ``'predecessors'`` set, and the k-mer string itself is shorter than
    ``min_len``. Note that the length test is applied to the k-mer key, not
    to the length of any path through the graph.

    Args:
        graph: dict mapping k-mer -> record holding ``'successors'`` and
            ``'predecessors'`` collections; modified in place.
        min_len: Exclusive upper bound on the k-mer length for removal.

    Returns:
        None.
    """
    doomed = []

    # Collect first, delete afterwards: a dict cannot change size while it
    # is being iterated.
    for kmer, data in graph.items():
        connected_both_ways = len(data['successors']) > 0 and len(data['predecessors']) > 0
        if connected_both_ways:
            continue
        if not (len(kmer) < min_len):
            continue
        doomed.append(kmer)

    for kmer in doomed:
        del graph[kmer]

In [8]:
def find_successor(kmer, adjacency_list):
    """Return the entry stored for ``kmer`` in ``adjacency_list``, or ``None`` if absent."""
    successor = adjacency_list.get(kmer)
    return successor

@delayed
def construct_genome_from_kmers(kmers, adjacency_list):
    """Extend the first k-mer of ``kmers`` into a longer string.

    Starting from ``kmers[0]``, the last ``k`` characters of the growing
    string (``k`` being the length of ``kmers[0]``) are looked up in
    ``adjacency_list`` and the final character of the value found is
    appended. At most ``len(kmers) - 1`` characters are added; the walk stops
    early when the lookup yields nothing.

    The function is wrapped with ``dask.delayed``, so calling it builds a
    task that is evaluated when passed to ``compute``.

    Args:
        kmers: Sequence of k-mer strings; only the first one seeds the walk,
            the rest only determine the maximum number of extension steps.
        adjacency_list: dict mapping a k-mer to its successor (a string whose
            last character is the symbol to append).

    Returns:
        The extended string, or ``""`` when ``kmers`` is empty.
    """
    if not kmers:
        # Nothing to seed the walk with; indexing kmers[0] would raise IndexError.
        return ""

    genome = kmers[0]
    k = len(genome)  # k-mer length is fixed for the whole walk

    for _ in range(len(kmers) - 1):
        # The trailing k characters of the string are the current k-mer.
        successor = find_successor(genome[-k:], adjacency_list)
        if not successor:
            break

        # Append only the last character of the successor to avoid duplication
        genome += successor[-1]

    return genome

# Function to parallelize the genome construction process
def parallel_construct_genome(kmer_frequencies, chunk_size=100):
    """Build strings from the k-mer table in parallel with Dask and print them.

    Every k-mer that has at least one successor symbol is mapped to a single
    chosen successor. The k-mers are then sorted lexicographically and split
    into consecutive chunks of ``chunk_size``; each chunk becomes one
    ``construct_genome_from_kmers`` task seeded by the chunk's first k-mer.
    Each task looks successors up in the full adjacency mapping, so a walk is
    not confined to the k-mers of its own chunk.

    Args:
        kmer_frequencies: dict mapping k-mer -> record with a ``'successors'``
            set, as produced by ``parse_dataset_and_get_kmers``.
        chunk_size: Number of k-mers handed to each task (default 100, the
            value previously hard-coded).

    Returns:
        None. The tuple of resulting strings is printed.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Use min() rather than next(iter(...)): set iteration order for strings
    # varies between interpreter runs, which made the chosen successor (and
    # therefore the output) non-reproducible.
    adjacency_list = {
        kmer: min(data['successors'])
        for kmer, data in kmer_frequencies.items()
        if len(data['successors']) > 0
    }

    kmers = sorted(adjacency_list.keys())

    tasks = [
        construct_genome_from_kmers(kmers[i:i + chunk_size], adjacency_list)
        for i in range(0, len(kmers), chunk_size)
    ]
    contig = compute(*tasks)
    print(contig)

In [9]:
# Input FASTQ file (relative path) and the k-mer length used by the pipeline.
fastq_file = "16S_WT_day3_11_SRR2628505_1.fastq"
k = 21

# Count every k-mer across all reads, recording the symbols seen before/after each one.
kmer_frequencies = parse_dataset_and_get_kmers(fastq_file, k)

# Drop dead-end k-mers; this mutates kmer_frequencies in place.
# NOTE(review): remove_tips compares len(kmer) against min_len, and k = 21 < 26,
# so every k-mer lacking a predecessor or a successor is removed here.
remove_tips(kmer_frequencies, min_len=26)

# Build strings from the remaining k-mers with Dask; the result is printed, not returned.
parallel_construct_genome(kmer_frequencies)

KeyboardInterrupt: 