In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.84-cp310-cp310-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
    --------------------------------------- 0.0/2.8 MB 495.5 kB/s eta 0:00:06
   - -------------------------------------- 0.1/2.8 MB 657.6 kB/s eta 0:00:05
   - -------------------------------------- 0.1/2.8 MB 656.4 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/2.8 MB 655.8 kB/s eta 0:00:05
   --- ------------------------------------ 0.2/2.8 MB 942.1 kB/s eta 0:00:03
   ----- ---------------------------------- 0.4/2.8 MB 1.3 MB/s eta 0:00:02
   -------- ------------------------------- 0.6/2.8 MB 1.7 MB/s eta 0:00:02
   ---------- ----------------------------- 0.7/2.8 MB 1.9 MB/s eta 0:00:02
   ------------- -------------------------- 0.9/2.8 MB 2.1 MB/s eta 0:00:01
   ----------


[notice] A new release of pip is available: 23.3.1 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from Bio import SeqIO
fastq_file = "16S_WT_day3_11_SRR2628505_1.fastq"
with open(fastq_file, "r") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fastq")):
        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq}")
        print(f"Quality: {record.letter_annotations['phred_quality']}")
        if i >= 10:
            break

ID: SRR2628505.45838
Sequence: CCTACGGGGGGCAGCAGTGAGGAATATTGGTCAATGGACGAGAGTCTGAACCAGCCAAGTAGCGTGAAGGATGACTGCCCTATGGGTTGTAAACTTCTTTTATATGGGAATAAAACAGGGTATGCATACCCTCTTGTATGTACCATATGAATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGA
Quality: [37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37

In [2]:
def extract_kmers(sequence, k):
    kmers = {}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        predecessor = sequence[i - 1] if i > 0 else None
        successor = sequence[i + k] if i + k < len(sequence) else None
        
        if kmer not in kmers:
            kmers[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
        
        kmers[kmer]['count'] += 1
        
        if predecessor:
            kmers[kmer]['predecessors'].add(predecessor)
        
        if successor:
            kmers[kmer]['successors'].add(successor)
    
    return kmers

In [3]:
def parse_dataset_and_get_kmers(filepath, k):
    kmer_frequencies = {}
    with open(filepath, 'r') as file:
        while True:
            header = file.readline().strip()
            if not header:
                break
            sequence = file.readline().strip()
            plus = file.readline().strip()
            quality = file.readline().strip()
            
            kmers = extract_kmers(sequence, k)
            for kmer, data in kmers.items():
                if kmer not in kmer_frequencies:
                    kmer_frequencies[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
                
                kmer_frequencies[kmer]['count'] += data['count']
                kmer_frequencies[kmer]['predecessors'].update(data['predecessors'])
                kmer_frequencies[kmer]['successors'].update(data['successors'])
    
    return kmer_frequencies

k = 21
kmer_frequencies = parse_dataset_and_get_kmers(fastq_file, k)

for kmer, data in kmer_frequencies.items():
    predecessors = ', '.join(data['predecessors']) if data['predecessors'] else 'None'
    successors = ', '.join(data['successors']) if data['successors'] else 'None'
    print(f"{kmer}: {data['count']} (Predecessors: {predecessors}, Successors: {successors})")

CCTACGGGGGGCAGCAGTGAG: 1841 (Predecessors: A, Successors: T, G)
CTACGGGGGGCAGCAGTGAGG: 1843 (Predecessors: A, C, Successors: A, C, G)
TACGGGGGGCAGCAGTGAGGA: 1842 (Predecessors: A, C, Successors: A)
ACGGGGGGCAGCAGTGAGGAA: 1842 (Predecessors: T, Successors: T, G)
CGGGGGGCAGCAGTGAGGAAT: 1843 (Predecessors: A, G, Successors: A, T)
GGGGGGCAGCAGTGAGGAATA: 1843 (Predecessors: T, C, G, Successors: T, G)
GGGGGCAGCAGTGAGGAATAT: 1850 (Predecessors: A, T, C, G, Successors: T, G)
GGGGCAGCAGTGAGGAATATT: 1850 (Predecessors: C, G, Successors: T, G)
GGGCAGCAGTGAGGAATATTG: 1858 (Predecessors: A, T, G, Successors: C, G)
GGCAGCAGTGAGGAATATTGG: 5585 (Predecessors: A, T, C, G, Successors: A, T, C, G)
GCAGCAGTGAGGAATATTGGT: 5569 (Predecessors: A, T, C, G, Successors: C, G)
CAGCAGTGAGGAATATTGGTC: 5576 (Predecessors: A, T, G, Successors: A, T, G)
AGCAGTGAGGAATATTGGTCA: 5578 (Predecessors: T, C, Successors: A, G)
GCAGTGAGGAATATTGGTCAA: 6924 (Predecessors: A, T, C, G, Successors: T, C, G)
CAGTGAGGAATATTGGTCAAT: 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import dask
import dask.bag as db
from dask import delayed
from collections import defaultdict

# Function to construct contigs from a single k-mer node
def construct_contig(kmer, kmer_data):
    contig = kmer
    current_kmer = kmer
    visited = set([kmer])
    
    # Traverse successors
    while True:
        try:
            # Get successors for the current k-mer, excluding visited nodes
            successors = kmer_data[current_kmer]['successors'] - visited
            if not successors:
                break  # No unvisited successors, stop traversal
            
            # Pick a successor (you can modify this to choose the best one if needed)
            next_kmer = next(iter(successors))
            visited.add(next_kmer)
            contig += next_kmer[-1]  # Append the last character of the successor
            current_kmer = next_kmer
        except KeyError:
            # If a k-mer doesn't have a successor, end the contig
            break

    return contig

# Function to process each chunk of k-mers and generate contigs
def process_chunk(kmer_chunk, kmer_data):
    contigs = []
    for kmer in kmer_chunk:
        contig = construct_contig(kmer, kmer_data)
        contigs.append(contig)
    return contigs

# Function to generate contigs from k-mers using Dask
def generate_contigs_dask(kmer_data, num_partitions=4):
    # Divide the k-mers into partitions
    kmer_keys = list(kmer_data.keys())
    dask_bag = db.from_sequence(kmer_keys, npartitions=num_partitions)
    
    # Create delayed tasks for processing each chunk
    contigs_bag = dask_bag.map(lambda kmer_chunk: process_chunk(kmer_chunk, kmer_data))
    
    # Compute contigs in parallel
    contigs = contigs_bag.compute()
    
    # Flatten the result
    return [contig for sublist in contigs for contig in sublist]

# Example usage:
# Suppose `kmer_frequencies` is your dictionary from the previous steps
# and contains k-mers with their predecessors and successors.
contigs = generate_contigs_dask(kmer_frequencies, num_partitions=4)

# Print contigs
for contig in contigs:
    print(contig)

C
C
T
A
C
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
C
T
A
C
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
T
A
C
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
C
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
C
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
G
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
G
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
G
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
G
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
G
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
C
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
A
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
T
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
G
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
A
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
G
G
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
G
A
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
G
A
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
G
A
G
A
A
T
A
T
T
G
G
T
C
A
A
T
G
G
A
C
G


In [8]:
!pip install dask




[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import dask
from dask import delayed, compute

def build_adjacency_list(kmer_frequencies):
    adjacency_list = {}
    for kmer, data in kmer_frequencies.items():
        if len(data['successors']) == 1:
            adjacency_list[kmer] = next(iter(data['successors']))
    return adjacency_list

@delayed
def traverse_kmers(adjacency_list, starting_kmer):
    """Traverse from a starting k-mer to form a segment of the genome."""
    genome_segment = starting_kmer
    current_kmer = starting_kmer
    
    while current_kmer in adjacency_list:
        next_kmer = adjacency_list[current_kmer]
        genome_segment += next_kmer[-1]  # Append only the last character
        current_kmer = next_kmer
    
    return genome_segment

def parallel_construct_genome(kmer_frequencies):
    # Step 1: Build the adjacency list
    adjacency_list = build_adjacency_list(kmer_frequencies)
    
    # Step 2: Parallelize the traversal from each k-mer
    tasks = []
    for starting_kmer in adjacency_list.keys():
        tasks.append(traverse_kmers(adjacency_list, starting_kmer))
    
    # Step 3: Execute tasks and combine results
    genome_segments = compute(*tasks)
    
    # Step 4: Combine genome segments into a single string
    genome_string = ''.join(genome_segments)
    
    return genome_string

# Usage example:
# Assuming kmer_frequencies is defined and populated correctly
single_kmers = get_kmers_with_single_predecessor_successor(kmer_frequencies)
genome_string = parallel_construct_genome(single_kmers)
print(f"Reconstructed Genome: {genome_string}")