In [24]:
from Bio import SeqIO
fastq_file = "16S_WT_day3_11_SRR2628505_1.fastq"
with open(fastq_file, "r") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fastq")):
        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq}")
        print(f"Quality: {record.letter_annotations['phred_quality']}")
        if i >= 10:
            break

ID: SRR2628505.45838
Sequence: CCTACGGGGGGCAGCAGTGAGGAATATTGGTCAATGGACGAGAGTCTGAACCAGCCAAGTAGCGTGAAGGATGACTGCCCTATGGGTTGTAAACTTCTTTTATATGGGAATAAAACAGGGTATGCATACCCTCTTGTATGTACCATATGAATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGA
Quality: [37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37

In [25]:
def extract_kmers(sequence, k):
    kmers = {}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        predecessor = sequence[i - 1] if i > 0 else None
        successor = sequence[i + k] if i + k < len(sequence) else None
        
        if kmer not in kmers:
            kmers[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
        
        kmers[kmer]['count'] += 1
        
        if predecessor:
            kmers[kmer]['predecessors'].add(predecessor)
        
        if successor:
            kmers[kmer]['successors'].add(successor)
    
    return kmers

In [26]:
def parse_dataset_and_get_kmers(filepath, k):
    kmer_frequencies = {}
    with open(filepath, 'r') as file:
        while True:
            header = file.readline().strip()
            if not header:
                break
            sequence = file.readline().strip()
            plus = file.readline().strip()
            quality = file.readline().strip()
            
            kmers = extract_kmers(sequence, k)
            for kmer, data in kmers.items():
                if kmer not in kmer_frequencies:
                    kmer_frequencies[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
                
                kmer_frequencies[kmer]['count'] += data['count']
                kmer_frequencies[kmer]['predecessors'].update(data['predecessors'])
                kmer_frequencies[kmer]['successors'].update(data['successors'])
    
    return kmer_frequencies

k = 10
kmer_frequencies = parse_dataset_and_get_kmers(fastq_file, k)

for kmer, data in kmer_frequencies.items():
    predecessors = ', '.join(data['predecessors']) if data['predecessors'] else 'None'
    successors = ', '.join(data['successors']) if data['successors'] else 'None'
    print(f"{kmer}: {data['count']} (Predecessors: {predecessors}, Successors: {successors})")

CCTACGGGGG: 3337 (Predecessors: C, A, Successors: G, T, C)
CTACGGGGGG: 3331 (Predecessors: A, G, T, C, Successors: G, T, C)
TACGGGGGGC: 3324 (Predecessors: A, C, Successors: G, T, A)
ACGGGGGGCA: 2628 (Predecessors: T, Successors: G, T, A)
CGGGGGGCAG: 2627 (Predecessors: G, A, Successors: T, C)
GGGGGGCAGC: 2629 (Predecessors: G, T, C, Successors: G, T, A)
GGGGGCAGCA: 2637 (Predecessors: A, G, T, C, Successors: G, T, C)
GGGGCAGCAG: 2633 (Predecessors: G, C, Successors: G, T, C)
GGGCAGCAGT: 2643 (Predecessors: G, T, A, Successors: G, T, A)
GGCAGCAGTG: 7893 (Predecessors: A, G, T, C, Successors: A, G, T, C)
GCAGCAGTGA: 5640 (Predecessors: A, G, T, C, Successors: G, T, A)
CAGCAGTGAG: 5644 (Predecessors: G, T, A, Successors: G, T, A)
AGCAGTGAGG: 5645 (Predecessors: T, C, Successors: G, C, A)
GCAGTGAGGA: 6998 (Predecessors: A, G, T, C, Successors: G, A)
CAGTGAGGAA: 7011 (Predecessors: G, T, A, Successors: G, T)
AGTGAGGAAT: 7017 (Predecessors: G, T, C, Successors: G, T, A)
GTGAGGAATA: 7012 (Pr

In [27]:
def get_kmers_with_single_predecessor_successor(kmer_frequencies):
    single_predecessor_successor_kmers = {}
    
    for kmer, data in kmer_frequencies.items():
        if len(data['predecessors']) == 1 and len(data['successors']) == 1:
            single_predecessor_successor_kmers[kmer] = data
    
    return single_predecessor_successor_kmers

single_predecessor_successor_kmers = get_kmers_with_single_predecessor_successor(kmer_frequencies)

for kmer, data in single_predecessor_successor_kmers.items():
    predecessors = ', '.join(data['predecessors'])
    successors = ', '.join(data['successors'])
    print(f"{kmer}: {data['count']} (Predecessors: {predecessors}, Successors: {successors})")

TAGCGTGAAG: 4404 (Predecessors: G, Successors: G)
ACTGCCCTAT: 4390 (Predecessors: G, Successors: G)
GTGGGGAATT: 1040 (Predecessors: A, Successors: T)
TGGGGAATTT: 1040 (Predecessors: G, Successors: T)
GGGGAATTTT: 1040 (Predecessors: T, Successors: G)
GAATTTTGGA: 1047 (Predecessors: G, Successors: C)
TTTGGACAAT: 1045 (Predecessors: T, Successors: G)
GACAATGGGC: 1076 (Predecessors: G, Successors: G)
AGCCTGATCC: 1080 (Predecessors: A, Successors: A)
GCCTGATCCA: 1080 (Predecessors: A, Successors: G)
TGATCCAGCT: 1051 (Predecessors: C, Successors: A)
TCCAGCTATT: 1051 (Predecessors: A, Successors: C)
CAGCTATTCC: 1053 (Predecessors: C, Successors: G)
TATTCCGCGT: 1052 (Predecessors: C, Successors: G)
AAGGCCCTCG: 1063 (Predecessors: G, Successors: G)
TAAACCACTT: 899 (Predecessors: G, Successors: T)
TTTGTAGAGA: 1046 (Predecessors: T, Successors: A)
AGAACGAAAA: 1053 (Predecessors: G, Successors: G)
AACGAAAAGA: 1057 (Predecessors: G, Successors: C)
AAAAGGGTGT: 772 (Predecessors: A, Successors: T)
AA

In [42]:
def parallel_construct_genome(kmer_frequencies):
    """Parallelize the genome reconstruction process."""
    adjacency_list = {}
    for kmer, data in kmer_frequencies.items():
        if len(data['successors']) == 1:
            adjacency_list[kmer] = next(iter(data['successors']))  # No position stored

    # Sort k-mers
    kmers = sorted(adjacency_list.keys())

    tasks = [construct_genome_from_kmers(kmers[i:i + 100], adjacency_list) for i in range(0, len(kmers), 100)]
    contig = compute(*tasks)
    print(contig)

# Construct the genome with adjacency list but no position verification
def find_successor(kmer, adjacency_list):
    """Find the successor of a kmer using the adjacency list."""
    return adjacency_list.get(kmer, None)

@delayed
def construct_genome_from_kmers(kmers, adjacency_list):
    """Construct the genome string from kmers using the adjacency list."""
    genome = kmers[0]  # Start with the first k-mer

    for _ in range(len(kmers) - 1):
        last_kmer = genome[-len(kmers[0]):]  # Get the last k-mer from the current genome string
        successor = find_successor(last_kmer, adjacency_list)
        if not successor:
            break

        # Append only the last character of the successor to avoid duplication
        genome += successor[-1]

    return genome

parallel_construct_genome(single_predecessor_successor_kmers)

('AAAAAAAAACA', 'AAAACGTTAATCG', 'AAACATCTTTT', 'AAAGAATTTCGGTTATCGATGGGGATGCGT', 'AAAGGCGTCCACG', 'AAATCAGACCTCAGAATGACTGG', 'AACAGGCCACGC', 'AACCGGTAACATTGGGTCTGCAG', 'AACGGGGGGGG', 'AACTTAGTACAAGCCAACAGGCGATTAAAAACGTAGG', 'AAGACCTTCGGGTT', 'AAGCATTGGCTAAC', 'AAGCGTTGTTCGGAATGAC', 'AAGGCCATCGGGTTGTA', 'AAGGGGGGCTGAGA', 'AAGTAGGGTGCG', 'AATAAAGATCGGCT', 'AATAGAACGCAG', 'AATCTTCCACAATGGGC', 'AATGGTCGCAAGCCTG', 'ACAAGAGCTTCCACGAGTG', 'ACAGCAGGCCTAACACCACGCCAGCAGC', 'ACCAAACGCTATCCGG', 'ACCCGCAGCAGAAGC', 'ACCGGATCGATTGGGTTT', 'ACCTGCCTGCCACAGCC', 'ACGAGGGCAAATGCAGGT', 'ACGCTAATACCT', 'ACGGGACCTGCAGAA', 'ACGGTGATGCAAGGCAGA', 'ACGTTAACCGCAGAAGA', 'ACTCGCAGAAGAA', 'ACTGGGTTTAAA', 'AGAAAGCATAG', 'AGAATAAAGTG', 'AGAGAGGAGAG', 'AGAGTGTAACTGTAAAAAAAACA', 'AGCACCGGATAACTC', 'AGCCAAGGCGCGTGAGGG', 'AGCCTACCATCTTG', 'AGCGGAAAGTTCATA', 'AGCGTTATGCG', 'AGGAAGGCGCAGAGCGTCGCAGACCTCTTTTG', 'AGGATCCACTCGGTGTCCGG', 'AGGCCCTACGG', 'AGGGAACGCATACCC', 'AGGGCGGGACGTG', 'AGGGTACAAGCGT', 'AGGTCCTCGGGTTG', 'AGTA

In [32]:
from collections import defaultdict
from dask import delayed, compute

# Function to generate k-mers from a sequence
def generate_kmers(sequence, k):
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i+k]

# Function to build the de Bruijn graph
def build_de_bruijn_graph(kmers):
    graph = defaultdict(list)
    for kmer in kmers:
        prefix = kmer[:-1]
        suffix = kmer[1:]
        graph[prefix].append(suffix)
    return graph

# Function to remove tips (short dead-end branches)
def remove_tips(graph, min_len=15):
    for node in list(graph.keys()):
        if len(graph[node]) == 1:
            succ = graph[node][0]
            # Check for dead-end branches
            if len(graph[succ]) == 0 and len(succ) < min_len:
                del graph[node]  # Remove tip

# Function to traverse the graph and assemble contigs
def assemble_contigs(graph):
    contigs = []
    for node in list(graph.keys()):
        if len(graph[node]) == 1:
            contig = node
            next_node = graph[node][0]
            while next_node in graph and len(graph[next_node]) == 1:
                contig += next_node[-1]
                next_node = graph[next_node][0]
            contigs.append(contig)
    return contigs

# Function to remove contigs shorter than or equal to a specified length in parallel
@delayed
def filter_short_contigs(contig, min_length):
    """
    Filter out contigs shorter than or equal to min_length.
    """
    if len(contig) > min_length:
        return contig
    return None  # Discard short contigs

# Function to process sequences and assemble contigs
def assemble_from_sequences(sequences, k, min_contig_length=10):
    all_kmers = []
    for seq in sequences:
        all_kmers.extend(generate_kmers(seq, k))
    
    # Build de Bruijn graph
    graph = build_de_bruijn_graph(all_kmers)
    
    # Remove tips
    remove_tips(graph)
    
    # Assemble contigs
    contigs = assemble_contigs(graph)
    
    # Parallel filtering of short contigs
    tasks = [filter_short_contigs(contig, min_contig_length) for contig in contigs]
    
    # Compute the tasks and filter out None values (i.e., discarded contigs)
    valid_contigs = compute(*tasks)
    valid_contigs = [contig for contig in valid_contigs if contig]  # Remove None values
    
    return valid_contigs


# Define k-mer length and minimum contig length
k = 10
min_contig_length = 20

# Assemble contigs and remove short contigs in parallel
contigs = assemble_from_sequences(kmer_frequencies, k, min_contig_length)
print("Assembled Contigs:", contigs)


Assembled Contigs: ['GACTGCGTATATCCGAAGAAAA', 'ACTGCGTATATCCGAAGAAAA', 'GGGTCCACCCTGTGGCTTAGC', 'GAGCAACGGGGTCCTTGCGAG', 'AACGTAAACAAACAGAAAGAC', 'GAAAGACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'AAAGACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'AAGACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'AGACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'GACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'ACCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'CCAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'CAAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'AAACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'AACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'ACTTAGTACAAGCCAACAGGCGATTAAAAAC', 'CTTAGTACAAGCCAACAGGCGATTAAAAAC', 'TTAGTACAAGCCAACAGGCGATTAAAAAC', 'TAGTACAAGCCAACAGGCGATTAAAAAC', 'AGTACAAGCCAACAGGCGATTAAAAAC', 'GTACAAGCCAACAGGCGATTAAAAAC', 'TACAAGCCAACAGGCGATTAAAAAC', 'ACAAGCCAACAGGCGATTAAAAAC', 'CAAGCCAACAGGCGATTAAAAAC', 'AAGCCAACAGGCGATTAAAAAC', 'AGCCAACAGGCGATTAAAAAC', 'GGAAGCGAAATAATTATTTAATTCTTATTGTTCTAGAGAACTCG', 'GAAGCGAAATAATTATTTAATTCTTATTGTTCTAGAGAACTCG', 'AAGCGAAATAATTATTTAATTCT