In [1]:
import pandas as pd
import numpy
import random

# generate a random DNA sequence of length 10000bp
def generate_random_sequence(length):
    dna = ""
    for i in range(length):
        dna += random.choice("ACGT")
    return dna

# generate a random DNA sequence of length 10000bp
dna = generate_random_sequence(10000)

In [19]:
import sys
from typing import List, Dict, Iterable
from collections import defaultdict
import random

# ------------------------ Generate Reads ------------------------ #
def generate_reads(dna: str, read_len: int, coverage: float) -> List[str]:
    """Generates reads from a DNA string with specified read length and coverage."""
    genome_len = len(dna)
    num_reads = int(genome_len * coverage / read_len)
    reads = []
    for _ in range(num_reads):
        start_pos = random.randint(0, genome_len - read_len)
        reads.append(dna[start_pos:start_pos + read_len])
    return reads

# ------------------------ Break Reads into k-mers ------------------------ #
def break_reads_into_kmers(reads: List[str], k: int, g: int) -> List[str]:
    """Breaks a list of reads into k-mers with a gap of g."""
    kmers = []
    for read in reads:
        for i in range(0, len(read) - k + 1, g + 1):  # Step by g+1
            kmers.append(read[i:i+k])
    return kmers

# ------------------------ De Bruijn Graph from k-mers ------------------------ #
def de_bruijn_kmers(k_mers: List[str]) -> Dict[str, List[str]]:
    """Forms the de Bruijn graph of a collection of k-mers."""
    adj_list = defaultdict(list)
    for k_mer in k_mers:
        prefix = k_mer[:-1]
        suffix = k_mer[1:]
        adj_list[prefix].append(suffix)
    return dict(adj_list)

# ------------------------ Eulerian Path ------------------------ #
def extend_cycle(cycle: List[str], marked_graph: Dict[str, List[str]]) -> List[str]:
    """Extends the Eulerian cycle from a given node in the marked graph."""
    if cycle:
        cycle.pop()  # remove the repeated node at the end
        new_start_index = next(i for i, node in enumerate(cycle) if node in marked_graph)
        cycle = cycle[new_start_index:] + cycle[:new_start_index]
        cycle.append(cycle[0])  # re-add the repeated node
        current_node = cycle[-1]
    else:
        current_node = next(iter(marked_graph))  # get an arbitrary node from the graph
        cycle = [current_node]
    
    while current_node in marked_graph:
        old_node = current_node
        current_node = marked_graph[old_node].pop()
        if not marked_graph[old_node]:
            del marked_graph[old_node]  # remove the node if no more edges
        cycle.append(current_node)
    
    return cycle

def eulerian_cycle_str(g: Dict[str, List[str]]) -> List[str]:
    """Constructs an Eulerian cycle in a graph. Assumes the graph is Eulerian and connected."""
    cycle = []
    while g:
        cycle = extend_cycle(cycle, g)
    return cycle

def fix_unbalanced(g: Dict[str, List[str]]) -> tuple[str, str]:
    """Finds and fixes unbalanced nodes in the graph."""
    total_degree = defaultdict(int)
    
    for node1, adj_nodes in g.items():
        for node2 in adj_nodes:
            total_degree[node1] += 1  # Out-degree
            total_degree[node2] -= 1  # In-degree

    s, t = None, None
    for node, tot_degree in total_degree.items():
        if tot_degree == 1:
            t = node
        elif tot_degree== -1:
            s = node

    if s and t:
        g.setdefault(s, []).append(t)
    
    return s, t

def eulerian_path(g: Dict[str, List[str]]) -> List[str]:
    """Constructs an Eulerian path in a graph, assuming the graph is nearly Eulerian."""
    s, t = fix_unbalanced(g)
    cycle = eulerian_cycle_str(g)
    
    if s:
        cycle.pop()  # Remove the duplicate last node
        t_index = next(i for i, (u, v) in enumerate(zip(cycle, cycle[1:])) if u == s and v == t)
        cycle = cycle[t_index + 1:] + cycle[:t_index + 1]
    
    return cycle

# ------------------------ String Spelled by a Genome Path ------------------------ #
def genome_path(path: List[str]) -> str:
    """Forms the genome path formed by a collection of patterns."""
    if not path:
        return ""
    result = path[0]
    for i in range(1, len(path)):
        result += path[i][-1]
    return result

# ------------------------ Assemble Sequence ------------------------ #
def assemble_sequence(dna: str, read_len: int, coverage: float, k: int, g: int) -> str:
    """Assembles a DNA sequence from reads using a De Bruijn graph."""
    reads = generate_reads(dna, read_len, coverage)
    kmers = break_reads_into_kmers(reads, k, g)  # Pass g here
    graph = de_bruijn_kmers(kmers)
    path = eulerian_path(graph)
    assembled_sequence = genome_path(path)
    return assembled_sequence

# # ------------------------ Main Execution ------------------------ #
# if __name__ == "__main__":
#     dna_string = input("Enter DNA string: ")
#     read_length = int(input("Enter read length: "))
#     coverage_percentage = float(input("Enter coverage percentage (e.g., 0.5 for 50%): "))
#     kmer_length = int(input("Enter k-mer length: "))

#     assembled_dna = assemble_sequence(dna_string, read_length, coverage_percentage, kmer_length)
#     print("Assembled DNA sequence:", assembled_dna)

In [None]:
dna_string = generate_random_sequence(1000)
read_length = 20
coverage_percentage = 1.0
kmer_length = 10
gap = 1

assembled_dna = assemble_sequence(dna_string, read_length, coverage_percentage, kmer_length, gap)

------------

In [4]:
import numpy as np

def construct_profile_hmm(alignment, alphabet, theta):
    # Preprocess the alignment
    alignment = [list(seq) for seq in alignment]
    n_seqs = len(alignment)
    seq_length = len(alignment[0])

    # Count the number of non-gap characters in each column
    col_counts = [sum(1 for seq in alignment if seq[i] != '-') for i in range(seq_length)]

    # Determine which columns are matches (M) and which are inserts (I)
    match_columns = [i for i, count in enumerate(col_counts) if count / n_seqs > theta]

    # Initialize transition and emission matrices
    n_match_states = len(match_columns)
    n_states = 2 * n_match_states + 3  # S, I0, M1, D1, I1, ..., Mn, Dn, In, E
    transitions = np.zeros((n_states, n_states))
    emissions = np.zeros((n_states, len(alphabet)))

    # Helper function to get state index
    def state_index(state_type, state_num):
        if state_type == 'S':
            return 0
        elif state_type == 'I':
            return 2 * state_num + 1
        elif state_type == 'M':
            return 2 * state_num
        elif state_type == 'D':
            return 2 * state_num + 1
        elif state_type == 'E':
            return n_states - 1

    # Count transitions and emissions
    for seq in alignment:
        prev_state = 'S'
        prev_state_num = 0

        for i, char in enumerate(seq):
            if i in match_columns:
                state_num = match_columns.index(i) + 1
                if char == '-':
                    state_type = 'D'
                else:
                    state_type = 'M'
                    emissions[state_index('M', state_num)][alphabet.index(char)] += 1
            else:
                state_num = sum(1 for col in match_columns if col < i)
                if char != '-':
                    state_type = 'I'
                    emissions[state_index('I', state_num)][alphabet.index(char)] += 1
                else:
                    continue  # Skip gaps in insert columns

            transitions[state_index(prev_state, prev_state_num)][state_index(state_type, state_num)] += 1
            prev_state, prev_state_num = state_type, state_num

        # Transition to end state
        transitions[state_index(prev_state, prev_state_num)][state_index('E', 0)] += 1

    # Normalize transitions and emissions
    for i in range(n_states):
        row_sum = np.sum(transitions[i])
        if row_sum > 0:
            transitions[i] /= row_sum

    for i in range(n_states):
        row_sum = np.sum(emissions[i])
        if row_sum > 0:
            emissions[i] /= row_sum

    return transitions, emissionse add k + 1 insertion states, denoted Insertion(0), . . . , Insertion(k) (see figure below). Entering Insertion(i) allows the profile HMM to emit an additional symbol after visiting the i-th column of Profile(Alignment∗) and before entering the (i + 1)- th column. Thus, we will connect Match(i) to Insertion(i) and Insertion(i) to Match(i + 1). Furthermore, to allow for multiple inserted symbols between columns of Profile(Alignment∗), we will connect Insertion(i) to itself.



def format_matrix(matrix, row_labels, col_labels):
    output = "\t" + "\t".join(col_labels) + "\n"
    for i, row in enumerate(matrix):
        output += f"{row_labels[i]}\t" + "\t".join(f"{x:.3f}" if x > 0 else "0" for x in row) + "\n"
    return output

# Main function to process input and generate output
def main():
    # Hardcoded input
    input_data = """0.289
--------
A B C D E
--------
EBA
E-D
EB-
EED
EBD
EBE
E-D
E-D"""

    # Process input
    lines = input_data.strip().split('\n')
    theta = float(lines[0])
    alphabet = lines[2].split()
    alignment = lines[4:]

    # Construct the profile HMM
    transitions, emissions = construct_profile_hmm(alignment, alphabet, theta)

    # Prepare labels for output
    n_match_states = (transitions.shape[0] - 3) // 2
    state_labels = ['S', 'I0'] + sum([[f'M{i}', f'D{i}', f'I{i}'] for i in range(1, n_match_states)], []) + ['E']

    # Format and print the output
    print(format_matrix(transitions, state_labels, state_labels))
    print("--------")
    print(format_matrix(emissions, state_labels, alphabet))

if __name__ == "__main__":
    main()

	S	I0	M1	D1	I1	M2	D2	I2	E
S	0	0	1.000	0	0	0	0	0	0
I0	0	0	0	0	0	0	0	0	0
M1	0	0	0	0	0.625	0.375	0	0	0
D1	0	0	0	0	0	0	0	0	0
I1	0	0	0	0	0	0	0.800	0.200	0
M2	0	0	0	0	0	0	1.000	0	0
D2	0	0	0	0	0	0	0	0	1.000
I2	0	0	0	0	0	0	0	0	1.000
E	0	0	0	0	0	0	0	0	0

--------
	A	B	C	D	E
S	0	0	0	0	0
I0	0	0	0	0	0
M1	0	0	0	0	1.000
D1	0	0	0	0	0
I1	0	0.800	0	0	0.200
M2	0	0	0	0	0
D2	0.143	0	0	0.714	0.143
I2	0	0	0	0	0
E	0	0	0	0	0



In [2]:
def build_graph(edges):
    # Create adjacency list representation with weights
    graph = {}
    for edge in edges:
        source, dest_weight = edge.split('->')
        dest, weight = dest_weight.split(':')
        source, dest, weight = int(source), int(dest), int(weight)
        
        # Initialize dict entries if they don't exist
        if source not in graph:
            graph[source] = {}
        if dest not in graph:
            graph[dest] = {}
            
        # Add edges (undirected graph)
        graph[source][dest] = weight
        graph[dest][source] = weight
    
    return graph

def find_path_length(graph, start, end, visited=None):
    if visited is None:
        visited = set()
    
    # If we reached the destination
    if start == end:
        return 0
    
    visited.add(start)
    min_length = float('inf')
    
    # Try all possible next nodes
    for next_node in graph[start]:
        if next_node not in visited:
            length = find_path_length(graph, next_node, end, visited)
            if length != float('inf'):
                min_length = min(min_length, length + graph[start][next_node])
                
    visited.remove(start)
    return min_length

def compute_distances(n, edges):
    # Build the graph
    graph = build_graph(edges)
    
    # Initialize distance matrix
    distances = [[0] * n for _ in range(n)]
    
    # Compute distances between all pairs of leaves
    for i in range(n):
        for j in range(i + 1, n):  # Only compute upper triangle
            dist = find_path_length(graph, i, j)
            distances[i][j] = dist
            distances[j][i] = dist  # Matrix is symmetric
    
    return distances

def main():
    # Read input from file
    with open('data/dataset_39959_12.txt', 'r') as file:
        # Read number of leaves
        n = int(file.readline().strip())
        
        # Read edges
        edges = []
        for line in file:
            if line.strip():  # Skip empty lines
                edges.append(line.strip())
    
    # Compute distances
    distances = compute_distances(n, edges)
    
    # Print output
    for row in distances:
        print('\t'.join(map(str, row)))

if __name__ == "__main__":
    main()

0	129	125	43	51	223	86	36	45	241	197	31	260	282	148	229	75	286	277	162	192	274	69	216	98	156	254	118	185	211	138	34
129	0	32	148	88	114	63	117	112	132	88	136	151	173	39	120	72	177	168	53	83	165	88	107	47	47	145	39	76	102	29	139
125	32	0	144	84	126	59	113	108	144	100	132	163	185	51	132	68	189	180	65	95	177	84	119	43	59	157	35	88	114	41	135
43	148	144	0	70	242	105	55	64	260	216	30	279	301	167	248	94	305	296	181	211	293	88	235	117	175	273	137	204	230	157	19
51	88	84	70	0	182	45	39	34	200	156	58	219	241	107	188	34	245	236	121	151	233	28	175	57	115	213	77	144	170	97	61
223	114	126	242	182	0	157	211	206	32	44	230	51	73	101	20	166	77	68	71	55	65	182	21	141	83	45	133	66	42	113	233
86	63	59	105	45	157	0	74	69	175	131	93	194	216	82	163	29	220	211	96	126	208	45	150	32	90	188	52	119	145	72	96
36	117	113	55	39	211	74	0	33	229	185	43	248	270	136	217	63	274	265	150	180	262	57	204	86	144	242	106	173	199	126	46
45	112	108	64	34	206	69	33	0	224	180	52	243	265	131	212	58	269	260	145	175	257	52	199	81	139

In [3]:
def limb_length(n, j, D):
    # Initialize minimum length as infinity
    min_length = float('inf')
    
    # Try all pairs of leaves i and k where i ≠ j and k ≠ j
    for i in range(n):
        if i == j:
            continue
        for k in range(i + 1, n):  # k > i to avoid counting pairs twice
            if k == j:
                continue
            
            # Calculate (Di,j + Dj,k - Di,k)/2
            length = (D[i][j] + D[j][k] - D[i][k]) / 2
            min_length = min(min_length, length)
    
    return int(min_length)  # Convert to integer as per problem requirements

def main():
    # Read input from file
    with open('data/dataset_39960_8.txt', 'r') as file:
        # Read n
        n = int(file.readline().strip())
        
        # Read j
        j = int(file.readline().strip())
        
        # Read distance matrix
        D = []
        for _ in range(n):
            row = list(map(int, file.readline().strip().split()))
            D.append(row)
    
    # Calculate limb length
    result = limb_length(n, j, D)
    
    # Print result
    print(result)

if __name__ == "__main__":
    main()

521


In [6]:
def parse_input(filename):
    with open(filename, 'r') as file:
        n = int(file.readline().strip())
        D = []
        for _ in range(n):
            row = list(map(float, file.readline().strip().split()))
            D.append(row)
    return D, n

def find_closest_clusters(D):
    n = len(D)
    min_dist = float('inf')
    min_i = min_j = -1
    
    for i in range(n):
        for j in range(i + 1, n):
            if D[i][j] < min_dist:
                min_dist = D[i][j]
                min_i, min_j = i, j
    
    return min_i, min_j, min_dist

def compute_new_distances(D, ci_idx, cj_idx, cluster_sizes):
    n = len(D)
    new_distances = []
    ci_size = cluster_sizes[ci_idx]
    cj_size = cluster_sizes[cj_idx]
    
    for k in range(n):
        if k != ci_idx and k != cj_idx:
            # Get distances using correct indices
            ci_dist = D[min(ci_idx, k)][max(ci_idx, k)]
            cj_dist = D[min(cj_idx, k)][max(cj_idx, k)]
            # Weighted average formula
            new_dist = (ci_dist * ci_size + cj_dist * cj_size) / (ci_size + cj_size)
            new_distances.append(new_dist)
    
    return new_distances

def update_distance_matrix(D, ci_idx, cj_idx, new_distances):
    n = len(D)
    # Create new matrix excluding merged clusters
    new_D = []
    for i in range(n):
        if i != ci_idx and i != cj_idx:
            row = []
            for j in range(n):
                if j != ci_idx and j != cj_idx:
                    row.append(D[i][j])
            new_D.append(row)
    
    # Add new distances
    for i in range(len(new_D)):
        new_D[i].append(new_distances[i])
    new_D.append(new_distances + [0.0])
    
    return new_D

def upgma(D, n):
    # Initialize
    active_nodes = list(range(n))
    cluster_sizes = [1] * n
    ages = {i: 0.0 for i in range(n)}
    edges = {}
    next_node = n
    
    while len(D) > 1:
        # Find closest clusters
        ci_idx, cj_idx, dist = find_closest_clusters(D)
        ci = active_nodes[ci_idx]
        cj = active_nodes[cj_idx]
        
        # Create new node
        new_node = next_node
        next_node += 1
        
        # Set age of new node
        ages[new_node] = dist / 2
        
        # Add edges
        edges[(new_node, ci)] = ages[new_node] - ages[ci]
        edges[(new_node, cj)] = ages[new_node] - ages[cj]
        edges[(ci, new_node)] = edges[(new_node, ci)]
        edges[(cj, new_node)] = edges[(new_node, cj)]
        
        # Calculate new distances before updating cluster sizes
        new_distances = compute_new_distances(D, ci_idx, cj_idx, cluster_sizes)
        
        # Update cluster sizes
        new_size = cluster_sizes[ci_idx] + cluster_sizes[cj_idx]
        cluster_sizes = [size for idx, size in enumerate(cluster_sizes) 
                        if idx != ci_idx and idx != cj_idx]
        cluster_sizes.append(new_size)
        
        # Update distance matrix
        D = update_distance_matrix(D, ci_idx, cj_idx, new_distances)
        
        # Update active nodes
        active_nodes = [node for idx, node in enumerate(active_nodes) 
                       if idx != ci_idx and idx != cj_idx]
        active_nodes.append(new_node)
    
    # Print edges with 2 decimal places
    for (u, v), length in sorted(edges.items()):
        print(f"{u}->{v}:{length:.2f}")

def main():
    filename = 'data/dataset_39963_8 (4).txt'
    D, n = parse_input(filename)
    upgma(D, n)

if __name__ == "__main__":
    main()

0->22:222.50
1->28:239.50
2->24:227.00
3->25:231.00
4->25:231.00
5->26:231.50
6->23:223.50
7->30:257.50
8->27:239.00
9->30:257.50
10->22:222.50
11->24:227.00
12->21:220.50
13->33:286.25
14->23:223.50
15->28:239.50
16->26:231.50
17->27:239.00
18->32:284.75
19->29:255.25
20->21:220.50
21->12:220.50
21->20:220.50
21->31:47.12
22->0:222.50
22->10:222.50
22->34:69.62
23->6:223.50
23->14:223.50
23->29:31.75
24->2:227.00
24->11:227.00
24->34:65.12
25->3:231.00
25->4:231.00
25->40:117.68
26->5:231.50
26->16:231.50
26->35:77.94
27->8:239.00
27->17:239.00
27->33:47.25
28->1:239.50
28->15:239.50
28->31:28.12
29->19:255.25
29->23:31.75
29->37:74.01
30->7:257.50
30->9:257.50
30->32:27.25
31->21:47.12
31->28:28.12
31->36:45.38
32->18:284.75
32->30:27.25
32->36:28.25
33->13:286.25
33->27:47.25
33->39:56.55
34->22:69.62
34->24:65.12
34->35:17.31
35->26:77.94
35->34:17.31
35->38:27.07
36->31:45.38
36->32:28.25
36->37:16.26
37->29:74.01
37->36:16.26
37->38:7.25
38->35:27.07
38->37:7.25
38->39:6.29
39->3

In [None]:
def parse_input(filename):
    with open(filename, 'r') as file:
        n = int(file.readline().strip())
        D = []
        for _ in range(n):
            row = list(map(float, file.readline().strip().split()))
            D.append(row)
    return D, n

def total_distance(D, i):
    """Calculate total distance from node i to all other nodes"""
    return sum(D[i][j] for j in range(len(D)))

def create_neighbor_joining_matrix(D):
    """Create neighbor-joining matrix D* from distance matrix D"""
    n = len(D)
    D_star = [[0.0] * n for _ in range(n)]
    
    for i in range(n):
        for j in range(n):
            if i != j:
                total_dist_i = total_distance(D, i)
                total_dist_j = total_distance(D, j)
                D_star[i][j] = (n - 2) * D[i][j] - total_dist_i - total_dist_j
    
    return D_star

def find_minimum_element(D_star):
    """Find minimum non-diagonal element in D* matrix"""
    n = len(D_star)
    min_val = float('inf')
    min_i = min_j = -1
    
    for i in range(n):
        for j in range(i + 1, n):
            if D_star[i][j] < min_val:
                min_val = D_star[i][j]
                min_i, min_j = i, j
    
    return min_i, min_j

def update_distance_matrix(D, i, j):
    """Create new distance matrix with merged node m"""
    n = len(D)
    m_distances = []
    
    # Calculate distances to new node m
    for k in range(n):
        if k != i and k != j:
            dist_k_m = (D[k][i] + D[k][j] - D[i][j]) / 2
            m_distances.append(dist_k_m)
    
    # Create new matrix without i and j, but with new node m
    new_D = []
    for row in range(n):
        if row != i and row != j:
            new_row = []
            for col in range(n):
                if col != i and col != j:
                    new_row.append(D[row][col])
            new_D.append(new_row)
    
    # Add row and column for m
    for row in range(len(new_D)):
        new_D[row].append(m_distances[row])
    new_D.append(m_distances + [0.0])
    
    return new_D

def neighbor_joining(D, current_nodes, next_node):
    n = len(D)
    
    # Base case: only two nodes left
    if n == 2:
        edges = {
            (current_nodes[0], next_node): D[0][1]/2,
            (current_nodes[1], next_node): D[0][1]/2,
            (next_node, current_nodes[0]): D[0][1]/2,
            (next_node, current_nodes[1]): D[0][1]/2
        }
        return edges, next_node + 1
    
    # Create neighbor-joining matrix
    D_star = create_neighbor_joining_matrix(D)
    
    # Find minimum element
    i, j = find_minimum_element(D_star)
    
    # Calculate delta
    total_dist_i = total_distance(D, i)
    total_dist_j = total_distance(D, j)
    delta = (total_dist_i - total_dist_j) / (n - 2)
    
    # Calculate limb lengths
    limb_length_i = (D[i][j] + delta) / 2
    limb_length_j = (D[i][j] - delta) / 2
    
    # Update distance matrix
    new_D = update_distance_matrix(D, i, j)
    
    # Save current nodes i and j
    node_i = current_nodes[i]
    node_j = current_nodes[j]
    
    # Update current_nodes list
    new_current_nodes = [node for k, node in enumerate(current_nodes) if k != i and k != j]
    new_current_nodes.append(next_node)
    
    # Recursive call
    edges, next_next_node = neighbor_joining(new_D, new_current_nodes, next_node + 1)
    
    # Add new edges
    edges[(next_node, node_i)] = limb_length_i
    edges[(next_node, node_j)] = limb_length_j
    edges[(node_i, next_node)] = limb_length_i
    edges[(node_j, next_node)] = limb_length_j
    
    return edges, next_next_node

def main():
    filename = 'data/dataset_39964_6.txt'
    D, n = parse_input(filename)
    
    # Initialize with leaf nodes 0 to n-1
    current_nodes = list(range(n))
    next_node = n
    
    # Run neighbor joining algorithm
    edges, _ = neighbor_joining(D, current_nodes, next_node)
    
    # Print edges in required format
    for (u, v), length in sorted(edges.items()):
        print(f"{u}->{v}:{length:.2f}")

if __name__ == "__main__":
    main()