In [2]:
import os

if 'HOME' not in os.environ and 'USERPROFILE' in os.environ:
    os.environ['HOME'] = os.environ['USERPROFILE']

In [1]:
from ete3 import Tree
import random

In [2]:
def build_tree(file_path, index):
    with open(file_path, 'r') as file:
        parent_data = file.readlines()[index].strip().split()

    node_dict = {}

    for idx, parent_id in enumerate(parent_data, start=1):
        if idx not in node_dict:
            node_dict[idx] = Tree(name=str(idx))
        if int(parent_id) not in node_dict:
            node_dict[int(parent_id)] = Tree(name=str(parent_id))

        node_dict[int(parent_id)].add_child(node_dict[idx])
    
    return node_dict

In [3]:
def get_final_mutations_L(file_path, index):
    with open(file_path, 'r') as file:
        final_distribution = file.readlines()[index].strip().split()
    
    return [str(num) for num in final_distribution]

def get_final_mutations(file_path, index):
    with open(file_path, 'r') as file:
        final_distribution = file.readlines()[index].strip().split()
    
    return set([str(num) for num in final_distribution])

def sample_final_mutations(final_distribution, k):
    return set(random.sample(final_distribution, k))

In [4]:
def find_all_nodes(tree, nodes):
    # nodes = [str(num) for num in nodes]
    necessary_nodes = set(nodes)
    for node_id in nodes:
        current_node = tree.search_nodes(name=str(node_id))[0]
        while current_node.up: 
            necessary_nodes.add(current_node.up.name)
            current_node = current_node.up
    return necessary_nodes

In [5]:
import os
def count_non_empty_lines(file_path):
    if not os.path.exists(file_path):
        return 0

    with open(file_path, 'r') as f:
        return sum(1 for line in f if line.strip())

In [6]:
def get_coalescence_time(tree, samples):
    samples = set(samples)
    if len(samples) <= 1: return 0
    ancestor = tree.get_common_ancestor(samples)
    max_distance = max(ancestor.get_distance(tree & sample) for sample in samples)
    return max_distance

In [9]:
tree_path = f"graphs/result_11.1/geo_1_edges_mu0.001_s0.1_tree.txt"
final_path = f"graphs/result_11.1/geo_1_edges_mu0.001_s0.1_list.txt"

tree_dict = build_tree(tree_path, 0)
calculate_tree_metrics(tree_dict[0])

NameError: name 'calculate_tree_metrics' is not defined

In [20]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_coalescence(output_file, checkpoint_interval=50):
    ks = ["geo"]
    mus = [0.001]
    # mus = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001, 0.000001]
    ss = [0.1]#, 0.01, 0, -0.01]
    repetitions = 21 # Number of trees with same parameters
    num_samples = 100  # Number of samples per file
    iterations = 200 # Number of coalescence calculations per sample

    # Load checkpoint if it exists
    temp_file = output_file + ".tmp"
    if os.path.exists(temp_file):
        checkpoint_df = pd.read_pickle(temp_file)
        data = checkpoint_df.to_dict(orient='list')
        completed_combinations = set(zip(data['k'], data['mu'], data['s'], data['repetition']))
        print("Resuming from checkpoint.")
    else:
        data = {'k': [], 'mu': [], 's': [], 'repetition': [],
                'mean_coalescence_time': [], 'variance_coalescence_time': [],}
        completed_combinations = set()

    total_combinations = len(ks) * len(mus) * len(ss) * repetitions
    remaining_combinations = total_combinations - len(completed_combinations)

    with tqdm(total=remaining_combinations, desc='Calculating Coalescence Times') as pbar:
        count = 0
        for k in ks:
            for mu in mus:
                for s in ss:
                    for i in range(repetitions):

                        if (k, mu, s, i) in completed_combinations:
                            continue

                        tree_path = f"graphs/result_11.1/geo_{i}_edges_mu{mu}_s{s}_tree.txt"
                        final_path = f"graphs/result_11.1/geo_{i}_edges_mu{mu}_s{s}_list.txt"

                        if not os.path.exists(tree_path) or not os.path.exists(final_path):
                            print(f"Skipping missing paths: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Load the tree files
                        tree_samples = count_non_empty_lines(tree_path)
                        final_samples = count_non_empty_lines(final_path)
                        num_samples = min(tree_samples, final_samples, num_samples)

                        if num_samples == 0:
                            print(f"Skipping empty files: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Calculate the coalescence time
                        sample_coalescence_times = []
                        for sample_idx in range(num_samples):
                            tree_dict = build_tree(tree_path, sample_idx)
                            tree = tree_dict[0]
                            distribution = get_final_mutations_L(final_path, sample_idx)
                            path = find_all_nodes(tree, get_final_mutations(final_path, sample_idx))
                            tree.prune(path)
                            coalescence_times = []
                            for _ in range(iterations):
                                try:
                                    coalescence_times.append(get_coalescence_time(tree, random.sample(distribution, 2)))
                                except:
                                    pass
                            sample_avg_time = np.mean(coalescence_times)
                            sample_coalescence_times.append(sample_avg_time)
                            # print(sample_idx)

                        mean_coalescence_time = np.mean(sample_coalescence_times)
                        variance_coalescence_time = np.var(sample_coalescence_times)

                        data['k'].append(k)
                        data['mu'].append(mu)
                        data['s'].append(s)
                        data['repetition'].append(i)

                        data['mean_coalescence_time'].append(mean_coalescence_time)
                        data['variance_coalescence_time'].append(variance_coalescence_time)

                        pbar.update(1)
                        count += 1

                        if count % checkpoint_interval == 0:
                            checkpoint_df = pd.DataFrame(data)
                            checkpoint_df.to_pickle(output_file + ".tmp")
                            print(f"Checkpoint saved at iteration {count}")

    df = pd.DataFrame(data)
    df.to_pickle(output_file)
    print("Final results saved.")

calculate_coalescence("coalescence_geo_20.pkl", checkpoint_interval=50)


Calculating Coalescence Times:   0%|          | 0/21 [00:00<?, ?it/s]

Skipping missing paths: graphs/result_11.1/geo_0_edges_mu0.001_s0.1_tree.txt, graphs/result_11.1/geo_0_edges_mu0.001_s0.1_list.txt


Calculating Coalescence Times: 100%|██████████| 21/21 [08:18<00:00, 23.75s/it]

Final results saved.





In [7]:
def get_sackin_index(tree):
    leaves = tree.get_leaves()  
    total_depth = sum(leaf.get_distance(tree) for leaf in leaves)  
    num_leaves = len(leaves)  
    if num_leaves > 1:
        normalized_index = total_depth / (0.5 * num_leaves * (num_leaves + 1) - 1)
    else:
        normalized_index = 0  
    
    # # tree depth
    # tree_depth = max(leaf.get_distance(tree) for leaf in leaves) if leaves else 0
    
    # # tree width
    # levels = {}
    # for leaf in leaves:
    #     level = leaf.get_distance(tree, topology_only=True)
    #     if level not in levels:
    #         levels[level] = 0
    #     levels[level] += 1
    # tree_width = max(levels.values()) if levels else 0

    return normalized_index

In [10]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_sackin(output_file, checkpoint_interval=50):
    # ks = ["geo"]
    ks = [3, 4, 6]
    mus = [0.001]
    # mus = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001, 0.000001]
    ss = [0.1]#, 0.01, 0, -0.01]
    repetitions = 50 # Number of trees with same parameters
    num_samples = 20  # Number of samples per file

    temp_file = output_file + ".tmp"
    if os.path.exists(temp_file):
        checkpoint_df = pd.read_pickle(temp_file)
        data = checkpoint_df.to_dict(orient='list')
        completed_combinations = set(zip(data['k'], data['mu'], data['s'], data['repetition']))
        print("Resuming from checkpoint.")
    else:
        data = {'k': [], 'mu': [], 's': [], 'repetition': [],
                'sackin': [], 'sackin_variance': []}
        completed_combinations = set()

    total_combinations = len(ks) * len(mus) * len(ss) * repetitions
    remaining_combinations = total_combinations - len(completed_combinations)

    with tqdm(total=remaining_combinations, desc='Calculating Coalescence Times') as pbar:
        count = 0
        for k in ks:
            for mu in mus:
                for s in ss:
                    for i in range(repetitions):

                        if (k, mu, s, i) in completed_combinations:
                            continue
                        
                        # tree_path = f"graphs/result_11.1/geo_{i}_edges_mu{mu}_s{s}_tree.txt"
                        # final_path = f"graphs/result_11.1/geo_{i}_edges_mu{mu}_s{s}_list.txt"
                        tree_path = f"graphs/result_5.16/{k}_regular_{i}_m{mu}_s{s}.txt_tree.txt"
                        final_path = f"graphs/result_5.16/{k}_regular_{i}_m{mu}_s{s}.txt_list.txt"

                        if not os.path.exists(tree_path) or not os.path.exists(final_path):
                            print(f"Skipping missing paths: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Load the tree file
                        tree_samples = count_non_empty_lines(tree_path)
                        final_samples = count_non_empty_lines(final_path)
                        num_samples = min(tree_samples, final_samples, num_samples)

                        if num_samples == 0:
                            print(f"Skipping empty files: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Calculate Sackin index
                        sackinL = []
                        for sample_idx in range(num_samples):
                            tree_dict = build_tree(tree_path, sample_idx)
                            tree = tree_dict[0]
                            distribution = get_final_mutations_L(final_path, sample_idx)
                            path = find_all_nodes(tree, get_final_mutations(final_path, sample_idx))
                            tree.prune(path)
                            sackin_index = get_sackin_index(tree)
                            sackinL.append(sackin_index)
                            # print(sample_idx)

                        data['k'].append(k)
                        data['mu'].append(mu)
                        data['s'].append(s)
                        data['repetition'].append(i)
                        data['sackin'].append(np.mean(sackinL))
                        data['sackin_variance'].append(np.var(sackinL))

                        pbar.update(1)
                        count += 1

                        if count % checkpoint_interval == 0:
                            checkpoint_df = pd.DataFrame(data)
                            checkpoint_df.to_pickle(output_file + ".tmp")
                            print(f"Checkpoint saved at iteration {count}")

    df = pd.DataFrame(data)
    df.to_pickle(output_file)
    print("Final results saved.")

calculate_sackin("sackin_regular_stoch_50.pkl", checkpoint_interval=50)


Calculating Coalescence Times:  33%|███▎      | 50/150 [02:31<04:25,  2.65s/it]

Checkpoint saved at iteration 50


Calculating Coalescence Times:  67%|██████▋   | 100/150 [04:59<02:27,  2.94s/it]

Checkpoint saved at iteration 100


Calculating Coalescence Times: 100%|██████████| 150/150 [07:31<00:00,  3.01s/it]

Checkpoint saved at iteration 150
Final results saved.





In [22]:
from ete3 import Tree
import numpy as np

def compute_subtree_sizes(node):
    """
    Compute S_i (total subtree size including root) and S*_i (excluding root).
    Assigns these values to the node attributes.
    """
    if node.is_leaf():
        node.S_i = 1  # A leaf has size 1 (itself)
        node.S_star_i = 0  # No subtree without root
        return 1, 0  

    subtree_size = 1  # Count itself
    child_sizes = []

    for child in node.children:
        child_s, _ = compute_subtree_sizes(child)
        child_sizes.append(child_s)
        subtree_size += child_s  # Accumulate child subtree sizes
    
    S_star_i = subtree_size - 1  # Exclude root itself
    
    # Store values as attributes inside the node
    node.S_i = subtree_size
    node.S_star_i = S_star_i
    node.child_sizes = child_sizes  # Store sizes of children

    return subtree_size, S_star_i


def compute_balance_scores(node):
    if node.is_leaf() or len(node.children) < 2:
        node.W_i = 0
        return 0
    
    p_ij = np.array([child.S_i / node.S_star_i for child in node.children])
    
    W_i_1 = -np.sum(p_ij * np.log(p_ij) / np.log(len(node.children)))
    
    node.W_i = W_i_1
    return W_i_1

def compute_normalized_balance_index(root):
    """
    Compute the normalized tree balance index J^1.
    """
    internal_nodes = [n for n in root.traverse() if not n.is_leaf()]
    
    S_star_sum = sum(n.S_star_i for n in internal_nodes)
    weighted_sum = sum(n.S_star_i / n.S_i * n.W_i for n in internal_nodes)
    
    return weighted_sum / S_star_sum if S_star_sum > 0 else 0

k = 3
i = 0
mu = 0.001
s = 0
tree_path = f"graphs/results1/{k}_regular_graph/{k}_regular_graph_{i}_mu{mu}_s{s}_tree.txt"
final_path = f"graphs/results1/{k}_regular_graph/{k}_regular_graph_{i}_mu{mu}_s{s}_list.txt"
tree_dict = build_tree(tree_path, 0)


# Assuming root node is labeled as '1'
root = tree_dict[0]

# Step 1: Compute sizes
compute_subtree_sizes(root)

# Step 2: Compute balance scores
for node in root.traverse():
    compute_balance_scores(node)

# Step 3: Compute the normalized balance index
J1 = compute_normalized_balance_index(root)

print("Normalized Tree Balance Index (J1):", J1)


Normalized Tree Balance Index (J1): 0.004066544747433984


In [26]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_coalescence_df_with_progress_and_saves(output_file, checkpoint_interval=50):
    """Calculate mean and variance of coalescence times with progress bar and periodic saves.

    Args:
        output_file (str): Path to the final output file (Pandas DataFrame saved as a pickle file).
        checkpoint_interval (int): Number of iterations after which to save a temporary file.
    """
    ks = ["bn"]#[3, 4, 6, 10]
    mus = [0.001]
    # mus = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001, 0.000001]
    ss = [0.1]#, 0.01, 0, -0.01]
    repetitions = 21
    num_samples = 100  # Number of samples per file
    iterations = 200 # Number of coalescence calculations per sample

    # Load checkpoint if it exists
    temp_file = output_file + ".tmp"
    if os.path.exists(temp_file):
        checkpoint_df = pd.read_pickle(temp_file)
        data = checkpoint_df.to_dict(orient='list')
        completed_combinations = set(zip(data['k'], data['mu'], data['s'], data['repetition']))
        print("Resuming from checkpoint.")
    else:
        data = {'k': [], 'mu': [], 's': [], 'repetition': [],
                'Js': [], 'variance_Js': [],}
        completed_combinations = set()

    # Calculate the total number of combinations for the progress bar
    total_combinations = len(ks) * len(mus) * len(ss) * repetitions
    remaining_combinations = total_combinations - len(completed_combinations)

    # Initialize progress bar
    with tqdm(total=remaining_combinations, desc='Calculating Coalescence Times') as pbar:
        count = 0
        for k in ks:
            for mu in mus:
                for s in ss:
                    for i in range(repetitions):
                        # Skip if this combination has already been completed
                        if (k, mu, s, i) in completed_combinations:
                            continue

                        # tree_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_tree.txt"
                        # final_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_list.txt"
                        tree_path = f"graphs/result_11.8/bn_{i}_mu{mu}_s{s}_tree.txt"
                        final_path = f"graphs/result_11.8/bn_{i}_mu{mu}_s{s}_list.txt"

                        # Check if paths exist; skip if they don't
                        if not os.path.exists(tree_path) or not os.path.exists(final_path):
                            print(f"Skipping missing paths: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Load the tree file (assuming the sample index isn't needed)
                        tree_samples = count_non_empty_lines(tree_path)
                        final_samples = count_non_empty_lines(final_path)
                        num_samples = min(tree_samples, final_samples, num_samples)

                        if num_samples == 0:
                            print(f"Skipping empty files: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Calculate the coalescence time for each sample in the file
                        Js = []
                        for sample_idx in range(num_samples):
                            tree_dict = build_tree(tree_path, sample_idx)
                            tree = tree_dict[0]

                            compute_subtree_sizes(tree)

                            for node in tree.traverse():
                                compute_balance_scores(node)

                            J1 = compute_normalized_balance_index(tree)

                            Js.append(J1)
                            # print(sample_idx)

                        # Calculate mean and variance across all samples
                        mean_Js = np.mean(Js)
                        variance_Js = np.var(Js)

                        # Append results to the data dictionary
                        data['k'].append(k)
                        data['mu'].append(mu)
                        data['s'].append(s)
                        data['repetition'].append(i)
                        data['Js'].append(np.mean(mean_Js))
                        data['variance_Js'].append(np.var(mean_Js))


                        # Update progress bar after each combination
                        pbar.update(1)
                        count += 1

                        # Save a checkpoint after every `checkpoint_interval` iterations
                        if count % checkpoint_interval == 0:
                            checkpoint_df = pd.DataFrame(data)
                            checkpoint_df.to_pickle(output_file + ".tmp")
                            print(f"Checkpoint saved at iteration {count}")

    # Final save: Create a DataFrame and save as the final file
    df = pd.DataFrame(data)
    df.to_pickle(output_file)
    print("Final results saved.")

# Call the function to write the stats to a pandas DataFrame file with checkpointing
calculate_coalescence_df_with_progress_and_saves("Js_bn_9.pkl", checkpoint_interval=50)


Calculating Coalescence Times:   0%|          | 0/21 [00:00<?, ?it/s]

Skipping missing paths: graphs/result_11.8/bn_0_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_0_mu0.001_s0.1_list.txt


Calculating Coalescence Times: 100%|██████████| 21/21 [01:22<00:00,  3.94s/it]

Skipping missing paths: graphs/result_11.8/bn_10_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_10_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_11_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_11_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_12_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_12_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_13_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_13_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_14_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_14_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_15_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_15_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_16_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_16_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_17_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_17_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_18




In [16]:
def calculate_subtree_sizes(node):
    """Recursively calculates the size of each subtree."""
    if not node.children:
        node.size = 1
        return 1
    
    size = 1 + sum(calculate_subtree_sizes(child) for child in node.children)
    node.size = size  # Store size in node attribute
    return size

def get_relative_abundances(node):
    """Computes relative subtree sizes for Hill number calculations."""
    total_size = node.size
    abundances = [child.size / total_size for child in node.children]
    return abundances

def hill_number(root, q):
    """Calculates the Hill number of order q for the given tree."""
    calculate_subtree_sizes(root)
    abundances = get_relative_abundances(root)
    
    if q == 1:
        # Shannon entropy-based diversity (q = 1) using exp(sum p log p)
        return np.exp(-sum(p * np.log(p) for p in abundances if p > 0))
    
    return (sum(p ** q for p in abundances if p > 0)) ** (1 / (1 - q))

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_coalescence_df_with_progress_and_saves(output_file, checkpoint_interval=50):
    """Calculate mean and variance of coalescence times with progress bar and periodic saves.

    Args:
        output_file (str): Path to the final output file (Pandas DataFrame saved as a pickle file).
        checkpoint_interval (int): Number of iterations after which to save a temporary file.
    """
    ks = [3, 4, 6]
    mus = [0.001]
    # mus = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001, 0.000001]
    ss = [0.1]#, 0.01, 0, -0.01]
    repetitions = 50
    num_samples = 20  # Number of samples per file
    iterations = 200 # Number of coalescence calculations per sample

    # Load checkpoint if it exists
    temp_file = output_file + ".tmp"
    if os.path.exists(temp_file):
        checkpoint_df = pd.read_pickle(temp_file)
        data = checkpoint_df.to_dict(orient='list')
        completed_combinations = set(zip(data['k'], data['mu'], data['s'], data['repetition']))
        print("Resuming from checkpoint.")
    else:
        data = {'k': [], 'mu': [], 's': [], 'repetition': [],
                'Js': [], 'variance_Js': [],}
        completed_combinations = set()

    # Calculate the total number of combinations for the progress bar
    total_combinations = len(ks) * len(mus) * len(ss) * repetitions
    remaining_combinations = total_combinations - len(completed_combinations)

    # Initialize progress bar
    with tqdm(total=remaining_combinations, desc='Calculating Coalescence Times') as pbar:
        count = 0
        for k in ks:
            for mu in mus:
                for s in ss:
                    for i in range(repetitions):
                        # Skip if this combination has already been completed
                        if (k, mu, s, i) in completed_combinations:
                            continue

                        # tree_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_tree.txt"
                        # final_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_list.txt"
                        # tree_path = f"graphs/result_11.8/bn_{i}_mu{mu}_s{s}_tree.txt"
                        # final_path = f"graphs/result_11.8/bn_{i}_mu{mu}_s{s}_list.txt"

                        tree_path = f"graphs/result_5.16/{k}_regular_{i}_m{mu}_s{s}.txt_tree.txt"
                        final_path = f"graphs/result_5.16/{k}_regular_{i}_m{mu}_s{s}.txt_list.txt"

                        # Check if paths exist; skip if they don't
                        if not os.path.exists(tree_path) or not os.path.exists(final_path):
                            print(f"Skipping missing paths: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Load the tree file (assuming the sample index isn't needed)
                        tree_samples = count_non_empty_lines(tree_path)
                        final_samples = count_non_empty_lines(final_path)
                        num_samples = min(tree_samples, final_samples, num_samples)

                        if num_samples == 0:
                            print(f"Skipping empty files: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Calculate the coalescence time for each sample in the file
                        Hills = []
                        for sample_idx in range(num_samples):
                            tree_dict = build_tree(tree_path, sample_idx)
                            tree = tree_dict[0]

                            Hill = hill_number(tree, 1)

                            Hills.append(Hill)
                            # print(sample_idx)
                        # Calculate mean and variance across all samples
                        mean_Hills = np.mean(Hills)
                        variance_Hills = np.var(Hills)

                        # Append results to the data dictionary
                        data['k'].append(k)
                        data['mu'].append(mu)
                        data['s'].append(s)
                        data['repetition'].append(i)
                        data['Js'].append(np.mean(mean_Hills))
                        data['variance_Js'].append(np.var(mean_Hills))


                        # Update progress bar after each combination
                        pbar.update(1)
                        count += 1

                        # Save a checkpoint after every `checkpoint_interval` iterations
                        if count % checkpoint_interval == 0:
                            checkpoint_df = pd.DataFrame(data)
                            checkpoint_df.to_pickle(output_file + ".tmp")
                            print(f"Checkpoint saved at iteration {count}")

    # Final save: Create a DataFrame and save as the final file
    df = pd.DataFrame(data)
    df.to_pickle(output_file)
    print("Final results saved.")

# Call the function to write the stats to a pandas DataFrame file with checkpointing
calculate_coalescence_df_with_progress_and_saves("Hills_regular_stoch_50.pkl", checkpoint_interval=50)


Calculating Coalescence Times:   0%|          | 0/21 [00:00<?, ?it/s]

Skipping missing paths: graphs/result_11.8/bn_0_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_0_mu0.001_s0.1_list.txt


Calculating Coalescence Times: 100%|██████████| 21/21 [01:12<00:00,  3.47s/it]

Skipping missing paths: graphs/result_11.8/bn_10_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_10_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_11_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_11_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_12_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_12_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_13_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_13_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_14_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_14_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_15_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_15_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_16_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_16_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_17_mu0.001_s0.1_tree.txt, graphs/result_11.8/bn_17_mu0.001_s0.1_list.txt
Skipping missing paths: graphs/result_11.8/bn_18




In [12]:
def calculate_tree_metrics(root):
    from collections import deque

    # Tree size: Total number of nodes
    def get_tree_size(node):
        if not node:
            return 0
        return 1 + sum(get_tree_size(child) for child in node.children)

    # Tree height: Maximum depth of the tree
    def get_tree_height(node):
        if not node:
            return 0
        return 1 + max((get_tree_height(child) for child in node.children), default=0)

    # Tree width: Maximum number of nodes at any level
    def get_tree_width(node):
        if not node:
            return 0
        queue = deque([(node, 0)])  # (node, level)
        level_count = {}
        while queue:
            current, level = queue.popleft()
            level_count[level] = level_count.get(level, 0) + 1
            for child in current.children:
                queue.append((child, level + 1))
        return sum(level_count.values()) / len(level_count) if level_count else 0

    # Average terminal branch length
    def get_average_terminal_branch_length(node):
        total_terminal_length = 0
        leaf_count = 0

        def traverse(node):
            nonlocal total_terminal_length, leaf_count
            for child in node.children:
                if not child.children:  # It's a leaf
                    total_terminal_length += child.dist
                    leaf_count += 1
                else:
                    traverse(child)

        traverse(node)

        if leaf_count == 0:
            return 0
        return total_terminal_length / leaf_count

    # tree_size = get_tree_size(root)
    # tree_height = get_tree_height(root)
    tree_width = get_tree_width(root)
    # avg_terminal_branch_length = get_average_terminal_branch_length(root)

    # return {
    #     "tree_size": tree_size,
    #     "tree_width": tree_width,
    #     "tree_height": tree_height,
    #     "avg_terminal_branch_length": avg_terminal_branch_length
    # }
    return {
        "tree_width": tree_width,
    }

In [14]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def calculate_coalescence_df_with_progress_and_saves(output_file, checkpoint_interval=50):
    """Calculate mean and variance of coalescence times with progress bar and periodic saves.

    Args:
        output_file (str): Path to the final output file (Pandas DataFrame saved as a pickle file).
        checkpoint_interval (int): Number of iterations after which to save a temporary file.
    """
    ks = [3, 4, 6, 10]
    mus = [0.001]
    # mus = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0001, 0.00001, 0.000001]
    ss = [0.1]#, 0.01, 0, -0.01]
    repetitions = 50
    num_samples = 100  # Number of samples per file
    iterations = 200 # Number of coalescence calculations per sample

    # Load checkpoint if it exists
    temp_file = output_file + ".tmp"
    if os.path.exists(temp_file):
        checkpoint_df = pd.read_pickle(temp_file)
        data = checkpoint_df.to_dict(orient='list')
        completed_combinations = set(zip(data['k'], data['mu'], data['s'], data['repetition']))
        print("Resuming from checkpoint.")
    else:
        data = {'k': [], 'mu': [], 's': [], 'repetition': [],
                'tree_width': []}
        completed_combinations = set()

    # Calculate the total number of combinations for the progress bar
    total_combinations = len(ks) * len(mus) * len(ss) * repetitions
    remaining_combinations = total_combinations - len(completed_combinations)

    # Initialize progress bar
    with tqdm(total=remaining_combinations, desc='Calculating Coalescence Times') as pbar:
        count = 0
        for k in ks:
            for mu in mus:
                for s in ss:
                    for i in range(repetitions):
                        # Skip if this combination has already been completed
                        if (k, mu, s, i) in completed_combinations:
                            continue

                        tree_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_tree.txt"
                        final_path = f"graphs/result_11.13/{k}_regular_{i}_mu{mu}_s{s}_list.txt"
                        # tree_path = f"graphs/result_11.8/sw_{i}_mu{mu}_s{s}_tree.txt"
                        # final_path = f"graphs/result_11.8/sw_{i}_mu{mu}_s{s}_list.txt"

                        # Check if paths exist; skip if they don't
                        if not os.path.exists(tree_path) or not os.path.exists(final_path):
                            print(f"Skipping missing paths: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Load the tree file (assuming the sample index isn't needed)
                        tree_samples = count_non_empty_lines(tree_path)
                        final_samples = count_non_empty_lines(final_path)
                        num_samples = min(tree_samples, final_samples, num_samples)

                        if num_samples == 0:
                            print(f"Skipping empty files: {tree_path}, {final_path}")
                            pbar.update(1)
                            continue

                        # Calculate the coalescence time for each sample in the file
                        tree_size = []
                        tree_width = []
                        tree_height = []
                        terminal_branch_length = []
                        for sample_idx in range(num_samples):
                            tree_dict = build_tree(tree_path, sample_idx)
                            tree = tree_dict[0]

                            metrics = calculate_tree_metrics(tree)

                            # tree_size.append(metrics["tree_size"])
                            tree_width.append(metrics["tree_width"])
                            # tree_height.append(metrics["tree_height"]) 
                            # terminal_branch_length.append(metrics["avg_terminal_branch_length"])
                            # print(sample_idx)
                        # Calculate mean and variance across all samples
                        # mean_tree_size = np.mean(tree_size)
                        mean_tree_width = np.mean(tree_width)
                        # mean_tree_height = np.mean(tree_height)

                        # Append results to the data dictionary
                        data['k'].append(k)
                        data['mu'].append(mu)
                        data['s'].append(s)
                        data['repetition'].append(i)
                        # data['tree_size'].append(np.mean(mean_tree_size))
                        data['tree_width'].append(np.mean(mean_tree_width))
                        # data['tree_height'].append(np.mean(mean_tree_height))
                        # data['terminal_branch_length'].append(np.mean(terminal_branch_length))

                        # Update progress bar after each combination
                        pbar.update(1)
                        count += 1

                        # Save a checkpoint after every `checkpoint_interval` iterations
                        if count % checkpoint_interval == 0:
                            checkpoint_df = pd.DataFrame(data)
                            checkpoint_df.to_pickle(output_file + ".tmp")
                            print(f"Checkpoint saved at iteration {count}")

    # Final save: Create a DataFrame and save as the final file
    df = pd.DataFrame(data)
    df.to_pickle(output_file)
    print("Final results saved.")

# Call the function to write the stats to a pandas DataFrame file with checkpointing
calculate_coalescence_df_with_progress_and_saves("Width_regular_50.pkl", checkpoint_interval=50)


Calculating Coalescence Times:  25%|██▌       | 50/200 [11:46<37:37, 15.05s/it]

Checkpoint saved at iteration 50


Calculating Coalescence Times:  50%|█████     | 100/200 [24:27<26:09, 15.69s/it]

Checkpoint saved at iteration 100


Calculating Coalescence Times:  75%|███████▌  | 150/200 [37:07<12:29, 15.00s/it]

Checkpoint saved at iteration 150


Calculating Coalescence Times: 100%|██████████| 200/200 [49:46<00:00, 14.93s/it]

Checkpoint saved at iteration 200
Final results saved.





In [28]:
import pandas as pd
df31 = pd.read_pickle("Hills_regular_50.pkl")
df31

FileNotFoundError: [Errno 2] No such file or directory: 'Hills_regular_50.pkl'

In [1]:
import pandas as pd
df = pd.read_pickle("combined_df.pkl")
df

Unnamed: 0,graph,mu,s,index,Js,Hill,sackin,mean_coalescence_time,tree_size,tree_width,...,acc,ConnectivityIndex,AlgebraicConnectivity,area,ratio_branch_to_height,ratio_branch_to_size,ratio_sackin_to_size,ratio_sackin_to_height,ratio_sackin_to_area,ratio_sackin_to_Hill
0,pa,0.001,0.1,1,0.000233,1.085766,68.794408,1.16770,10005.72,110.49,...,0.762193,0.0,0.000000,28330.7409,0.496995,0.012736,0.006876,0.268298,0.002428,63.360226
1,pa,0.001,0.1,2,0.000229,1.085674,72.209340,1.09025,10016.88,110.57,...,0.799076,2.0,0.556971,28502.7346,0.496810,0.012785,0.007209,0.280120,0.002533,66.511078
2,pa,0.001,0.1,3,0.000227,1.081703,70.585114,1.09475,10007.26,111.24,...,0.748963,3.0,1.269001,28736.6292,0.497369,0.012839,0.007053,0.273236,0.002456,65.253703
3,pa,0.001,0.1,4,0.000223,1.080297,75.577822,1.03860,10000.49,111.79,...,0.786613,4.0,2.103527,29142.5351,0.498218,0.012987,0.007557,0.289915,0.002593,69.960217
4,pa,0.001,0.1,5,0.000224,1.080507,72.608354,1.16805,10007.53,111.57,...,0.805082,5.0,2.841214,29016.0099,0.496282,0.012897,0.007255,0.279188,0.002502,67.198407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,bn,0.001,0.1,5,0.000265,1.105629,67.215759,1.47990,10009.38,116.32,...,0.788750,5.0,0.024320,28005.2032,0.496811,0.011950,0.006715,0.279182,0.002400,60.794153
281,bn,0.001,0.1,6,0.000263,1.092483,67.034074,1.33390,10008.48,115.85,...,0.813978,5.0,0.037964,27993.9940,0.496536,0.011988,0.006698,0.277413,0.002395,61.359365
282,bn,0.001,0.1,7,0.000253,1.089900,67.173369,1.13520,9992.91,114.36,...,0.806132,5.0,0.076205,28158.8628,0.495821,0.012217,0.006722,0.272807,0.002386,61.632587
283,bn,0.001,0.1,8,0.000244,1.087612,66.916265,1.11840,10017.46,113.02,...,0.809710,5.0,0.153080,28322.8120,0.497115,0.012436,0.006680,0.267024,0.002363,61.525849
