# MomentsLD makes me sad :(

In [1]:
import os
import moments
from tqdm import tqdm
import numpy as np
import msprime
import demes
import ray
import json
import pickle
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

## Functions

I want to see what's going wrong with my MomentsLD specific scripts. I will copy and paste them here and will debug.

Functions in the preprocessing module

In [3]:
def sample_params(lower_bound_params, upper_bound_params):
    sampled_params = {}
    for key in lower_bound_params:
        lower_bound = lower_bound_params[key]
        upper_bound = upper_bound_params[key]
        sampled_value = np.random.uniform(lower_bound, upper_bound)
        sampled_params[key] = int(sampled_value)


        # Check if the sampled parameter is equal to the mean of the uniform distribution
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            # Add a small random value to avoid exact mean, while keeping within bounds
            adjustment = np.random.uniform(-0.1 * (upper_bound - lower_bound), 0.1 * (upper_bound - lower_bound))
            adjusted_value = sampled_value + adjustment
            
            # Ensure the adjusted value is still within the bounds
            adjusted_value = max(min(adjusted_value, upper_bound), lower_bound)
            sampled_params[key] = int(adjusted_value)

    return sampled_params

In [3]:
def run_msprime_replicates(sampled_params, experiment_config, folderpath):

    if experiment_config["demographic_model"] == "bottleneck_model":
        demographic_model = demographic_models.bottleneck_model

    elif experiment_config["demographic_model"] == "split_isolation_model":
        demographic_model = demographic_models.split_isolation_model_simulation

    else:
        raise ValueError(f"Unsupported demographic model: {experiment_config['demographic_model']}")

    g = demographic_model(sampled_params)
    demog = msprime.Demography.from_demes(g)

    # Create directory for storing VCFs
    output_folder = folderpath
    os.makedirs(output_folder, exist_ok=True)

    print(f"Samples: {experiment_config['num_samples']}")
    
    tree_sequences = msprime.sim_ancestry(
        samples = experiment_config['num_samples'],
        # {"N1": experiment_config['num_samples']['N1'], "N2": experiment_config['num_samples']['N2']},
        demography=demog,
        sequence_length=experiment_config['genome_length'],
        recombination_rate=experiment_config['recombination_rate'],
        num_replicates=experiment_config['num_reps'],
        random_seed=experiment_config['seed'],
    )

    # List to store file paths of the generated VCFs
    vcf_filepaths = []

    for ii, ts in enumerate(tree_sequences):
        ts = msprime.sim_mutations(ts, rate=experiment_config['mutation_rate'], random_seed=ii + 1)
        vcf_name = os.path.join(output_folder, f'rep.{ii}.vcf')
        with open(vcf_name, "w+") as fout:
            ts.write_vcf(fout, allow_position_zero=True)
        os.system(f"gzip {vcf_name}")

        # Store the compressed VCF file path
        vcf_filepaths.append(f"{vcf_name}.gz")

    # Write the metadata file with all VCF file paths
    metadata_file = os.path.join(output_folder, "metadata.txt")
    with open(metadata_file, "w+") as metafile:
        metafile.write("\n".join(vcf_filepaths))

    print(f"Metadata file written to {metadata_file}")

In [4]:
def write_samples_and_rec_map(experiment_config, folderpath):

    # Define the file paths
    samples_file = os.path.join(folderpath, f"samples.txt")
    flat_map_file = os.path.join(folderpath, f"flat_map.txt")

    print(f'Samples filepath: {samples_file}')
    print(f'Flat map filepath: {flat_map_file}')

    # Open and write the sample file
    with open(samples_file, "w+") as fout:
        fout.write("sample\tpop\n")

        # Dynamically define samples based on the num_samples dictionary
        sample_idx = 0  # Initialize sample index
        for pop_name, sample_size in experiment_config['num_samples'].items():
            for _ in range(sample_size):
                fout.write(f"tsk_{sample_idx}\t{pop_name}\n")
                sample_idx += 1

    # Write the recombination map file
    with open(flat_map_file, "w+") as fout:
        fout.write("pos\tMap(cM)\n")
        fout.write("0\t0\n")
        fout.write(f"{experiment_config['genome_length']}\t{experiment_config['recombination_rate'] * experiment_config['genome_length'] * 100}\n")

Functions in the demographic_models module

In [4]:
def split_isolation_model_simulation(sampled_params):

    # Unpack the sampled parameters
    Na, N1, N2, m, t_split = (
        sampled_params["Na"],  # Effective population size of the ancestral population
        sampled_params["N1"],  # Size of population 1 after split
        sampled_params["N2"],  # Size of population 2 after split
        sampled_params["m"],   # Migration rate between populations
        sampled_params["t_split"],  # Time of the population split (in generations)
    )

    b = demes.Builder()
    b.add_deme("Na", epochs=[dict(start_size=Na, end_time=t_split)])
    b.add_deme("N1", ancestors=["Na"], epochs=[dict(start_size=N1)])
    b.add_deme("N2", ancestors=["Na"], epochs=[dict(start_size=N2)])
    b.add_migration(demes=["N1", "N2"], rate=m)
    g = b.resolve()
    return g

Functions for the MomentsLD inference part

In [5]:
# Define your function with Ray's remote decorator
@ray.remote
def get_LD_stats(vcf_file, r_bins, flat_map_path, pop_file_path):
    ray.init(ignore_reinit_error=True)
    ld_stats = moments.LD.Parsing.compute_ld_statistics( #type:ignore
        vcf_file,
        rec_map_file=flat_map_path,
        pop_file=pop_file_path,
        pops=["N1", "N2"], # TODO: Change later
        r_bins=r_bins,
        report=False,
    )

    return ld_stats


In [18]:
def compute_ld_stats_parallel(folderpath, num_windows, r_bins):
    
    flat_map_path = os.path.join(folderpath, "flat_map.txt")
    pop_file_path = os.path.join(folderpath, "samples.txt")
    vcf_files = [
        os.path.join(folderpath, f"rep.{rep_ii}.vcf.gz")
        for rep_ii in range(num_windows)
    ]

    # Launch the tasks in parallel using Ray
    futures = [
        get_LD_stats.remote(vcf_file, r_bins, flat_map_path, pop_file_path)
        for vcf_file in vcf_files
    ]

    # Wait for all the tasks to complete and retrieve results
    results = ray.get(futures)
    return results

In [8]:
def run_inference_momentsLD(folderpath, demographic_model, p_guess, num_reps):
    """
    This should do the parameter inference for momentsLD
    index: unique simulation number
    """

    r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

    print("parsing LD statistics")


    ld_stats = {}
    results = compute_ld_stats_parallel(folderpath, num_reps, r_bins)

    for i, result in enumerate(results):
        ld_stats[i] = result

    # print("computing mean and varcov matrix from LD statistics sums")
    mv = moments.LD.Parsing.bootstrap_data(ld_stats)  # type: ignore
    # print("SHAPE OF THE COVARIANCE MATRIX")
    # print(mv["varcovs"][-1].shape)
    # mv["varcovs"][-1].shape = (1, 1)

    if demographic_model == "bottleneck_model":
        demo_func = moments.LD.Demographics1D.three_epoch # type: ignore

    elif demographic_model == "split_isolation_model":
        demo_func = demographic_models.split_isolation_model_momentsLD

    else:
        raise ValueError(f"Unsupported demographic model: {demographic_model}")

    # Set up the initial guess
    p_guess = moments.LD.Util.perturb_params(p_guess, fold=0.1) # type: ignore
    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
        p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, maxiter = 100, verbose = 3
    )

    physical_units = moments.LD.Util.rescale_params( # type: ignore
        opt_params, ["nu", "nu", "T", "m", "Ne"]
)

    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
    p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, verbose = 3
    )

    opt_params_dict = {}
    if demographic_model == "bottleneck_model":

        opt_params_dict = {
            # "N0": opt_params[4],
            "Nb": opt_params[0] * opt_params[4],
            "N_recover": opt_params[1] * opt_params[4],
            "t_bottleneck_start": (opt_params[2]+opt_params[3]) * 2 * opt_params[4],
            "t_bottleneck_end": opt_params[3] * 2 * opt_params[4]
        }

    elif demographic_model == "split_isolation_model":
        physical_units = moments.LD.Util.rescale_params( #type:ignore
            opt_params, ["nu", "nu", "T", "m", "Ne"]
        )

        print(physical_units)

        opt_params_dict = {
            "N1": physical_units[0],
            "N2": physical_units[1],
            "t_split": physical_units[2],
            "m": physical_units[3], 
            'Na': physical_units[4]
        }

        print("best fit parameters:")
        print(f"  N(deme0)         :  {physical_units[0]:.1f}")
        print(f"  N(deme1)         :  {physical_units[1]:.1f}")
        print(f"  Div. time (gen)  :  {physical_units[2]:.1f}")
        print(f"  Migration rate   :  {physical_units[3]:.6f}")
        print(f"  N(ancestral)     :  {physical_units[4]:.1f}")
    
    # print(f'Moments LD results: {opt_params_dict}')

    return opt_params_dict


## Driver

In [9]:
# Load in the experiment_config.json
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [None]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

In [12]:
folderpath = f"/sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/{experiment_config['demographic_model']}/"

In [None]:
run_msprime_replicates(sampled_params, experiment_config, folderpath)

In [None]:
write_samples_and_rec_map(experiment_config, folderpath)

In [16]:
demographic_model = "split_isolation_model"

In [17]:
p_guess = p_guess = [0.01, 0.8, 0.075, 0.05, 10000]
num_reps = 100

In [None]:
opt_params_dict = run_inference_momentsLD(folderpath, demographic_model, p_guess, num_reps)

In [None]:
opt_params_dict

In [None]:
sampled_params

## Windowed version of my code

In [2]:
def sample_params(lower_bound_params, upper_bound_params):
    sampled_params = {}
    for key in lower_bound_params:
        lower_bound = lower_bound_params[key]
        upper_bound = upper_bound_params[key]
        sampled_value = np.random.uniform(lower_bound, upper_bound)
        sampled_params[key] = int(sampled_value)

        # Check if the sampled parameter is equal to the mean of the uniform distribution
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            # Add a small random value to avoid exact mean, while keeping within bounds
            adjustment = np.random.uniform(-0.1 * (upper_bound - lower_bound), 0.1 * (upper_bound - lower_bound))
            adjusted_value = sampled_value + adjustment
            
            # Ensure the adjusted value is still within the bounds
            adjusted_value = max(min(adjusted_value, upper_bound), lower_bound)
            sampled_params[key] = int(adjusted_value)

    return sampled_params

In [3]:
def split_isolation_model_simulation(sampled_params):

    # Unpack the sampled parameters
    Na, N1, N2, m, t_split = (
        sampled_params["Na"],  # Effective population size of the ancestral population
        sampled_params["N1"],  # Size of population 1 after split
        sampled_params["N2"],  # Size of population 2 after split
        sampled_params["m"],   # Migration rate between populations
        sampled_params["t_split"],  # Time of the population split (in generations)
    )

    b = demes.Builder()
    b.add_deme("Na", epochs=[dict(start_size=Na, end_time=t_split)])
    b.add_deme("N1", ancestors=["Na"], epochs=[dict(start_size=N1)])
    b.add_deme("N2", ancestors=["Na"], epochs=[dict(start_size=N2)])
    b.add_migration(demes=["N1", "N2"], rate=m)
    g = b.resolve()
    return g

In [4]:
# Define your function with Ray's remote decorator
@ray.remote
def get_LD_stats(vcf_file, r_bins, flat_map_path, pop_file_path):
    ray.init(ignore_reinit_error=True)
    ld_stats = moments.LD.Parsing.compute_ld_statistics( #type:ignore
        vcf_file,
        rec_map_file=flat_map_path,
        pop_file=pop_file_path,
        pops=["N1", "N2"], # TODO: Change later
        r_bins=r_bins,
        report=False
    )

    return ld_stats

In [5]:
def compute_ld_stats_parallel(folderpath, num_windows, r_bins):
    
    flat_map_path = os.path.join(folderpath, "flat_map.txt")
    pop_file_path = os.path.join(folderpath, "samples.txt")
    vcf_files = [
        os.path.join(folderpath, f"rep.{rep_ii}.vcf.gz")
        for rep_ii in range(num_windows)
    ]

    # Launch the tasks in parallel using Ray
    futures = [
        get_LD_stats.remote(vcf_file, r_bins, flat_map_path, pop_file_path)
        for vcf_file in vcf_files
    ]

    # Wait for all the tasks to complete and retrieve results
    results = ray.get(futures)
    return results

In [6]:
def simulate_chromosome(experiment_config, sampled_params, num_samples, demographic_model, length=1e7, mutation_rate=5.7e-9, recombination_rate = 3.386e-9, **kwargs):
    g = demographic_model(sampled_params)

    demog = msprime.Demography.from_demes(g)

    # Dynamically define the samples using msprime.SampleSet, based on the sample_sizes dictionary
    # samples = [
    #     msprime.SampleSet(sample_size, population=pop_name, ploidy=1)
    #     for pop_name, sample_size in num_samples.items()
    # ]

    samples = {"N1": experiment_config['num_samples']['N1'], "N2": experiment_config['num_samples']['N2']}

    # Simulate ancestry for two populations (joint simulation)
    ts = msprime.sim_ancestry(
        samples=samples,  # Two populations
        demography=demog,
        sequence_length=length,
        recombination_rate=recombination_rate,
        random_seed=experiment_config['seed'],
    )
    
    # Simulate mutations over the ancestry tree sequence
    ts = msprime.sim_mutations(ts, rate=mutation_rate)

    return ts

In [7]:
def generate_window(ts, window_length):
    """
    Generate a random window from a tree sequence.
    
    Parameters:
    -----------
    ts : tskit.TreeSequence
        The input tree sequence
    window_length : float
        Length of the window in the same units as ts.sequence_length
        
    Returns:
    --------
    tskit.TreeSequence
        A new tree sequence containing only the specified window
    """
    # Ensure window isn't longer than sequence
    if window_length > ts.sequence_length:
        raise ValueError("Window length cannot be larger than sequence length")
    
    # Generate random start position
    max_start = ts.sequence_length - window_length
    start = np.random.uniform(0, max_start)
    end = start + window_length
    
    # Extract window
    return ts.keep_intervals([[start, end]]).trim()

In [8]:
def run_msprime_replicates(ts, experiment_config, window_number, folderpath):

    folderpath = os.path.join(folderpath, f"window_{window_number}")

    # Create directory for storing VCFs
    os.makedirs(folderpath, exist_ok=True)

    # Generate random windows
    window = generate_window(ts, experiment_config['window_length'])

    # List to store file paths of the generated VCFs
    vcf_filepath = []

    # Iterate over windows and write VCFs
    vcf_name = os.path.join(folderpath, f'window.{window_number}.vcf')
    with open(vcf_name, "w+") as fout:
        window.write_vcf(fout, allow_position_zero=True)
        
    # Compress the VCF file
    os.system(f"gzip {vcf_name}")
    
    # # Store the compressed VCF file path
    vcf_filepath.append(f"{vcf_name}.gz")
    
    # Write the metadata file with all VCF file paths
    metadata_file = os.path.join(folderpath, "individual_file_metadata.txt")
    with open(metadata_file, "w+") as metafile:
        metafile.write(vcf_name)

In [9]:
def write_samples_and_rec_map(experiment_config, window_number, folderpath):

    folderpath = os.path.join(folderpath, f"window_{window_number}")

    # Define the file paths
    samples_file = os.path.join(folderpath, f"samples.txt")
    flat_map_file = os.path.join(folderpath, f"flat_map.txt")

    # Open and write the sample file
    with open(samples_file, "w+") as fout:
        fout.write("sample\tpop\n")

        # Dynamically define samples based on the num_samples dictionary
        sample_idx = 0  # Initialize sample index
        for pop_name, sample_size in experiment_config['num_samples'].items():
            for _ in range(sample_size):
                fout.write(f"tsk_{sample_idx}\t{pop_name}\n")
                sample_idx += 1

    # Write the recombination map file
    with open(flat_map_file, "w+") as fout:
        fout.write("pos\tMap(cM)\n")
        fout.write("0\t0\n")
        fout.write(f"{experiment_config['genome_length']}\t{experiment_config['recombination_rate'] * experiment_config['genome_length'] * 100}\n")

In [10]:
# Load in the experiment_config.json
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [11]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

{'t_split': 1531, 'N1': 7269, 'N2': 1361, 'Na': 15802, 'm': 0}


First let's simulate the chromosome

In [12]:
from src.demographic_models import split_isolation_model_simulation
demographic_model = split_isolation_model_simulation

In [13]:
ts = simulate_chromosome(experiment_config, sampled_params, num_samples = experiment_config['num_samples'], demographic_model = demographic_model, length=experiment_config['genome_length'], mutation_rate=experiment_config['mutation_rate'], recombination_rate = experiment_config['recombination_rate'])
# Save the tree sequence to a file
ts.dump("big_sequence.trees")

In [14]:
import tskit

# Load the tree sequence
ts = tskit.load("big_sequence.trees")

In [15]:
ts

Tree Sequence,Unnamed: 1
Trees,326743
Sequence Length,100000000.0
Time Units,generations
Sample Nodes,80
Total Size,70.8 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1108836,33.8 MiB,
Individuals,40,1.1 KiB,
Migrations,0,8 Bytes,
Mutations,389006,13.7 MiB,
Nodes,208095,5.6 MiB,
Populations,3,297 Bytes,✅
Provenances,2,2.6 KiB,
Sites,388204,9.3 MiB,


Now let's create the windows

In [23]:
from tqdm import tqdm 
for i in tqdm(range(experiment_config['num_windows'])):
    run_msprime_replicates(ts, experiment_config, i, '/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/')
    write_samples_and_rec_map(experiment_config, i, '/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/')
    

100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


In [24]:
r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

In [25]:
def compute_ld_stats_parallel(num_windows, r_bins):
    
    flat_map_path = os.path.join('/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/window_1', "flat_map.txt")
    pop_file_path = os.path.join('/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/window_1', "samples.txt")
    vcf_files = [
        os.path.join('/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows', f"window_{i}/window.{i}.vcf.gz")
        for i in range(num_windows)
    ]

    # Launch the tasks in parallel using Ray
    futures = [
        get_LD_stats.remote(vcf_file, r_bins, flat_map_path, pop_file_path)
        for vcf_file in vcf_files
    ]

    # Wait for all the tasks to complete and retrieve results
    results = ray.get(futures)
    return results


In [26]:
def run_inference_momentsLD(demographic_model, p_guess, num_reps):
    """
    This should do the parameter inference for momentsLD
    index: unique simulation number
    """

    r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

    print("parsing LD statistics")


    ld_stats = {}
    results = compute_ld_stats_parallel(num_reps, r_bins)

    #print(results[0])
    # tl_sums0 = results[0]['sums'][0]
    # tl_sums1 = results[1]['sums'][0]
    # print(tl_sums0)
    # print(tl_sums1)
    # assert False
    #dims = (num_reps, ) + results[0]['sums'].shape
    #print(dims)
    #dbg_stats = np.zeros(dims)
    #for i, result in enumerate(results):
    #    dbg_stats[i] = result['sums']
    #print(dbg_stats[:, 0])
    #assert False

    for i, result in enumerate(results):
        ld_stats[i] = result

    # print("computing mean and varcov matrix from LD statistics sums")
    mv = moments.LD.Parsing.bootstrap_data(ld_stats)  # type: ignore
    # print("SHAPE OF THE COVARIANCE MATRIX")
    # print(mv["varcovs"][-1].shape)
    # mv["varcovs"][-1].shape = (1, 1)

    print(mv["stats"][0])
    for i, vc in enumerate(mv["varcovs"]):
        #print(vc)
        print(vc.shape)
        print(np.sum(np.isclose(np.diag(vc), 0)))
        # print(np.linalg.eigh(vc)[0])
        print(mv["stats"][0][np.argmin(np.diag(vc))])

    # assert False

    # Let's save the mv object 
    with open('/sietch_colab/akapoor/Demographic_Inference/mv.pkl', 'wb') as f:
        pickle.dump(mv, f)

    if demographic_model == "bottleneck_model":
        demo_func = moments.LD.Demographics1D.three_epoch # type: ignore

    elif demographic_model == "split_isolation_model":
        demo_func = demographic_models.split_isolation_model_momentsLD

    else:
        raise ValueError(f"Unsupported demographic model: {demographic_model}")

    # Set up the initial guess
    p_guess = moments.LD.Util.perturb_params(p_guess, fold=0.1) # type: ignore
    # opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
    #     p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, maxiter = 100, verbose = 3
    # )

    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
        p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, maxiter = 100, verbose = 3
    )

    physical_units = moments.LD.Util.rescale_params( # type: ignore
        opt_params, ["nu", "nu", "T", "m", "Ne"]
)

    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
    p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, verbose = 3
    )

    opt_params_dict = {}
    if demographic_model == "bottleneck_model":

        opt_params_dict = {
            # "N0": opt_params[4],
            "Nb": opt_params[0] * opt_params[4],
            "N_recover": opt_params[1] * opt_params[4],
            "t_bottleneck_start": (opt_params[2]+opt_params[3]) * 2 * opt_params[4],
            "t_bottleneck_end": opt_params[3] * 2 * opt_params[4]
        }

    elif demographic_model == "split_isolation_model":
        physical_units = moments.LD.Util.rescale_params( #type:ignore
            opt_params, ["nu", "nu", "T", "m", "Ne"]
        )

        print(physical_units)

        opt_params_dict = {
            "N1": physical_units[0],
            "N2": physical_units[1],
            "t_split": physical_units[2],
            "m": physical_units[3], 
            'Na': physical_units[4]
        }

        print("best fit parameters:")
        print(f"  N(deme0)         :  {physical_units[0]:.1f}")
        print(f"  N(deme1)         :  {physical_units[1]:.1f}")
        print(f"  Div. time (gen)  :  {physical_units[2]:.1f}")
        print(f"  Migration rate   :  {physical_units[3]:.6f}")
        print(f"  N(ancestral)     :  {physical_units[4]:.1f}")
    
    # print(f'Moments LD results: {opt_params_dict}')

    return opt_params_dict

In [27]:
p_guess = [0.01, 0.8, 0.075, 0.05, 10000]


In [28]:
from src.optimize import nlopt_LD

In [29]:
opt_params_dict = run_inference_momentsLD('split_isolation_model', p_guess, num_reps = 100)

parsing LD statistics
['DD_0_0', 'DD_0_1', 'DD_1_1', 'Dz_0_0_0', 'Dz_0_0_1', 'Dz_0_1_1', 'Dz_1_0_0', 'Dz_1_0_1', 'Dz_1_1_1', 'pi2_0_0_0_0', 'pi2_0_0_0_1', 'pi2_0_0_1_1', 'pi2_0_1_0_1', 'pi2_0_1_1_1', 'pi2_1_1_1_1']
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(15, 15)
1
pi2_0_0_0_0
(3, 3)
1
DD_0_0
3       , -2.12599e+13, array([ 0.0103613  ,  0.838742   ,  0.07231    ,  0.0501251  ,  9699.29    ])
6       , -2.12676e+13, array([ 0.0103613  ,  0.837904   ,  0.07231    ,  0.0501251  ,  9709       ])
9       , -1.01026e+09, array([ 0.0225289  ,  0.832143   ,  0.0390436  ,  0.0564767  ,  9217.17    ])
12      , -1.01048e+09, array([ 0.0225289  ,  0.831311   ,  0.0390436  ,  0.0564767  ,  9226.39    ])
15      , -1.00993e+09, array([ 0.0225295  ,  0.832142   ,  0.0390424  ,  0.0564767  ,  9217.15    ])
18      , -1.01014e+09, array

In [30]:
opt_params_dict

{'N1': 6973.89889385895,
 'N2': 1250.9904626904683,
 't_split': 1497.2395995457543,
 'm': 1.5856377608167838e-06,
 'Na': 16206.824667332314}

In [31]:
sampled_params

{'t_split': 1531, 'N1': 7269, 'N2': 1361, 'Na': 15802, 'm': 0}

# Dadi makes me sad :(

In [1]:
import os
import moments
from tqdm import tqdm
import numpy as np
import msprime
import dadi
import glob
import demes
import ray
import json
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

In [2]:
# Load in the experiment_config.json
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [3]:
def sample_params(lower_bound_params, upper_bound_params):
    sampled_params = {}
    for key in lower_bound_params:
        lower_bound = lower_bound_params[key]
        upper_bound = upper_bound_params[key]
        sampled_value = np.random.uniform(lower_bound, upper_bound)
        sampled_params[key] = int(sampled_value)


        # Check if the sampled parameter is equal to the mean of the uniform distribution
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            # Add a small random value to avoid exact mean, while keeping within bounds
            adjustment = np.random.uniform(-0.1 * (upper_bound - lower_bound), 0.1 * (upper_bound - lower_bound))
            adjusted_value = sampled_value + adjustment
            
            # Ensure the adjusted value is still within the bounds
            adjusted_value = max(min(adjusted_value, upper_bound), lower_bound)
            sampled_params[key] = int(adjusted_value)

    return sampled_params

In [None]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

In [5]:
def create_SFS(
    experiment_config, sampled_params, mode, num_samples, demographic_model, length=1e7, mutation_rate=5.7e-9, recombination_rate = 3.386e-9, **kwargs
):
    """
    If we are in pretraining mode we will use a simulated SFS. If we are in inference mode we will use a real SFS.

    """

    if mode == "pretrain":
        # Simulate the demographic model
        g = demographic_model(sampled_params)
        demog = msprime.Demography.from_demes(g)

        # Dynamically define the samples using msprime.SampleSet, based on the sample_sizes dictionary
        samples = [
            msprime.SampleSet(sample_size, population=pop_name, ploidy=1)
            for pop_name, sample_size in num_samples.items()
        ]

        # Simulate ancestry for two populations (joint simulation)
        ts = msprime.sim_ancestry(
            samples=samples,  # Two populations
            demography=demog,
            sequence_length=length,
            recombination_rate=recombination_rate,
            random_seed=experiment_config['seed'],
        )
        
        # Simulate mutations over the ancestry tree sequence
        ts = msprime.sim_mutations(ts, rate=mutation_rate)

        # Define sample sets dynamically for the SFS
        sample_sets = [
            ts.samples(population=pop.id) 
            for pop in ts.populations() 
            if len(ts.samples(population=pop.id)) > 0  # Exclude populations with no samples
        ]
        
        # Create the joint allele frequency spectrum
        sfs = ts.allele_frequency_spectrum(sample_sets=sample_sets, mode="site", polarised=True)
        
        # Multiply SFS by the sequence length to adjust scale
        sfs *= length

        # Convert to moments Spectrum for further use
        sfs = moments.Spectrum(sfs)
    
    elif mode == "inference":
        vcf_file = kwargs.get("vcf_file", None)
        pop_file = kwargs.get("pop_file", None)
        popname = kwargs.get("popname", None)

        if vcf_file is None or pop_file is None:
            raise ValueError(
                "vcf_file and pop_file must be provided in inference mode."
            )

        dd = dadi.Misc.make_data_dict_vcf(vcf_file, pop_file)
        sfs = dadi.Spectrum.from_data_dict(
            dd, [popname], projections=[2 * num_samples], polarized=True
        )

    return sfs

In [6]:
sfs = create_SFS(
    experiment_config,
      sampled_params,
        "pretrain",
          experiment_config["num_samples"],
            demographic_models.split_isolation_model_simulation,
              length=experiment_config['genome_length'],
                mutation_rate=experiment_config['mutation_rate'], recombination_rate = experiment_config['recombination_rate'])

In [None]:
sfs

In [8]:
from src.parameter_inference import run_inference_dadi

In [None]:
model_sfs_dadi, opt_theta_dadi, opt_params_dict_dadi, ll_list_dadi = (
        run_inference_dadi(
            sfs = sfs,
            p0= experiment_config['optimization_initial_guess'],
            lower_bound= experiment_config['lower_bound_optimization'],
            upper_bound= experiment_config['upper_bound_optimization'],
            num_samples=20,
            demographic_model=experiment_config['demographic_model'],
            mutation_rate=experiment_config['mutation_rate'],
            length=experiment_config['genome_length'],
            k  = experiment_config['k'], 
            top_values_k = experiment_config['top_values_k']
        )
    )