# MomentsLD makes me sad :(

In [1]:
import os
import moments
from tqdm import tqdm
import numpy as np
import msprime
import dadi
import glob
import demes
import ray
import json
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

## Functions

I want to see what's going wrong with my MomentsLD specific scripts. I will copy and paste them here and will debug.

Functions in the preprocessing module

In [2]:
def sample_params(lower_bound_params, upper_bound_params):
    sampled_params = {}
    for key in lower_bound_params:
        lower_bound = lower_bound_params[key]
        upper_bound = upper_bound_params[key]
        sampled_value = np.random.uniform(lower_bound, upper_bound)
        sampled_params[key] = int(sampled_value)


        # Check if the sampled parameter is equal to the mean of the uniform distribution
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            # Add a small random value to avoid exact mean, while keeping within bounds
            adjustment = np.random.uniform(-0.1 * (upper_bound - lower_bound), 0.1 * (upper_bound - lower_bound))
            adjusted_value = sampled_value + adjustment
            
            # Ensure the adjusted value is still within the bounds
            adjusted_value = max(min(adjusted_value, upper_bound), lower_bound)
            sampled_params[key] = int(adjusted_value)

    return sampled_params

In [3]:
def run_msprime_replicates(sampled_params, experiment_config, folderpath):

    if experiment_config["demographic_model"] == "bottleneck_model":
        demographic_model = demographic_models.bottleneck_model

    elif experiment_config["demographic_model"] == "split_isolation_model":
        demographic_model = demographic_models.split_isolation_model_simulation

    else:
        raise ValueError(f"Unsupported demographic model: {experiment_config['demographic_model']}")

    g = demographic_model(sampled_params)
    demog = msprime.Demography.from_demes(g)

    # Create directory for storing VCFs
    output_folder = folderpath
    os.makedirs(output_folder, exist_ok=True)

    print(f"Samples: {experiment_config['num_samples']}")
    
    tree_sequences = msprime.sim_ancestry(
        samples = experiment_config['num_samples'],
        # {"N1": experiment_config['num_samples']['N1'], "N2": experiment_config['num_samples']['N2']},
        demography=demog,
        sequence_length=experiment_config['genome_length'],
        recombination_rate=experiment_config['recombination_rate'],
        num_replicates=experiment_config['num_reps'],
        random_seed=experiment_config['seed'],
    )

    # List to store file paths of the generated VCFs
    vcf_filepaths = []

    for ii, ts in enumerate(tree_sequences):
        ts = msprime.sim_mutations(ts, rate=experiment_config['mutation_rate'], random_seed=ii + 1)
        vcf_name = os.path.join(output_folder, f'rep.{ii}.vcf')
        with open(vcf_name, "w+") as fout:
            ts.write_vcf(fout, allow_position_zero=True)
        os.system(f"gzip {vcf_name}")

        # Store the compressed VCF file path
        vcf_filepaths.append(f"{vcf_name}.gz")

    # Write the metadata file with all VCF file paths
    metadata_file = os.path.join(output_folder, "metadata.txt")
    with open(metadata_file, "w+") as metafile:
        metafile.write("\n".join(vcf_filepaths))

    print(f"Metadata file written to {metadata_file}")

In [4]:
def write_samples_and_rec_map(experiment_config, folderpath):

    # Define the file paths
    samples_file = os.path.join(folderpath, f"samples.txt")
    flat_map_file = os.path.join(folderpath, f"flat_map.txt")

    print(f'Samples filepath: {samples_file}')
    print(f'Flat map filepath: {flat_map_file}')

    # Open and write the sample file
    with open(samples_file, "w+") as fout:
        fout.write("sample\tpop\n")

        # Dynamically define samples based on the num_samples dictionary
        sample_idx = 0  # Initialize sample index
        for pop_name, sample_size in experiment_config['num_samples'].items():
            for _ in range(sample_size):
                fout.write(f"tsk_{sample_idx}\t{pop_name}\n")
                sample_idx += 1

    # Write the recombination map file
    with open(flat_map_file, "w+") as fout:
        fout.write("pos\tMap(cM)\n")
        fout.write("0\t0\n")
        fout.write(f"{experiment_config['genome_length']}\t{experiment_config['recombination_rate'] * experiment_config['genome_length'] * 100}\n")

Functions in the demographic_models module

In [5]:
def split_isolation_model_simulation(sampled_params):

    # Unpack the sampled parameters
    Na, N1, N2, m, t_split = (
        sampled_params["Na"],  # Effective population size of the ancestral population
        sampled_params["N1"],  # Size of population 1 after split
        sampled_params["N2"],  # Size of population 2 after split
        sampled_params["m"],   # Migration rate between populations
        sampled_params["t_split"],  # Time of the population split (in generations)
    )

    b = demes.Builder()
    b.add_deme("Na", epochs=[dict(start_size=Na, end_time=t_split)])
    b.add_deme("N1", ancestors=["Na"], epochs=[dict(start_size=N1)])
    b.add_deme("N2", ancestors=["Na"], epochs=[dict(start_size=N2)])
    b.add_migration(demes=["N1", "N2"], rate=m)
    g = b.resolve()
    return g

Functions for the MomentsLD inference part

In [6]:
# Define your function with Ray's remote decorator
@ray.remote
def get_LD_stats(vcf_file, r_bins, flat_map_path, pop_file_path):
    ray.init(ignore_reinit_error=True)
    ld_stats = moments.LD.Parsing.compute_ld_statistics( #type:ignore
        vcf_file,
        rec_map_file=flat_map_path,
        pop_file=pop_file_path,
        pops=["N1", "N2"], # TODO: Change later
        r_bins=r_bins,
        report=False,
    )

    return ld_stats


In [7]:
def compute_ld_stats_parallel(folderpath, num_reps, r_bins):
    
    flat_map_path = os.path.join(folderpath, "flat_map.txt")
    pop_file_path = os.path.join(folderpath, "samples.txt")
    vcf_files = [
        os.path.join(folderpath, f"rep.{rep_ii}.vcf.gz")
        for rep_ii in range(num_reps)
    ]

    # Launch the tasks in parallel using Ray
    futures = [
        get_LD_stats.remote(vcf_file, r_bins, flat_map_path, pop_file_path)
        for vcf_file in vcf_files
    ]

    # Wait for all the tasks to complete and retrieve results
    results = ray.get(futures)
    return results

In [8]:
def run_inference_momentsLD(folderpath, demographic_model, p_guess, num_reps):
    """
    This should do the parameter inference for momentsLD
    index: unique simulation number
    """

    r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

    print("parsing LD statistics")


    ld_stats = {}
    results = compute_ld_stats_parallel(folderpath, num_reps, r_bins)

    for i, result in enumerate(results):
        ld_stats[i] = result

    # print("computing mean and varcov matrix from LD statistics sums")
    mv = moments.LD.Parsing.bootstrap_data(ld_stats)  # type: ignore
    # print("SHAPE OF THE COVARIANCE MATRIX")
    # print(mv["varcovs"][-1].shape)
    # mv["varcovs"][-1].shape = (1, 1)

    if demographic_model == "bottleneck_model":
        demo_func = moments.LD.Demographics1D.three_epoch # type: ignore

    elif demographic_model == "split_isolation_model":
        demo_func = demographic_models.split_isolation_model_momentsLD

    else:
        raise ValueError(f"Unsupported demographic model: {demographic_model}")

    # Set up the initial guess
    p_guess = moments.LD.Util.perturb_params(p_guess, fold=0.1) # type: ignore
    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
        p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, maxiter = 100, verbose = 3
    )

    physical_units = moments.LD.Util.rescale_params( # type: ignore
        opt_params, ["nu", "nu", "T", "m", "Ne"]
)

    opt_params, LL = moments.LD.Inference.optimize_log_lbfgsb( #type:ignore
    p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, verbose = 3
    )

    opt_params_dict = {}
    if demographic_model == "bottleneck_model":

        opt_params_dict = {
            # "N0": opt_params[4],
            "Nb": opt_params[0] * opt_params[4],
            "N_recover": opt_params[1] * opt_params[4],
            "t_bottleneck_start": (opt_params[2]+opt_params[3]) * 2 * opt_params[4],
            "t_bottleneck_end": opt_params[3] * 2 * opt_params[4]
        }

    elif demographic_model == "split_isolation_model":
        physical_units = moments.LD.Util.rescale_params( #type:ignore
            opt_params, ["nu", "nu", "T", "m", "Ne"]
        )

        print(physical_units)

        opt_params_dict = {
            "N1": physical_units[0],
            "N2": physical_units[1],
            "t_split": physical_units[2],
            "m": physical_units[3], 
            'Na': physical_units[4]
        }

        print("best fit parameters:")
        print(f"  N(deme0)         :  {physical_units[0]:.1f}")
        print(f"  N(deme1)         :  {physical_units[1]:.1f}")
        print(f"  Div. time (gen)  :  {physical_units[2]:.1f}")
        print(f"  Migration rate   :  {physical_units[3]:.6f}")
        print(f"  N(ancestral)     :  {physical_units[4]:.1f}")
    
    # print(f'Moments LD results: {opt_params_dict}')

    return opt_params_dict


## Driver

In [9]:
# Load in the experiment_config.json
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [10]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

{'t_split': 1611, 'N1': 1813, 'N2': 1423, 'Na': 12796, 'm': 0}


In [11]:
folderpath = f"/sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/{experiment_config['demographic_model']}/"

In [12]:
run_msprime_replicates(sampled_params, experiment_config, folderpath)

Samples: {'N1': 20, 'N2': 20}
Metadata file written to /sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/split_isolation_model/metadata.txt


In [13]:
write_samples_and_rec_map(experiment_config, folderpath)

Samples filepath: /sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/split_isolation_model/samples.txt
Flat map filepath: /sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/split_isolation_model/flat_map.txt


In [14]:
demographic_model = "split_isolation_model"

In [16]:
p_guess = p_guess = [0.01, 0.8, 0.075, 0.05, 10000]
num_reps = 100

In [17]:
opt_params_dict = run_inference_momentsLD(folderpath, demographic_model, p_guess, num_reps)

parsing LD statistics


2024-10-20 20:48:59,965	INFO worker.py:1781 -- Started a local Ray instance.
[36m(get_LD_stats pid=1480715)[0m Calling ray.init() again after it has already been called.


3       , -3.69906e+11, array([ 0.00972849 ,  0.789578   ,  0.0718179  ,  0.0468688  ,  10706.4    ])
6       , -3.69943e+11, array([ 0.00972849 ,  0.788789   ,  0.0718179  ,  0.0468688  ,  10717.1    ])
9       , -1.75582e+07, array([ 0.0215045  ,  0.768531   ,  0.0396941  ,  0.0533926  ,  10293.6    ])
12      , -1.75592e+07, array([ 0.0215045  ,  0.767763   ,  0.0396941  ,  0.0533926  ,  10303.9    ])
15      , -1.75525e+07, array([ 0.0215052  ,  0.768531   ,  0.0396929  ,  0.0533926  ,  10293.6    ])
18      , -1.75535e+07, array([ 0.0215052  ,  0.767762   ,  0.0396929  ,  0.0533926  ,  10303.9    ])
21      , -1.75299e+07, array([ 0.0215079  ,  0.768527   ,  0.0396881  ,  0.0533926  ,  10293.5    ])
24      , -1.7531e+07 , array([ 0.0215079  ,  0.767759   ,  0.0396881  ,  0.0533926  ,  10303.8    ])
27      , -1.744e+07  , array([ 0.0215185  ,  0.768514   ,  0.0396689  ,  0.0533927  ,  10293.3    ])
30      , -1.7441e+07 , array([ 0.0215185  ,  0.767746   ,  0.0396689  ,  0.053392

[36m(pid=1475411)[0m [2024-10-20 21:34:27,441 E 1475411 1485928] gcs_rpc_client.h:653: Failed to connect to GCS within 60 seconds. GCS may have been killed. It's either GCS is terminated by `ray stop` or is killed unexpectedly. If it is killed unexpectedly, see the log file gcs_server.out. https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#logging-directory-structure. The program will terminate.
[36m(pid=gcs_server)[0m E1020 21:36:20.844855780 1474957 chttp2_transport.cc:2890]             keepalive_ping_end state error: 0 (expect: 1)


In [18]:
opt_params_dict

{'N1': 1829.695604190187,
 'N2': 1395.01610945435,
 't_split': 1627.4492700196256,
 'm': 1.9368601263622553e-06,
 'Na': 12934.796930862876}

In [19]:
sampled_params

{'t_split': 1611, 'N1': 1813, 'N2': 1423, 'Na': 12796, 'm': 0}

# Dadi makes me sad :(

In [1]:
import os
import moments
from tqdm import tqdm
import numpy as np
import msprime
import dadi
import glob
import demes
import ray
import json
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

In [2]:
# Load in the experiment_config.json
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [3]:
def sample_params(lower_bound_params, upper_bound_params):
    sampled_params = {}
    for key in lower_bound_params:
        lower_bound = lower_bound_params[key]
        upper_bound = upper_bound_params[key]
        sampled_value = np.random.uniform(lower_bound, upper_bound)
        sampled_params[key] = int(sampled_value)


        # Check if the sampled parameter is equal to the mean of the uniform distribution
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            # Add a small random value to avoid exact mean, while keeping within bounds
            adjustment = np.random.uniform(-0.1 * (upper_bound - lower_bound), 0.1 * (upper_bound - lower_bound))
            adjusted_value = sampled_value + adjustment
            
            # Ensure the adjusted value is still within the bounds
            adjusted_value = max(min(adjusted_value, upper_bound), lower_bound)
            sampled_params[key] = int(adjusted_value)

    return sampled_params

In [4]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

{'t_split': 4782, 'N1': 5104, 'N2': 6980, 'Na': 18976, 'm': 0}


In [5]:
def create_SFS(
    experiment_config, sampled_params, mode, num_samples, demographic_model, length=1e7, mutation_rate=5.7e-9, recombination_rate = 3.386e-9, **kwargs
):
    """
    If we are in pretraining mode we will use a simulated SFS. If we are in inference mode we will use a real SFS.

    """

    if mode == "pretrain":
        # Simulate the demographic model
        g = demographic_model(sampled_params)
        demog = msprime.Demography.from_demes(g)

        # Dynamically define the samples using msprime.SampleSet, based on the sample_sizes dictionary
        samples = [
            msprime.SampleSet(sample_size, population=pop_name, ploidy=1)
            for pop_name, sample_size in num_samples.items()
        ]

        # Simulate ancestry for two populations (joint simulation)
        ts = msprime.sim_ancestry(
            samples=samples,  # Two populations
            demography=demog,
            sequence_length=length,
            recombination_rate=recombination_rate,
            random_seed=experiment_config['seed'],
        )
        
        # Simulate mutations over the ancestry tree sequence
        ts = msprime.sim_mutations(ts, rate=mutation_rate)

        # Define sample sets dynamically for the SFS
        sample_sets = [
            ts.samples(population=pop.id) 
            for pop in ts.populations() 
            if len(ts.samples(population=pop.id)) > 0  # Exclude populations with no samples
        ]
        
        # Create the joint allele frequency spectrum
        sfs = ts.allele_frequency_spectrum(sample_sets=sample_sets, mode="site", polarised=True)
        
        # Multiply SFS by the sequence length to adjust scale
        sfs *= length

        # Convert to moments Spectrum for further use
        sfs = moments.Spectrum(sfs)
    
    elif mode == "inference":
        vcf_file = kwargs.get("vcf_file", None)
        pop_file = kwargs.get("pop_file", None)
        popname = kwargs.get("popname", None)

        if vcf_file is None or pop_file is None:
            raise ValueError(
                "vcf_file and pop_file must be provided in inference mode."
            )

        dd = dadi.Misc.make_data_dict_vcf(vcf_file, pop_file)
        sfs = dadi.Spectrum.from_data_dict(
            dd, [popname], projections=[2 * num_samples], polarized=True
        )

    return sfs

In [6]:
sfs = create_SFS(
    experiment_config,
      sampled_params,
        "pretrain",
          experiment_config["num_samples"],
            demographic_models.split_isolation_model_simulation,
              length=experiment_config['genome_length'],
                mutation_rate=experiment_config['mutation_rate'], recombination_rate = experiment_config['recombination_rate'])

In [7]:
sfs

Spectrum([[-- 447.0 203.0 152.0 132.0 79.0 72.0 48.0 42.0 32.0 34.0 24.0 18.0 19.0
  37.0 23.0 1.0 3.0 11.0 8.0 1.0]
 [356.0 7.0 12.0 28.0 1.0 12.0 11.0 19.0 4.0 3.0 10.0 5.0 0.0 5.0 2.0 0.0
  0.0 3.0 3.0 1.0 1.0]
 [210.0 11.0 15.0 14.0 6.0 4.0 14.0 2.0 16.0 6.0 2.0 4.0 3.0 9.0 3.0 4.0
  1.0 0.0 3.0 5.0 0.0]
 [115.0 6.0 7.0 7.0 9.0 10.0 4.0 5.0 8.0 1.0 1.0 6.0 5.0 0.0 6.0 2.0 1.0
  0.0 4.0 2.0 1.0]
 [84.0 4.0 2.0 2.0 15.0 6.0 0.0 2.0 7.0 3.0 8.0 7.0 5.0 2.0 6.0 1.0 0.0
  1.0 2.0 5.0 1.0]
 [59.0 1.0 1.0 3.0 4.0 3.0 6.0 1.0 1.0 0.0 1.0 7.0 0.0 0.0 1.0 2.0 2.0
  1.0 0.0 2.0 4.0]
 [59.0 3.0 3.0 6.0 3.0 8.0 6.0 2.0 1.0 3.0 2.0 1.0 7.0 3.0 2.0 1.0 2.0
  1.0 0.0 1.0 5.0]
 [31.0 5.0 0.0 6.0 2.0 7.0 3.0 3.0 2.0 4.0 4.0 2.0 1.0 5.0 5.0 6.0 0.0
  0.0 0.0 1.0 0.0]
 [42.0 11.0 1.0 2.0 2.0 2.0 6.0 5.0 2.0 1.0 4.0 1.0 0.0 3.0 1.0 3.0 3.0
  2.0 3.0 0.0 1.0]
 [39.0 5.0 2.0 8.0 5.0 2.0 0.0 0.0 4.0 1.0 2.0 3.0 0.0 0.0 5.0 0.0 5.0
  0.0 0.0 1.0 16.0]
 [45.0 4.0 1.0 10.0 3.0 6.0 9.0 1.0 1.0 4.0 13.0 5.0 0.

In [8]:
from src.parameter_inference import run_inference_dadi

In [9]:
model_sfs_dadi, opt_theta_dadi, opt_params_dict_dadi, ll_list_dadi = (
        run_inference_dadi(
            sfs = sfs,
            p0= experiment_config['optimization_initial_guess'],
            lower_bound= experiment_config['lower_bound_optimization'],
            upper_bound= experiment_config['upper_bound_optimization'],
            num_samples=20,
            demographic_model=experiment_config['demographic_model'],
            mutation_rate=experiment_config['mutation_rate'],
            length=experiment_config['genome_length'],
            k  = experiment_config['k'], 
            top_values_k = experiment_config['top_values_k']
        )
    )

3       , -6934.29    , array([ 0.0147655  ,  1.09256    ,  0.047487   ,  0.02723    ])
6       , -14927.7    , array([ 0.00376638 ,  0.624363   ,  0.047487   ,  0.02723    ])
9       , -6771.8     , array([ 0.0147655  ,  0.624363   ,  0.047487   ,  0.00680751 ])
OPT DADI PARAMETER: [0.01476551 0.62436314 0.01194675 0.02723004]
12      , -9353.06    , array([ 0.0173614  ,  1.42488    ,  0.0940947  ,  0.0530843  ])
15      , -11356.1    , array([ 0.00996368 ,  1.42488    ,  0.0940947  ,  0.0928976  ])
18      , -6053.52    , array([ 0.00996368 ,  1.42488    ,  0.0235987  ,  0.0530843  ])
OPT DADI PARAMETER: [0.00996368 1.4248795  0.02359867 0.05308435]
21      , -12150.6    , array([ 0.0124891  ,  1.08654    ,  0.129034   ,  0.0302475  ])
24      , -12227.6    , array([ 0.0124891  ,  1.08654    ,  0.225735   ,  0.0302475  ])
27      , -11565.7    , array([ 0.0124891  ,  0.271709   ,  0.129034   ,  0.0302475  ])
30      , -10542      , array([ 0.0205984  ,  0.944363   ,  0.0001     ,  0.