# MomentsLD makes me sad :(

I need to update this notebook to reflect the changes I made last week (or this week; I can't remember).

In [3]:
import os
import moments
import numpy as np
import msprime
import demes
import json
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

## Functions

I want to see what's going wrong with my MomentsLD specific scripts. I will copy and paste them here and will debug.

Functions in the preprocessing module

In [4]:
def sample_params(lower_bound_params, upper_bound_params):
    """Draw one value per parameter uniformly between its lower and upper bound.

    Parameters whose two bounds are both whole numbers are truncated to
    ``int``. Parameters with a fractional bound (for example a migration
    rate) are returned as ``float``: truncating those with ``int()`` turned
    every draw below 1 into 0.

    Args:
        lower_bound_params: Mapping of parameter name to lower bound.
        upper_bound_params: Mapping of parameter name to upper bound. Must
            contain every key of ``lower_bound_params``.

    Returns:
        dict mapping each parameter name to its sampled value.

    Raises:
        KeyError: If a key of ``lower_bound_params`` is missing from
            ``upper_bound_params``.
    """

    def _is_whole(bound):
        # True for ints and for floats such as 10000.0 (JSON often stores 1e4 as a float).
        return float(bound).is_integer()

    sampled_params = {}
    for key, lower_bound in lower_bound_params.items():
        upper_bound = upper_bound_params[key]
        keep_integer = _is_whole(lower_bound) and _is_whole(upper_bound)

        sampled_value = np.random.uniform(lower_bound, upper_bound)

        # If the raw draw lands exactly on the midpoint of the range, nudge it by
        # up to 10% of the range width and clamp it back inside the bounds.
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            spread = 0.1 * (upper_bound - lower_bound)
            sampled_value = sampled_value + np.random.uniform(-spread, spread)
            sampled_value = max(min(sampled_value, upper_bound), lower_bound)

        sampled_params[key] = int(sampled_value) if keep_integer else float(sampled_value)

    return sampled_params

In [5]:
def simulate_chromosome(experiment_config, sampled_params, num_samples, demographic_model, length=1e7, mutation_rate=5.7e-9, recombination_rate = 3.386e-9, **kwargs):
    """Simulate one chromosome (ancestry plus mutations) under a demographic model.

    Args:
        experiment_config: Mapping that must provide ``'seed'``. The same seed
            is used for the ancestry and the mutation simulation so the whole
            tree sequence is reproducible.
        sampled_params: Parameters handed unchanged to ``demographic_model``.
        num_samples: Mapping of population name to number of samples to draw.
        demographic_model: Callable turning ``sampled_params`` into a demes
            graph accepted by ``msprime.Demography.from_demes``.
        length: Sequence length passed to ``msprime.sim_ancestry``.
        mutation_rate: Rate passed to ``msprime.sim_mutations``.
        recombination_rate: Rate passed to ``msprime.sim_ancestry``.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        The tree sequence returned by ``msprime.sim_mutations``.
    """
    g = demographic_model(sampled_params)
    demog = msprime.Demography.from_demes(g)

    # One SampleSet per population; ploidy=1 makes each sample its own individual.
    samples = [
        msprime.SampleSet(sample_size, population=pop_name, ploidy=1)
        for pop_name, sample_size in num_samples.items()
    ]

    seed = experiment_config['seed']

    ts = msprime.sim_ancestry(
        samples=samples,
        demography=demog,
        sequence_length=length,
        recombination_rate=recombination_rate,
        random_seed=seed,
    )

    # Seed the mutation step too; otherwise repeated runs with the same config
    # give the same trees but different variants.
    ts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=seed)

    return ts

In [6]:
def generate_window(ts, window_length, n_samples):
    """Cut one randomly placed window out of a tree sequence.

    Args:
        ts: Object exposing ``keep_intervals`` (a tree sequence in this notebook).
        window_length: Length of the window to keep.
        n_samples: Despite its name, this is the total sequence length: the
            caller passes ``experiment_config['genome_length']``. The name is
            kept so existing keyword callers keep working.

    Returns:
        Whatever ``ts.keep_intervals([[start, end]])`` returns.

    Raises:
        ValueError: If the window is not positive or is longer than the sequence.
    """
    window_length = int(window_length)
    sequence_length = int(n_samples)
    if not 0 < window_length <= sequence_length:
        raise ValueError(
            f"window_length ({window_length}) must be positive and no longer "
            f"than the sequence length ({sequence_length})"
        )

    # randint's upper bound is exclusive, so add 1: the last valid start is
    # sequence_length - window_length (a window that ends exactly at the end).
    start = int(np.random.randint(0, sequence_length - window_length + 1))
    end = start + window_length
    return ts.keep_intervals([[start, end]])

In [7]:
def run_msprime_replicates(ts, experiment_config, window_number, folderpath):
    """Write one random genome window as a gzipped VCF plus a metadata file.

    Creates ``<folderpath>/window_<window_number>/`` containing
    ``window.<window_number>.vcf.gz`` and ``individual_file_metadata.txt``.
    The metadata file holds the path of the compressed VCF, i.e. the file
    that actually exists on disk afterwards.

    Args:
        ts: Tree sequence to cut the window from (passed to ``generate_window``).
        experiment_config: Mapping providing ``'window_length'`` and
            ``'genome_length'``.
        window_number: Index used in the directory and file names.
        folderpath: Parent directory for the per-window directory.
    """
    import gzip  # local import: only this function needs it

    folderpath = os.path.join(folderpath, f"window_{window_number}")
    os.makedirs(folderpath, exist_ok=True)

    window = generate_window(ts, experiment_config['window_length'], experiment_config['genome_length'])

    vcf_name = os.path.join(folderpath, f'window.{window_number}.vcf')
    compressed_vcf = f"{vcf_name}.gz"

    # Write the compressed file directly instead of shelling out to `gzip`:
    # no unquoted path in a shell command, and an existing .gz from a previous
    # run is overwritten rather than silently kept.
    with gzip.open(compressed_vcf, "wt") as fout:
        window.write_vcf(fout, allow_position_zero=True)

    # Record the compressed path; the uncompressed .vcf never exists on disk.
    metadata_file = os.path.join(folderpath, "individual_file_metadata.txt")
    with open(metadata_file, "w") as metafile:
        metafile.write(compressed_vcf)

In [8]:
def write_samples_and_rec_map(experiment_config, window_number, folderpath):
    """Write ``samples.txt`` and ``flat_map.txt`` into one window's directory.

    The directory ``<folderpath>/window_<window_number>`` must already exist.

    Args:
        experiment_config: Mapping providing ``'num_samples'`` (population name
            to sample count), ``'genome_length'`` and ``'recombination_rate'``.
        window_number: Index of the window directory to write into.
        folderpath: Parent directory of the per-window directories.
    """
    window_dir = os.path.join(folderpath, f"window_{window_number}")

    # Sample table: one "tsk_<i>" row per sample, numbered consecutively
    # across populations in the order the populations appear in the config.
    with open(os.path.join(window_dir, "samples.txt"), "w+") as samples_out:
        samples_out.write("sample\tpop\n")
        offset = 0
        for pop_name, sample_size in experiment_config['num_samples'].items():
            samples_out.writelines(
                f"tsk_{offset + k}\t{pop_name}\n" for k in range(sample_size)
            )
            offset += sample_size

    # Flat recombination map: two points, position 0 and the genome end, with
    # the end value computed as rate * length * 100.
    with open(os.path.join(window_dir, "flat_map.txt"), "w+") as map_out:
        map_out.write("pos\tMap(cM)\n")
        map_out.write("0\t0\n")
        genome_length = experiment_config['genome_length']
        map_end = experiment_config['recombination_rate'] * genome_length * 100
        map_out.write(f"{genome_length}\t{map_end}\n")


Functions in the demographic_models module

In [9]:
def split_isolation_model_simulation(sampled_params):
    """Build the demes graph for an ancestral population splitting into two.

    Deme ``"Na"`` exists until ``t_split``; demes ``"N1"`` and ``"N2"`` both
    descend from it and exchange migrants at rate ``m``.

    Args:
        sampled_params: Mapping with keys ``"Na"``, ``"N1"``, ``"N2"``, ``"m"``
            and ``"t_split"``.

    Returns:
        The resolved graph from ``demes.Builder.resolve()``.
    """
    ancestral_size = sampled_params["Na"]
    size_1 = sampled_params["N1"]
    size_2 = sampled_params["N2"]
    migration_rate = sampled_params["m"]
    split_time = sampled_params["t_split"]

    builder = demes.Builder()
    # The ancestral deme ends at the split time.
    builder.add_deme("Na", epochs=[{"start_size": ancestral_size, "end_time": split_time}])
    # Both daughter demes descend from the ancestral deme.
    for deme_name, deme_size in (("N1", size_1), ("N2", size_2)):
        builder.add_deme(deme_name, ancestors=["Na"], epochs=[{"start_size": deme_size}])
    builder.add_migration(demes=["N1", "N2"], rate=migration_rate)
    return builder.resolve()

Functions for the MomentsLD inference part

In [10]:
def get_LD_stats(vcf_file, r_bins, flat_map_path, pop_file_path, pops=("N1", "N2")):
    """Compute LD statistics for one VCF with ``moments.LD``.

    Args:
        vcf_file: Path of the VCF to parse.
        r_bins: Recombination-distance bin edges.
        flat_map_path: Path of the recombination map file.
        pop_file_path: Path of the sample-to-population file.
        pops: Population names to compute statistics for. Defaults to the two
            demes of the split-isolation model used in this notebook.

    Returns:
        The object returned by ``moments.LD.Parsing.compute_ld_statistics``.
    """
    ld_stats = moments.LD.Parsing.compute_ld_statistics(  # type: ignore
        vcf_file,
        rec_map_file=flat_map_path,
        pop_file=pop_file_path,
        pops=list(pops),
        r_bins=r_bins,
        report=False,
    )

    return ld_stats


In [11]:
def compute_ld_stats_sequential(flat_map_path, pop_file_path, metadata_path, r_bins):
    """Compute LD statistics, one VCF at a time, for every file listed in a metadata file.

    Args:
        flat_map_path: Recombination map path, passed to ``get_LD_stats``.
        pop_file_path: Sample-to-population file path, passed to ``get_LD_stats``.
        metadata_path: Text file with one VCF path per line; blank lines are ignored.
        r_bins: Recombination-distance bin edges, passed to ``get_LD_stats``.

    Returns:
        list with one ``get_LD_stats`` result per listed VCF, in file order.

    Raises:
        FileNotFoundError: If ``metadata_path`` does not exist. Previously
            this was only printed and the function then failed on an
            undefined variable.
    """
    print("=== Computing LD statistics sequentially ===")
    print(f"Looking for metadata file at: {metadata_path}")

    if not os.path.exists(metadata_path):
        raise FileNotFoundError(f"Metadata file not found at {metadata_path}")

    print(f"Metadata file found at {metadata_path}, proceeding to open it...")

    # Read errors propagate to the caller; swallowing them here only deferred
    # the failure to a less informative NameError.
    with open(metadata_path, 'r') as f:
        vcf_files = [line.strip() for line in f if line.strip()]

    return [
        get_LD_stats(vcf_file, r_bins, flat_map_path, pop_file_path)
        for vcf_file in vcf_files
    ]

In [12]:
def run_inference_momentsLD(ld_stats, demographic_model, p_guess, r_bins=None):
    """Fit a demographic model to LD statistics with ``moments.LD``.

    Args:
        ld_stats: LD statistics handed to ``moments.LD.Parsing.bootstrap_data``
            (in this notebook a dict of per-window results).
        demographic_model: ``"bottleneck_model"`` or ``"split_isolation_model"``.
        p_guess: Initial parameter vector for the optimizer. For the
            split-isolation model the fitted vector is rescaled as
            ``[nu, nu, T, m, Ne]``.
        r_bins: Recombination-distance bin edges used in the fit. Defaults to
            the bins used elsewhere in this notebook; they should match the
            bins the LD statistics were computed with.

    Returns:
        ``(opt_params_dict_list, ll_list)``: a one-element list holding the
        fitted parameters in physical units, and a one-element list holding
        the log-likelihood returned by the optimizer.

    Raises:
        ValueError: If ``demographic_model`` is not a supported name.
    """
    if r_bins is None:
        r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

    # Resolve the model first so an unsupported name fails before any work is done.
    if demographic_model == "bottleneck_model":
        demo_func = moments.LD.Demographics1D.three_epoch  # type: ignore
    elif demographic_model == "split_isolation_model":
        demo_func = demographic_models.split_isolation_model_momentsLD
    else:
        raise ValueError(f"Unsupported demographic model: {demographic_model}")

    # Means and variance-covariance matrices of the LD statistics.
    mv = moments.LD.Parsing.bootstrap_data(ld_stats)  # type: ignore

    opt_params, ll = moments.LD.Inference.optimize_log_lbfgsb(  # type: ignore
        p_guess, [mv["means"], mv["varcovs"]], [demo_func], rs=r_bins, verbose=3
    )

    if demographic_model == "bottleneck_model":
        # The last entry is used as the reference size: sizes are scaled by it
        # and times by twice it.
        # NOTE(review): assumes opt_params is ordered [nu1, nu2, T1, T2, Ne]; confirm.
        ref_size = opt_params[4]
        opt_params_dict = {
            "Nb": opt_params[0] * ref_size,
            "N_recover": opt_params[1] * ref_size,
            "t_bottleneck_start": (opt_params[2] + opt_params[3]) * 2 * ref_size,
            "t_bottleneck_end": opt_params[3] * 2 * ref_size,
        }
    else:
        physical_units = moments.LD.Util.rescale_params(  # type: ignore
            opt_params, ["nu", "nu", "T", "m", "Ne"]
        )

        print(physical_units)

        opt_params_dict = {
            "N1": physical_units[0],
            "N2": physical_units[1],
            "t_split": physical_units[2],
            "m": physical_units[3],
            'Na': physical_units[4]
        }

        print("best fit parameters:")
        print(f"  N(deme0)         :  {physical_units[0]:.1f}")
        print(f"  N(deme1)         :  {physical_units[1]:.1f}")
        print(f"  Div. time (gen)  :  {physical_units[2]:.1f}")
        print(f"  Migration rate   :  {physical_units[3]:.6f}")
        print(f"  N(ancestral)     :  {physical_units[4]:.1f}")

    # Both models report their fit; the bottleneck result used to be dropped,
    # leaving callers with an empty list.
    return [opt_params_dict], [ll]



## Driver

In [13]:
# Load in the experiment_config.json
# Keys read from it further down in this notebook include 'seed', 'num_samples',
# 'genome_length', 'window_length', 'num_windows', 'mutation_rate',
# 'recombination_rate', 'demographic_model' and the lower/upper parameter bounds.
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [14]:
# Load the previously sampled demographic parameters from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): this import would normally live in the import cell at the top.
import pickle
with open('/sietch_colab/akapoor/Demographic_Inference/sampled_params_2.pkl', 'rb') as f:
    sampled_params = pickle.load(f)

In [15]:
# Load the saved tree sequence (.trees file) instead of re-simulating it.
# NOTE(review): presumably produced with the parameters in sampled_params_2.pkl; confirm.
import tskit
ts = tskit.load("/sietch_colab/akapoor/Demographic_Inference/ts_sim_2.trees")

In [16]:
# sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

{'t_split': 1296, 'N1': 325, 'N2': 6832, 'Na': 16297, 'm': 0}


In [17]:
ts

Tree Sequence,Unnamed: 1
Trees,265380
Sequence Length,100000000.0
Time Units,generations
Sample Nodes,40
Total Size,56.8 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,859588,26.2 MiB,
Individuals,40,1.1 KiB,
Migrations,0,8 Bytes,
Mutations,327909,11.6 MiB,
Nodes,173995,4.6 MiB,
Populations,3,297 Bytes,✅
Provenances,2,2.9 KiB,
Sites,327260,7.8 MiB,


In [18]:
folderpath = f"/sietch_colab/akapoor/Demographic_Inference/testing_things/simulations/{experiment_config['demographic_model']}/"

In [19]:
demographic_model = "split_isolation_model"

In [20]:
# Initial guess for the optimizer in run_inference_momentsLD. For the split-isolation
# model the fitted vector is rescaled there with types ["nu", "nu", "T", "m", "Ne"].
p_guess = [0.02, 0.42, 0.075, 0.01, 10000]

Create the LD stats

In [21]:
# Load every pickled LD-statistics file in base_dir into a dict keyed by load order.
ld_stats = {}
base_dir = '/sietch_colab/akapoor/Demographic_Inference/sim_2'

# Sort the listing so the index -> file mapping is the same on every run
# (os.listdir order is arbitrary), and skip sub-directories such as
# .ipynb_checkpoints, which would make open() fail.
absolute_paths = [
    os.path.abspath(os.path.join(base_dir, f))
    for f in sorted(os.listdir(base_dir))
    if os.path.isfile(os.path.join(base_dir, f))
]

# NOTE: pickle.load can execute arbitrary code; base_dir must only hold trusted files.
for i, path in enumerate(absolute_paths):
    with open(path, 'rb') as f:
        ld_stats[i] = pickle.load(f)

In [22]:
p_guess

[0.02, 0.42, 0.075, 0.01, 10000]

In [24]:
opt_params_dict, ll_list = run_inference_momentsLD(ld_stats, demographic_model, p_guess)

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
357     , -9.57786e+10, array([ 0.02       ,  0.42042    ,  0.075      ,  0.01       ,  10000      ])
360     , -9.58562e+10, array([ 0.02       ,  0.42       ,  0.075      ,  0.01       ,  10010      ])
363     , -1.75719e+07, array([ 0.0396795  ,  0.433339   ,  0.0362608  ,  0.0100312  ,  9614.8     ])
366     , -1.75745e+07, array([ 0.0396795  ,  0.432906   ,  0.0362608  ,  0.0100312  ,  9624.41    ])
369     , -1.75582e+07, array([ 0.0396826  ,  0.433342   ,  0.0362579  ,  0.0100312  ,  9614.82    ])
372     , -1.75607e+07, array([ 0.0396826  ,  0.432908   ,  0.0362579  ,  0.0100312  ,  

In [25]:
ll_list

[58399.109329996245]

In [None]:
# run_inference_momentsLD returns a list of parameter dicts keyed by name
# ("N1", "N2", ...), so index the list only; a second [0] raises KeyError.
opt_params_dict[0]

In [None]:
sampled_params

## Doing stuff from scratch again

I suspect something could be wrong with the SLURM setup, which would cause these issues?

In [90]:
# This import rebinds split_isolation_model_simulation to the version in
# src.demographic_models, shadowing the copy defined earlier in this notebook.
from src.demographic_models import split_isolation_model_simulation

# demographic_model now holds the callable used for simulation; earlier and
# later cells bind the same name to the string "split_isolation_model".
demographic_model = split_isolation_model_simulation

In [93]:
ts = simulate_chromosome(experiment_config, sampled_params, experiment_config['num_samples'], demographic_model, length=experiment_config['genome_length'], mutation_rate=experiment_config['mutation_rate'], recombination_rate = experiment_config['recombination_rate'])

In [None]:
ts

In [None]:
from tqdm import tqdm

# For every window: write a gzipped VCF of a random window, then the matching
# samples.txt and flat_map.txt. run_msprime_replicates must run first because
# it creates the window directory that write_samples_and_rec_map writes into.
# The relative folder resolves against the directory set by os.chdir at the top.
for window_number in tqdm(range(experiment_config['num_windows'])):

    run_msprime_replicates(ts, experiment_config, window_number, folderpath = 'sampled_genome_windows')
    write_samples_and_rec_map(experiment_config, window_number, folderpath = 'sampled_genome_windows')

In [None]:
# Compute LD statistics per window and collect them in a dict keyed by window index.
ld_stats_all = {}

# Same bin edges as the array hard-coded in run_inference_momentsLD.
r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])

for i in tqdm(range(experiment_config['num_windows'])):

    # Absolute paths of the files written by the window-writing loop.
    vcf_file = f'/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/window_{i}/window.{i}.vcf.gz'
    flat_map_path = f'/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/window_{i}/flat_map.txt'
    pop_file_path = f'/sietch_colab/akapoor/Demographic_Inference/sampled_genome_windows/window_{i}/samples.txt'

    # Note: this rebinds the global ld_stats, which earlier held the dict of loaded pickles.
    ld_stats = get_LD_stats(vcf_file=vcf_file, r_bins=r_bins, flat_map_path=flat_map_path, pop_file_path=pop_file_path)
    ld_stats_all[i] = ld_stats

In [97]:
# Second initial guess for run_inference_momentsLD; for the split-isolation model
# the fitted vector is rescaled there with types ["nu", "nu", "T", "m", "Ne"].
p_guess = [0.2, 0.3, 0.08, 0.25, 10000]

In [101]:
from src.demographic_models import split_isolation_model_momentsLD
demographic_model = "split_isolation_model"

In [None]:
opt_params_dict_list, ll_list = run_inference_momentsLD(ld_stats_all, demographic_model, p_guess)

In [None]:
opt_params_dict_list[0]

In [None]:
sampled_params

In [None]:
experiment_config['upper_bound_params']

In [None]:
experiment_config['lower_bound_params']

# Dadi makes me sad :(

In [1]:
import os
import moments
from tqdm import tqdm
import numpy as np
import msprime
import dadi
import glob
import demes
import ray
import json
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
import src.demographic_models as demographic_models

In [2]:
# Load in the experiment_config.json
# The cells below read the parameter bounds, 'num_samples', 'genome_length',
# 'mutation_rate', 'recombination_rate', 'seed' and the optimizer settings from it.
with open("/sietch_colab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
    experiment_config = json.load(f)

In [3]:
def sample_params(lower_bound_params, upper_bound_params):
    """Draw one value per parameter uniformly between its lower and upper bound.

    Parameters whose two bounds are both whole numbers are truncated to
    ``int``. Parameters with a fractional bound (for example a migration
    rate) are returned as ``float``: truncating those with ``int()`` turned
    every draw below 1 into 0.

    Args:
        lower_bound_params: Mapping of parameter name to lower bound.
        upper_bound_params: Mapping of parameter name to upper bound. Must
            contain every key of ``lower_bound_params``.

    Returns:
        dict mapping each parameter name to its sampled value.

    Raises:
        KeyError: If a key of ``lower_bound_params`` is missing from
            ``upper_bound_params``.
    """

    def _is_whole(bound):
        # True for ints and for floats such as 10000.0 (JSON often stores 1e4 as a float).
        return float(bound).is_integer()

    sampled_params = {}
    for key, lower_bound in lower_bound_params.items():
        upper_bound = upper_bound_params[key]
        keep_integer = _is_whole(lower_bound) and _is_whole(upper_bound)

        sampled_value = np.random.uniform(lower_bound, upper_bound)

        # If the raw draw lands exactly on the midpoint of the range, nudge it by
        # up to 10% of the range width and clamp it back inside the bounds.
        mean_value = (lower_bound + upper_bound) / 2
        if sampled_value == mean_value:
            spread = 0.1 * (upper_bound - lower_bound)
            sampled_value = sampled_value + np.random.uniform(-spread, spread)
            sampled_value = max(min(sampled_value, upper_bound), lower_bound)

        sampled_params[key] = int(sampled_value) if keep_integer else float(sampled_value)

    return sampled_params

In [None]:
sampled_params = sample_params(experiment_config["lower_bound_params"], experiment_config["upper_bound_params"])
print(sampled_params)

In [5]:
def create_SFS(
    experiment_config, sampled_params, mode, num_samples, demographic_model, length=1e7, mutation_rate=5.7e-9, recombination_rate = 3.386e-9, **kwargs
):
    """Build a site frequency spectrum, either simulated or read from a VCF.

    Args:
        experiment_config: Mapping providing ``'seed'`` (used in pretrain mode
            for both the ancestry and the mutation simulation).
        sampled_params: Parameters handed to ``demographic_model`` (pretrain mode).
        mode: ``"pretrain"`` to simulate the SFS, ``"inference"`` to read it
            from a VCF.
        num_samples: In pretrain mode, a mapping of population name to sample
            count. In inference mode it is multiplied by 2 to form the
            projection size, so it must be a number there.
        demographic_model: Callable returning a demes graph (pretrain mode).
        length: Simulated sequence length (pretrain mode).
        mutation_rate: Mutation rate for the simulation (pretrain mode).
        recombination_rate: Recombination rate for the simulation (pretrain mode).
        **kwargs: Inference mode reads ``vcf_file``, ``pop_file`` and ``popname``.

    Returns:
        A ``moments.Spectrum`` in pretrain mode, or the spectrum returned by
        ``dadi.Spectrum.from_data_dict`` in inference mode.

    Raises:
        ValueError: If ``mode`` is not recognised, or if ``vcf_file`` or
            ``pop_file`` is missing in inference mode.
    """

    if mode == "pretrain":
        # Simulate the demographic model
        g = demographic_model(sampled_params)
        demog = msprime.Demography.from_demes(g)

        # One SampleSet per population; ploidy=1 makes each sample its own individual.
        samples = [
            msprime.SampleSet(sample_size, population=pop_name, ploidy=1)
            for pop_name, sample_size in num_samples.items()
        ]

        seed = experiment_config['seed']

        ts = msprime.sim_ancestry(
            samples=samples,
            demography=demog,
            sequence_length=length,
            recombination_rate=recombination_rate,
            random_seed=seed,
        )

        # Seed the mutation step as well so the simulated SFS is reproducible.
        ts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=seed)

        # Sample sets for the joint SFS; populations without samples are skipped.
        per_population = [ts.samples(population=pop.id) for pop in ts.populations()]
        sample_sets = [nodes for nodes in per_population if len(nodes) > 0]

        # span_normalise=False returns raw site counts directly, instead of
        # dividing by the sequence length and multiplying by `length` again.
        sfs = ts.allele_frequency_spectrum(
            sample_sets=sample_sets, mode="site", polarised=True, span_normalise=False
        )

        # Convert to moments Spectrum for further use
        sfs = moments.Spectrum(sfs)

    elif mode == "inference":
        vcf_file = kwargs.get("vcf_file", None)
        pop_file = kwargs.get("pop_file", None)
        popname = kwargs.get("popname", None)

        if vcf_file is None or pop_file is None:
            raise ValueError(
                "vcf_file and pop_file must be provided in inference mode."
            )

        # NOTE(review): popname is not validated; a missing value is passed on as [None].
        dd = dadi.Misc.make_data_dict_vcf(vcf_file, pop_file)
        sfs = dadi.Spectrum.from_data_dict(
            dd, [popname], projections=[2 * num_samples], polarized=True
        )

    else:
        # Without this branch an unknown mode reached `return sfs` with sfs unbound.
        raise ValueError(
            f"Unsupported mode: {mode!r}; expected 'pretrain' or 'inference'."
        )

    return sfs

In [6]:
# Simulate a joint SFS under the split-isolation model, taking sample sizes,
# genome length and rates from the experiment config.
sfs = create_SFS(
    experiment_config,
    sampled_params,
    "pretrain",
    experiment_config["num_samples"],
    demographic_models.split_isolation_model_simulation,
    length=experiment_config['genome_length'],
    mutation_rate=experiment_config['mutation_rate'],
    recombination_rate=experiment_config['recombination_rate'],
)

In [None]:
sfs

In [8]:
from src.parameter_inference import run_inference_dadi

In [None]:
# Fit the simulated SFS with the project's dadi inference routine, taking the
# starting point, optimizer bounds and model name from the experiment config.
model_sfs_dadi, opt_theta_dadi, opt_params_dict_dadi, ll_list_dadi = (
        run_inference_dadi(
            sfs = sfs,
            p0= experiment_config['optimization_initial_guess'],
            lower_bound= experiment_config['lower_bound_optimization'],
            upper_bound= experiment_config['upper_bound_optimization'],
            # NOTE(review): hard-coded 20, while the simulation uses experiment_config['num_samples'];
            # confirm what run_inference_dadi expects here.
            num_samples=20,
            demographic_model=experiment_config['demographic_model'],
            mutation_rate=experiment_config['mutation_rate'],
            length=experiment_config['genome_length'],
            k  = experiment_config['k'], 
            top_values_k = experiment_config['top_values_k']
        )
    )