# Debugging LD Stats

MomentsLD is destroying my liver

In [1]:
import os 
os.chdir('/projects/kernlab/akapoor/Demographic_Inference/')
from src.parameter_inference import get_LD_stats
import argparse
import numpy as np
import pickle
import json
import tskit
import os
import shutil

In [2]:
with open('/projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_0/window_0/ld_stats_window.0.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
data.keys()

dict_keys(['bins', 'sums', 'stats', 'pops'])

In [15]:
# Function to create LD statistics
def ld_stat_creation(vcf_filepath, flat_map_path, pop_file_path, sim_directory, sim_number, window_number):
    # Define recombination bins
    r_bins = np.array([0, 1e-6, 2e-6, 5e-6, 1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3])
    errors_to_retry = (IndexError, ValueError)  # Specific errors for retry logic

    try:
        print(f"Calculating LD stats for window {window_number}, sim {sim_number}")

        # Calculate LD stats
        ld_stats = get_LD_stats(vcf_filepath, r_bins, flat_map_path, pop_file_path)

        # Save LD stats to a file
        output_file = f"{sim_directory}/sim_{sim_number}/ld_stats_window.{window_number}.pkl"
        with open(output_file, "wb") as f:
            pickle.dump(ld_stats, f)

        print(f"LD stats successfully created for window {window_number}, sim {sim_number}")
    
    except errors_to_retry as e:
        print(f"Error encountered ({e}) for window {window_number}, sim {sim_number}. Regenerating the window...")

        # First delete the windows
        dir_path = f"/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_{sim_number}"
        print("========================================================")
        print(f'WINDOWS PATH TO REMOVE: {dir_path}')
        print("========================================================")

        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)  # Recursively delete the directory and its contents
            print(f"Deleted directory and all contents: {dir_path}")
        else:
            print(f"Directory not found: {dir_path}")
                    
        # Delete the LD stats that correspond to each window
        [os.remove(os.path.join(f"/projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_{sim_number}", f)) for f in os.listdir(f"/projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_{sim_number}") if os.path.isfile(os.path.join(f"/projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_{sim_number}", f))]
        
        from src.preprocess import Processor

        # Reload experiment configuration
        with open("/projects/kernlab/akapoor/Demographic_Inference/experiment_config.json", "r") as f:
            experiment_config = json.load(f)

        # Load the tree sequence
        ts_path = f"/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences/simulation_results/ts_sim_{sim_number}.trees"
        ts = tskit.load(ts_path)

        genome_window_dir = f"/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_{sim_number}"

        # Regenerate the window
        Processor.run_msprime_replicates(ts, experiment_config, window_number, genome_window_dir)
        Processor.write_samples_and_rec_map(experiment_config, window_number, genome_window_dir)

        # Update file paths for the regenerated window
        new_vcf_filepath = f"{genome_window_dir}/window_{window_number}/window.{window_number}.vcf.gz"
        new_flat_map_path = f"{genome_window_dir}/window_{window_number}/flat_map.txt"

        print(f"Retrying LD stats calculation for regenerated window {window_number}, sim {sim_number}")
        ld_stat_creation(new_vcf_filepath, new_flat_map_path, pop_file_path, sim_directory, sim_number, window_number)

    except Exception as e:
        print(f"Unexpected error: {e} for window {window_number}, sim {sim_number}. Type: {type(e)}")
        print(f"Failed to create LD stats for window {window_number}, sim {sim_number}.")


Load the files of interest (simulation 540)

In [16]:
sim_number = 540 
window_number = 79
flat_map_path = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/flat_map.txt'
pop_file_path = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/samples.txt'
vcf_filepath = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/window.79.vcf.gz'
sim_directory = '/projects/kernlab/akapoor/Demographic_Inference/LD_inferences'

In [17]:
ld_stat_creation(vcf_filepath, flat_map_path, pop_file_path, sim_directory, sim_number, window_number)

Calculating LD stats for window 79, sim 540
Unexpected error: [Errno 2] No such file or directory: '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/window.79.vcf.gz' for window 79, sim 540. Type: <class 'FileNotFoundError'>
Failed to create LD stats for window 79, sim 540.


In [18]:
def combine_ld_stats(LD_inferences_path, sim_number):

    # Print all the files in the directory to see what's there
    all_files = os.listdir(LD_inferences_path)
    
    # Use os.path.join to get the absolute paths for the files and filter for 'ld_stats' files
    ld_stats_files = [os.path.join(LD_inferences_path, f) for f in all_files if f.startswith('ld_stats_window') and f.endswith('.pkl')]

    ld_stats = {}
    
    # Load and combine all LD stats pickle files
    for ii, ld_stats_file in enumerate(ld_stats_files):
        print(f"Processing file: {ld_stats_file}")
        with open(ld_stats_file, 'rb') as f:
            file_stats = pickle.load(f)
            ld_stats[ii] = file_stats
    
    # Save the combined results
    combined_file_path = os.path.join('combined_LD_inferences', f'sim_{sim_number}', f'combined_LD_stats_sim_{sim_number}.pkl')
    with open(combined_file_path, 'wb') as f:
        pickle.dump(ld_stats, f)

In [19]:
LD_inferences_path = f'{sim_directory}/sim_{sim_number}'

In [15]:
combine_ld_stats(LD_inferences_path, sim_number)

Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.67.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.66.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.1.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.26.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.29.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.2.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.53.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.42.pkl
Processing file: /projects/kernlab/akapoor/Demographic_Inference/LD_inferences/sim_540/ld_stats_window.30.pkl
Processing f

In [None]:
import pickle
import json
import argparse
import ray
import numpy as np
import moments
import subprocess
import os
os.chdir('/projects/kernlab/akapoor/Demographic_Inference/')
from src.parameter_inference import run_inference_momentsLD
import os
import shutil

def cleanup_files(sim_directory, sim_number):
    simulation_results_directory = '/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences/simulation_results'
    genome_windows_directory = f'/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_{sim_number}'

    # Define files to delete
    files_to_delete = [
        f"{simulation_results_directory}/SFS_sim_{sim_number}.pkl",
        f"{simulation_results_directory}/ts_sim_{sim_number}.trees",
        f"{simulation_results_directory}/sampled_params_{sim_number}.pkl",
        f"{simulation_results_directory}/sampled_params_metadata_{sim_number}.txt"
    ]

    # Delete individual files
    for filepath in files_to_delete:
        if os.path.exists(filepath):
            os.remove(filepath)
            print(f"Deleted {filepath}")
        else:
            print(f"File not found, skipping deletion: {filepath}")

    # Delete genome windows directory
    if os.path.exists(genome_windows_directory):
        shutil.rmtree(genome_windows_directory)
        print(f"Deleted genome windows directory: {genome_windows_directory}")
    else:
        print(f"Genome windows directory not found: {genome_windows_directory}")

def obtain_feature(combined_ld_stats_path, sim_directory, sampled_params, experiment_config_filepath, sim_number):
    with open(experiment_config_filepath, "r") as f:
        experiment_config = json.load(f)

    with open(sampled_params, "rb") as f:
        sampled_params = pickle.load(f)

    with open(combined_ld_stats_path, "rb") as f:
        combined_ld_stats = pickle.load(f)

    mega_result_dict = {"simulated_params": sampled_params}

    p_guess = experiment_config["optimization_initial_guess"].copy()
    p_guess.extend([10000])
    p_guess = moments.LD.Util.perturb_params(p_guess, fold=0.1)

    def reoptimize():
        print("Attempting optimization...")
        opt_params_momentsLD, ll_list_momentsLD = run_inference_momentsLD(
            ld_stats=combined_ld_stats,
            demographic_model=experiment_config["demographic_model"],
            p_guess=p_guess
        )
        print("Optimization completed successfully.")
        return opt_params_momentsLD, ll_list_momentsLD

    try:
        opt_params_momentsLD, ll_list_momentsLD = reoptimize()
    except (np.linalg.LinAlgError, KeyError, RuntimeError) as e:
        print("======================================================================================")
        print(f"Error encountered: {e}. Attempting to resimulate for sim_number={sim_number}.")
        print("======================================================================================")

        # Clean up files before rerun
        cleanup_files(sim_directory, sim_number)

        ### Resimulate
        main(experiment_config=experiment_config_filepath, sim_directory = '/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences', sim_number = sim_number)

        ### Rerun genome window creation
        for window_number in range(experiment_config['num_windows']):
            main(tree_sequence_file = f'/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences/simulation_results/ts_sim_{sim_number}.trees',
             experiment_config_filepath = experiment_config_filepath,
              genome_sim_directory = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows',
               window_number = window_number,
                sim_number = sim_number)

            print(f"Genome window creation for window {window_number} rerun successfully.")

        ### Rerun the LD stats creation. This will be slow because I am not doing it on the cluster. 
        sim_directory_LD = '/projects/kernlab/akapoor/Demographic_Inference/LD_inferences'
        vcf_filepath = f'/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_{sim_number}/window_79/window.79.vcf.gz'

        flat_map_path = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/flat_map.txt'
pop_file_path = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/samples.txt'
vcf_filepath = '/projects/kernlab/akapoor/Demographic_Inference/sampled_genome_windows/sim_540/window_79/window.79.vcf.gz'



        ld_stat_creation(vcf_filepath, flat_map_path, pop_file_path, sim_directory, sim_number, window_number)
        

        # Retry optimization
        print("Retrying optimization after resimulation...")
        p_guess = moments.LD.Util.perturb_params(p_guess, fold=0.1)
        opt_params_momentsLD, ll_list_momentsLD = reoptimize()

    # Store results in dictionary and save to a pickle file
    momentsLD_results = {
        "opt_params_momentsLD": opt_params_momentsLD,
        "ll_all_replicates_momentsLD": ll_list_momentsLD,
    }
    mega_result_dict.update(momentsLD_results)

    output_path = f"{sim_directory}/momentsLD_inferences_sim_{sim_number}.pkl"
    with open(output_path, "wb") as f:
        pickle.dump(mega_result_dict, f)
    print(f"Results saved to {output_path}")

For rerunning the simulation: 

In [27]:
from snakemake_scripts.single_simulation import main

In [28]:
combined_ld_stats_path = f'/projects/kernlab/akapoor/Demographic_Inference/combined_LD_inferences/sim_{sim_number}/combined_LD_stats_sim_{sim_number}.pkl'
experiment_config_filepath = '/projects/kernlab/akapoor/Demographic_Inference/experiment_config.json'
sim_directory = '/projects/kernlab/akapoor/Demographic_Inference/final_LD_inferences'
sampled_params = '/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences/simulation_results/sampled_params_540.pkl'

In [31]:
obtain_feature(combined_ld_stats_path, sim_directory, sampled_params, experiment_config_filepath, sim_number)

Attempting optimization...
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
291     , -8.59008e+16, array([ 0.111801   ,  0.840376   ,  0.0478915  ,  0.0202475  ,  9548.35    ])
294     , -8.57973e+16, array([ 0.11169    ,  0.840376   ,  0.0478915  ,  0.0202677  ,  9548.35    ])
297     , -2.5138e+16 , array([ 0.0685996  ,  0.632511   ,  0.109283   ,  0.0202167  ,  9619.37    ])
300     , -2.51769e+16, array([ 0.0685311  ,  0.632511   ,  0.109283   ,  0.020237   ,  9619.37    ])
303     , -8.69207e+14, array([ 0.0784145  ,  0.774842   ,  0.0798897  ,  0.0201724  ,  9673.22    ])
306     , -8.57933e+14, array([ 0.0783362  ,  0.774842   ,  0

TypeError: main() got an unexpected keyword argument 'tree_sequence_file'

In [30]:
# Temporary
main(experiment_config=experiment_config_filepath, sim_directory = '/projects/kernlab/akapoor/Demographic_Inference/simulated_parameters_and_inferences', sim_number = sim_number)

BEGINNING THE PROCESS OF SIMULATING THE CHROMOSOME
