In [27]:
import pickle 
with open('/sietch_colab/akapoor/Demographic_Inference/split_isolation_model_seed_42/sims/sims_pretrain_20_sims_inference_1_seed_42_num_replicates_2_top_values_1/simulation_results/software_inferences_sim_0.pkl', 'rb') as f:
    software_inf_0 = pickle.load(f)

In [28]:
software_inf_0.keys()

dict_keys(['simulated_params', 'sfs', 'model_sfs_moments', 'opt_theta_moments', 'opt_params_moments', 'll_all_replicates_moments', 'opt_params_momentsLD', 'll_all_replicates_momentsLD'])

In [29]:
software_inf_0['opt_params_moments']

[{'Na': 14647.263163819067,
  'N1': 220.44436949177117,
  'N2': 11447.979197448301,
  't_split': 1284.1001672388359,
  'm': 267.53826508165463}]

In [30]:
software_inf_0['simulated_params']

{'t_split': 753, 'N1': 381, 'N2': 6214, 'Na': 15441, 'm': 0}

In [31]:
software_inf_0['opt_params_momentsLD']

[{'N1': 378.9628435128196,
  'N2': 6112.499524461679,
  't_split': 729.8327979756792,
  'm': 5.982285028854961e-06,
  'Na': 15429.015504446295}]

In [32]:
import pandas as pd
import pickle
import json

# Assuming you have a list of file paths for the pickle files
software_inferences_file_list = [f'/sietch_colab/akapoor/Demographic_Inference/split_isolation_model_seed_42/sims/sims_pretrain_20_sims_inference_1_seed_42_num_replicates_2_top_values_1/simulation_results/software_inferences_sim_{i}.pkl' for i in range(20)]

# Load the experiment configuration from the JSON file
with open('/sietch_colab/akapoor/Demographic_Inference/experiment_config.json', 'r') as config_file:
    experiment_config = json.load(config_file)

# Initialize empty lists to store the rows for both data and target parameters
data = []
targets = []

# Loop over each pickle file
for file in software_inferences_file_list:
    with open(file, 'rb') as f:
        software_inf = pickle.load(f)

        # Initialize a dictionary for this row (for opt_params)
        row = {}

        # Extracting parameters based on enabled analyses in the config file
        # "Connect" to dadi_analysis if it's set to True in the config
        if experiment_config['dadi_analysis']:
            for params_dict in software_inf.get('opt_params_dadi', []):
                row.update({f'opt_params_dadi_{key}': value for key, value in params_dict.items()})

        # "Connect" to moments_analysis if it's set to True in the config
        if experiment_config['moments_analysis']:
            for params_dict in software_inf.get('opt_params_moments', []):
                row.update({f'opt_params_moments_{key}': value for key, value in params_dict.items()})

        # "Connect" to momentsLD_analysis if it's set to True in the config
        if experiment_config['momentsLD_analysis']:
            for params_dict in software_inf.get('opt_params_momentsLD', []):
                row.update({f'opt_params_momentsLD_{key}': value for key, value in params_dict.items()})

        # Add the parameter row to the data list
        data.append(row)

        # Extract the simulated_params (targets)
        simulated_params = software_inf.get('simulated_params', {})
        target_row = {f'simulated_params_{key}': value for key, value in simulated_params.items()}

        # Add the target row to the targets list
        targets.append(target_row)

# Create DataFrames from the data lists
df = pd.DataFrame(data)
targets_df = pd.DataFrame(targets)

In [33]:
df.head()

Unnamed: 0,opt_params_moments_Na,opt_params_moments_N1,opt_params_moments_N2,opt_params_moments_t_split,opt_params_moments_m,opt_params_momentsLD_N1,opt_params_momentsLD_N2,opt_params_momentsLD_t_split,opt_params_momentsLD_m,opt_params_momentsLD_Na
0,14647.263164,220.444369,11447.979197,1284.100167,267.538265,378.962844,6112.499524,729.832798,5.982285e-06,15429.015504
1,13748.79639,144.598936,18598.815462,2176.475567,364.669866,5745.622462,4267.551026,4824.886292,3.421451e-07,14848.239808
2,14268.47281,77.495965,10704.824702,1691.588266,151.187833,4664.806447,6571.845982,2620.703564,4.573394e-07,11665.079576
3,13866.875442,187.548178,9078.729357,1699.645147,407.602158,5594.004151,1036.587329,4777.387124,7.579393e-08,18439.386603
4,21653.941656,142.190352,19533.834504,4596.362367,784.600172,6049.131651,6284.928248,213.753241,3.716031e-07,13425.382623


In [34]:
targets_df.head()

Unnamed: 0,simulated_params_t_split,simulated_params_N1,simulated_params_N2,simulated_params_Na,simulated_params_m
0,753,381,6214,15441,0
1,4665,5641,4165,14808,0
2,2583,4507,6593,10945,0
3,4742,5747,1044,18711,0
4,129,6545,8478,18125,0


In [55]:
import numpy as np
import os

# Change directory
os.chdir('/sietch_colab/akapoor/Demographic_Inference')
from src.preprocess import Processor

# Simulation directory
sim_directory = '/sietch_colab/akapoor/Demographic_Inference/test'
os.makedirs(sim_directory, exist_ok=True)

# Split the data into training and validation sets
all_indices = np.arange(experiment_config["num_sims_pretrain"])
np.random.shuffle(all_indices)
training_indices = all_indices[:int(experiment_config["training_percentage"] * experiment_config["num_sims_pretrain"])]
validation_indices = all_indices[int(experiment_config["training_percentage"] * experiment_config["num_sims_pretrain"]):]

# Initialize processor
processor = Processor(
    experiment_config,
    experiment_directory=sim_directory,
    recombination_rate=experiment_config["recombination_rate"],
    mutation_rate=experiment_config["mutation_rate"],
)

# Initialize the preprocessing results object
preprocessing_results_obj = {stage: {} for stage in ["training", "validation"]}

# Initialize empty lists to store DataFrames for features and targets
all_features_df = []
all_targets_df = []

# Separate loop to process each stage (training/validation)
for stage, indices in [
    ("training", training_indices),
    ("validation", validation_indices)
]:
    # Step 1: Initialize lists to hold simulation data
    all_simulations_data = []   # Inferred parameters
    all_targets_data = []       # Simulated parameters (targets)

    # Step 2: Dynamically extract and append data for each analysis type
    for sim_num, idx in enumerate(indices):
        sim_data = {}  # Dictionary to hold inferred parameters for each simulation
        target_data = {}  # Dictionary to hold target parameters for each simulation

        result_file = software_inferences_file_list[idx]
        with open(result_file, "rb") as f:
            result = pickle.load(f)

        # Collect moments_analysis data
        if experiment_config['moments_analysis']:
            for replicate, params in enumerate(result['opt_params_moments']):
                for key, value in params.items():
                    sim_data[f'Moments_rep{replicate+1}_{key}'] = value

        # Collect momentsLD_analysis data
        if experiment_config['momentsLD_analysis']:
            for key, value in result['opt_params_momentsLD'][0].items():
                sim_data[f'MomentsLD_{key}'] = value

        # Collect dadi_analysis data
        if experiment_config['dadi_analysis']:
            for replicate, params in enumerate(result['opt_params_dadi']):
                for key, value in params.items():
                    sim_data[f'Dadi_rep{replicate+1}_{key}'] = value

        # Collect simulated_params (targets)
        for key, value in result['simulated_params'].items():
            target_data[f'simulated_params_{key}'] = value

        # If upper triangular matrix exists, add it to the simulation data
        if 'upper_triangular_FIM' in result:
            upper_triangular = result['upper_triangular_FIM']
            upper_triangular_flat = upper_triangular.flatten()
            for i, value in enumerate(upper_triangular_flat):
                sim_data[f'upper_triangular_FIM_{i}'] = value

        # Append the inferred parameters and targets to the respective lists
        all_simulations_data.append(sim_data)
        all_targets_data.append(target_data)

    # Step 3: Create DataFrames from the simulation data
    features_df = pd.DataFrame(all_simulations_data, index=[f'Sim_{i}' for i in range(len(indices))])
    targets_df = pd.DataFrame(all_targets_data, index=[f'Sim_{i}' for i in range(len(indices))])

    # Store the DataFrames in the preprocessing object for later use
    preprocessing_results_obj[stage]["predictions"] = features_df
    preprocessing_results_obj[stage]["targets"] = targets_df

    # Save DataFrames for each stage (training or validation)
    features_df.to_csv(f"{sim_directory}/{stage}_features.csv", index=True)
    targets_df.to_csv(f"{sim_directory}/{stage}_targets.csv", index=True)

    # Save the DataFrames as .npy files for further processing
    np.save(f"{sim_directory}/{stage}_features.npy", features_df.values)
    np.save(f"{sim_directory}/{stage}_targets.npy", targets_df.values)

    # Optionally append to lists for inspection
    all_features_df.append(features_df)
    all_targets_df.append(targets_df)

# Save the preprocessing results object (optional)
with open(f"{sim_directory}/preprocessing_results_obj.pkl", "wb") as file:
    pickle.dump(preprocessing_results_obj, file)

In [56]:
preprocessing_results_obj['training']['predictions']

Unnamed: 0,Moments_rep1_Na,Moments_rep1_N1,Moments_rep1_N2,Moments_rep1_t_split,Moments_rep1_m,MomentsLD_N1,MomentsLD_N2,MomentsLD_t_split,MomentsLD_m,MomentsLD_Na
Sim_0,18057.907896,143.178711,25231.771451,2866.530445,347.502011,6100.547796,6870.737875,491.543,2.791314e-07,18081.669096
Sim_1,16835.863329,295.572694,12872.380975,4902.40043,516.940701,9154.300651,6813.389638,4415.722594,3.659896e-07,13471.646041
Sim_2,11700.80176,136.241235,12055.333469,3086.306722,320.845565,5340.745718,2107.358581,2188.295785,3.867876e-07,13263.804691
Sim_3,19187.88693,131.819279,13752.30807,3717.355294,416.652689,7224.588743,8275.063931,1791.849289,3.602836e-07,15045.749286
Sim_4,6940.069496,82.348248,3883.774331,837.690403,243.297959,617.317539,1620.882473,2645.254084,3.90657e-07,13624.263559
Sim_5,17116.083287,192.750509,14292.848201,2799.862026,466.545665,6544.097383,3551.748672,827.537205,3.175798e-07,16342.780954
Sim_6,13642.517299,220.896422,8485.675472,1497.662546,364.394687,2981.69216,6568.028088,422.618472,4.162745e-07,12253.06623
Sim_7,14647.263164,220.444369,11447.979197,1284.100167,267.538265,378.962844,6112.499524,729.832798,5.982285e-06,15429.015504
Sim_8,15427.514944,237.288114,14885.998174,1652.982039,215.813112,599.074184,6068.169527,3345.459448,2.624331e-07,20306.22522
Sim_9,13748.79639,144.598936,18598.815462,2176.475567,364.669866,5745.622462,4267.551026,4824.886292,3.421451e-07,14848.239808


In [57]:
preprocessing_results_obj['training']['targets']

Unnamed: 0,simulated_params_t_split,simulated_params_N1,simulated_params_N2,simulated_params_Na,simulated_params_m
Sim_0,487,6029,8506,19485,0
Sim_1,4513,9230,6812,12732,0
Sim_2,2178,5276,2075,12719,0
Sim_3,1903,7541,8640,14234,0
Sim_4,2598,607,1567,13544,0
Sim_5,878,6125,3764,16417,0
Sim_6,427,3144,8208,11962,0
Sim_7,753,381,6214,15441,0
Sim_8,3242,570,5905,19603,0
Sim_9,4665,5641,4165,14808,0


In [58]:
a = np.load('/sietch_colab/akapoor/Demographic_Inference/split_isolation_model_seed_42/sims/without_momentsld_sims_pretrain_20_sims_inference_1_seed_42_num_replicates_2_top_values_1/training_features.npy')

In [59]:
a.shape

(16, 10)

In [17]:
import pickle
with open('/sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_False_seed_42/sims/sims_pretrain_10_sims_inference_1_seed_42_num_replicates_3_top_values_2/preprocessing_results_obj.pkl', 'rb') as f:
    preprocessing_results_obj = pickle.load(f)

In [18]:
preprocessing_results_obj['training']['predictions']

Unnamed: 0,Moments_rep1_Na,Moments_rep1_N1,Moments_rep1_N2,Moments_rep1_t_split,Moments_rep1_m,Moments_rep1_upper_triangular_FIM,Moments_rep2_Na,Moments_rep2_N1,Moments_rep2_N2,Moments_rep2_t_split,...,Dadi_rep1_Na,Dadi_rep1_N1,Dadi_rep1_N2,Dadi_rep1_t_split,Dadi_rep1_m,Dadi_rep2_Na,Dadi_rep2_N1,Dadi_rep2_N2,Dadi_rep2_t_split,Dadi_rep2_m
Sim_0,16718.617842,174.766697,9311.243786,4242.885612,1932.34899,"[-15680664.270421999, 1093.534726149046, -5286...",16680.901392,163.652917,7583.046418,1889.760948,...,12681.880779,116.832174,11590.273208,2.536376,1723.355871,14187.585355,281.426224,14952.442696,324.332098,878.992666
Sim_1,17716.815096,130.434291,12817.356998,1328.941011,1979.390581,"[-281901375.14451814, 341.51612224037444, 1076...",18834.193818,115.187817,10671.094816,2935.077046,...,14546.443165,260.43467,18161.66186,2.909289,2244.073567,14640.741659,122.939732,19776.777184,2.928148,1844.292243
Sim_2,12970.520939,88.652475,19553.284931,3011.798144,1206.944759,"[-46724484.723111436, 176.5364593866449, -3209...",14218.926854,207.623735,12701.122381,1097.147206,...,12061.215664,135.868504,7370.15044,2.412243,2019.25444,12094.057077,103.578422,9967.525294,2.418811,1729.354693
Sim_3,10417.264176,84.084224,8875.023282,1394.515246,630.036633,"[69854724.93896893, 12300.038050825111, -17016...",10056.719397,199.225921,10842.19198,850.862041,...,10327.508922,150.352342,5962.972038,429.36254,620.073997,9845.089313,126.669229,15057.951292,281.801363,964.128427
Sim_4,16955.620273,158.407798,16428.033549,2215.742935,1355.448173,"[26342166.68064195, 4420.975171222963, -254745...",17552.395961,258.603812,12329.747608,1641.88654,...,14428.337838,226.980182,10411.78736,2.885668,1319.717622,14375.387072,480.118198,12488.416277,2.875077,1807.821256
Sim_5,16854.857759,130.015561,14679.24104,3557.229825,1039.924306,"[-9109739.245094717, -541.5161471326063, -1910...",15194.580444,144.072802,22045.640821,3940.55104,...,16955.004961,201.096452,9134.287112,670.668123,2631.550961,20709.573689,459.446345,4237.757955,1763.623665,1579.732912
Sim_6,25415.250661,300.018548,11926.42419,4262.397267,4704.48723,"[41823007.54389563, 10836.138475786733, -17594...",26016.822288,369.646332,12084.233948,5987.066201,...,18827.148434,352.379898,20561.253533,3.76543,3056.526698,18792.714472,530.359431,10171.658813,3.758543,2677.685216
Sim_7,17196.245113,230.462079,14615.768033,3340.332295,1525.059132,"[25804645.44111893, 10539.505662401178, -11475...",17859.733338,254.376162,12016.705057,2477.283675,...,14439.700759,201.267979,13780.759339,2.88794,2013.297622,14445.52375,193.941863,8230.792844,2.889105,2123.362114


In [19]:
import numpy as np
a = np.load('/sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_False_seed_42/sims/sims_pretrain_10_sims_inference_1_seed_42_num_replicates_3_top_values_2/training_features.npy')

ValueError: Object arrays cannot be loaded when allow_pickle=False