## Scatterplot of raw moments/dadi against simulated params

In [6]:
import os

In [7]:
def get_overlapping_simulations(software_dir, momentsLD_dir):
    """
    Find simulations that have both a software and a momentsLD inference file.

    Files are matched on the integer simulation number at the end of their
    name: ``software_inferences_sim_<N>.pkl`` in ``software_dir`` and
    ``momentsLD_inferences_sim_<N>.pkl`` in ``momentsLD_dir``. Files with the
    right prefix/suffix but a non-integer number are ignored.

    A short summary (counts and up to five matched simulation numbers) is
    printed as a side effect.

    Args:
        software_dir (str): Path to software inferences directory
        momentsLD_dir (str): Path to momentsLD inferences directory

    Returns:
        tuple: ``(software_files, momentsLD_files)`` — two lists of full paths
        with equal length, ordered by simulation number, where index ``i`` of
        both lists refers to the same simulation. Both lists are empty when
        no simulation is present in both directories.
    """

    def _simulation_number(filename, prefix):
        # Return the trailing integer of '<prefix><N>.pkl', or None when the
        # name does not follow that pattern.
        if not (filename.startswith(prefix) and filename.endswith('.pkl')):
            return None
        try:
            return int(filename.split('_')[-1].split('.')[0])
        except ValueError:
            return None

    # Simulation number -> momentsLD file name
    momentsLD_file_dict = {}
    for filename in os.listdir(momentsLD_dir):
        num = _simulation_number(filename, 'momentsLD_inferences_sim_')
        if num is not None:
            momentsLD_file_dict[num] = filename

    # (simulation number, software path, momentsLD path) for every match
    matches = []
    for filename in os.listdir(software_dir):
        num = _simulation_number(filename, 'software_inferences_sim_')
        if num is not None and num in momentsLD_file_dict:
            matches.append((
                num,
                os.path.join(software_dir, filename),
                os.path.join(momentsLD_dir, momentsLD_file_dict[num]),
            ))

    # Numeric (not lexicographic) ordering, so sim_10 comes after sim_2.
    matches.sort(key=lambda match: match[0])

    # Built with comprehensions rather than zip(*...) so that an empty match
    # list yields two empty lists instead of an unpacking error.
    overlapping_software_files = [software_path for _, software_path, _ in matches]
    overlapping_momentsLD_files = [momentsLD_path for _, _, momentsLD_path in matches]

    print(f"Found {len(momentsLD_file_dict)} momentsLD files")
    print(f"Found {len(overlapping_software_files)} overlapping pairs")

    if matches:
        print("\nSample of overlapping simulation numbers:")
        for num, _, _ in matches[:5]:
            print(f"Simulation {num}")

    return overlapping_software_files, overlapping_momentsLD_files

# Directories holding the per-simulation inference pickles
# (software_inferences_sim_<N>.pkl and momentsLD_inferences_sim_<N>.pkl).
software_inferences_dir = "/sietch_colab/akapoor/Demographic_Inference/software_inferences_dir"
momentsLD_inferences_dir = "/sietch_colab/akapoor/Demographic_Inference/final_LD_inferences"

# Paired lists of full paths, ordered by simulation number; the same position
# in both lists refers to the same simulation. Later cells rely on that pairing.
software_files, momentsLD_files = get_overlapping_simulations(software_inferences_dir, momentsLD_inferences_dir)

Found 4923 momentsLD files
Found 4923 overlapping pairs

Sample of overlapping simulation numbers:
Simulation 0
Simulation 1
Simulation 2
Simulation 3
Simulation 4


In [8]:
import pickle
# Load the first matched software-inference pickle to inspect its structure.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open(software_files[0], 'rb') as f:
    software_data = pickle.load(f)

In [12]:
# Inspect the moments fits of that simulation: the recorded output is a list
# with one dict per replicate (parameter estimates, 'upper_triangular_FIM', 'll').
software_data['opt_params_moments']

[{'Na': 23876.280308249967,
  'N1': 24649.63175921235,
  'N2': 14357.349884380737,
  't_split': 17159.112138441425,
  'm': 184.39996895868322,
  'upper_triangular_FIM': array([-4.45638571e+05, -9.07620137e+04,  6.67789909e+05, -3.50415609e+05,
         -1.22600390e+06,  1.37925430e+06, -6.01616301e+05, -3.90933710e+06,
          2.01353442e+06,  1.24911311e+08]),
  'll': 7106.4181658968155},
 {'Na': 23870.675780510344,
  'N1': 24641.84156521614,
  'N2': 14365.933378706419,
  't_split': 17168.876655566513,
  'm': 191.3479371661034,
  'upper_triangular_FIM': array([-4.42215289e+05, -9.14434040e+04,  6.68562058e+05, -2.25093321e+05,
         -1.21882177e+06,  1.37915113e+06, -5.79152827e+05, -3.87425377e+06,
          1.93840776e+06,  1.15950327e+08]),
  'll': 7106.414255299631}]

In [9]:
import pandas as pd
import numpy as np
import pickle
import json
from sklearn.model_selection import train_test_split

experiment_config_file = '/sietch_colab/akapoor/Demographic_Inference/experiment_config.json'
# NOTE: the two directories below are not read in this cell; the file lists
# come from `software_files` / `momentsLD_files` built earlier.
software_inferences_dir = "/sietch_colab/akapoor/Demographic_Inference/software_inferences_dir"
momentsLD_inferences_dir = "/sietch_colab/akapoor/Demographic_Inference/final_LD_inferences"

# Load configuration
with open(experiment_config_file, "r") as f:
    experiment_config = json.load(f)

parameters = ["Na", "N1", "N2", "t_split"]
# Number of dadi/moments replicates read per simulation (config key 'top_values_k').
replicates = experiment_config['top_values_k']

# Containers for predictions and targets (one dict per simulation)
software_predictions_data = []
momentsLD_predictions_data = []
targets_data = []

# Process software inference files
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
for filepath in software_files:
    with open(filepath, 'rb') as f:
        sim_data = pickle.load(f)

    row = {}

    for replicate in range(1, replicates + 1):
        dadi_fit = sim_data['opt_params_dadi'][replicate - 1]
        moments_fit = sim_data['opt_params_moments'][replicate - 1]

        for param in parameters:
            row[f"dadi_rep{replicate}_{param}"] = dadi_fit[param]
            row[f"moments_rep{replicate}_{param}"] = moments_fit[param]

        # Extract FIM elements as separate columns if present. The check is
        # made on the replicate that is actually read (not on replicate 0),
        # so a replicate without a FIM no longer raises KeyError.
        # NOTE(review): every replicate writes to the same FIM_element_<i>
        # keys, so only the last replicate's FIM survives in the row. Left
        # unchanged to keep the exported column set stable — confirm intended.
        if 'upper_triangular_FIM' in moments_fit:
            for i, fim_val in enumerate(moments_fit['upper_triangular_FIM']):
                row[f"FIM_element_{i}"] = fim_val

    software_predictions_data.append(row)
    targets_data.append({f"simulated_params_{param}": sim_data['simulated_params'][param]
                        for param in parameters})

# Process MomentsLD inference files (only the first entry of each file is used)
for filepath in momentsLD_files:
    with open(filepath, 'rb') as f:
        momentsLD_sim_data = pickle.load(f)

    row = {}

    for param in parameters:
        val = momentsLD_sim_data['opt_params_momentsLD'][0][param]
        if np.isnan(val):
            print(f"nan value for {param} in {filepath}")
        row[f"momentsLD_{param}"] = val

    momentsLD_predictions_data.append(row)

# Create DataFrames
software_df = pd.DataFrame(software_predictions_data)
momentsLD_df = pd.DataFrame(momentsLD_predictions_data)
targets_df = pd.DataFrame(targets_data)

# The column-wise concat below pairs rows by position, so all three tables
# must describe the same simulations in the same order.
assert len(software_df) == len(momentsLD_df) == len(targets_df)

# Combine software and momentsLD predictions
combined_predictions_df = pd.concat([software_df, momentsLD_df], axis=1)

# Drop any row that has at least one NaN value, and drop the same rows from
# the targets so both tables stay aligned.
valid_indices = combined_predictions_df.dropna().index
combined_predictions_df = combined_predictions_df.loc[valid_indices].reset_index(drop=True)
targets_df = targets_df.loc[valid_indices].reset_index(drop=True)

# Filter based on bounds for all methods and parameters
mask = pd.Series(True, index=combined_predictions_df.index)

# Column prefixes to bounds-check. Derived from `replicates` so the filter
# always covers exactly the replicate columns extracted above (for
# top_values_k == 2 this is momentsLD, dadi_rep1, dadi_rep2, moments_rep1,
# moments_rep2).
methods = ['momentsLD'] + [
    f"{tool}_rep{replicate}"
    for tool in ('dadi', 'moments')
    for replicate in range(1, replicates + 1)
]

for param in parameters:
    # Get bounds for this parameter
    lower = experiment_config['lower_bound_params'][param]
    upper = experiment_config['upper_bound_params'][param]

    # A simulation is kept only if every method's estimate is within bounds
    for method in methods:
        col_name = f"{method}_{param}"
        param_mask = (combined_predictions_df[col_name] >= lower) & (combined_predictions_df[col_name] <= upper)
        mask &= param_mask

# Apply final mask to both dataframes
combined_predictions_df = combined_predictions_df[mask].reset_index(drop=True)
targets_df = targets_df[mask].reset_index(drop=True)

# Final sanity checks: filtering only removes rows, so no NaN can reappear
# and both tables must still have one row per retained simulation.
assert not combined_predictions_df.isna().any().any()
assert len(combined_predictions_df) == len(targets_df)

# Print shapes to verify
print("Final shapes:")
print(f"Predictions shape: {combined_predictions_df.shape}")
print(f"Targets shape: {targets_df.shape}")



Final shapes:
Predictions shape: (3826, 30)
Targets shape: (3826, 4)


In [None]:
# Split row positions 80/20 into training and validation sets; the fixed
# random_state makes the split reproducible.
train_indices, val_indices = train_test_split(
    range(len(combined_predictions_df)),
    random_state=42,
    test_size=0.2,
)

# One entry per split holding the selected prediction rows, the matching
# target rows (both re-indexed from 0) and the original row positions.
preprocessing_results_obj = {
    split_name: {
        "predictions": combined_predictions_df.iloc[split_indices].reset_index(drop=True),
        "targets": targets_df.iloc[split_indices].reset_index(drop=True),
        "indices": split_indices,
    }
    for split_name, split_indices in (
        ("training", train_indices),
        ("validation", val_indices),
    )
}
preprocessing_results_obj["parameter_names"] = parameters

In [None]:
# Display the training-split prediction table (rows selected by train_indices).
preprocessing_results_obj['training']['predictions']

In [None]:
# Size of the validation split (20% of the filtered rows per test_size=0.2).
preprocessing_results_obj['validation']['predictions'].shape

In [None]:
# Write the four split tables (training/validation x features/targets) into
# the experiment directory; index=False keeps the row index out of the CSVs.
for split_name in ("training", "validation"):
    for table_key, file_suffix in (("predictions", "features"), ("targets", "targets")):
        preprocessing_results_obj[split_name][table_key].to_csv(
            '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/'
            + f'{split_name}_{file_suffix}.csv',
            index=False,
        )

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Parameters to plot, one subplot each (same four names used when building
# the prediction/target tables).
parameters = ['Na', 'N1', 'N2', 't_split']

# Training-split tables: targets hold the 'simulated_params_<param>' columns,
# predictions hold the per-method estimate columns.
ground_truth = preprocessing_results_obj['training']['targets']
predictions = preprocessing_results_obj['training']['predictions']

def calculate_mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    The error is computed element-wise as ``|(y_true - y_pred) / y_true|``,
    averaged with ``np.mean`` and expressed in percent. Inputs must support
    element-wise arithmetic (e.g. NumPy arrays or pandas Series). There is no
    guard against zeros in ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100

# Create 2x2 subplots, one panel per parameter (all panels share the y axis)
fig, axes = plt.subplots(2, 2, figsize=(12, 10), sharey=True)
fig.suptitle('Predicted vs Ground Truth for Parameters', fontsize=16)

# Flatten axes for easy indexing
axes = axes.flatten()

# (legend name, prediction column prefix, marker colour), drawn in this order.
# Only the first replicate of moments and dadi is shown.
method_styles = [
    ('Moments', 'moments_rep1', 'red'),
    ('Dadi', 'dadi_rep1', 'blue'),
    ('MomentsLD', 'momentsLD', 'green'),
]

for i, param in enumerate(parameters):
    ax = axes[i]

    # Extract ground truth for this parameter
    ground_truth_param = ground_truth[f'simulated_params_{param}']

    # One scatter layer per method, labelled with its MAPE against the truth
    for method_label, column_prefix, colour in method_styles:
        method_predictions = predictions[f'{column_prefix}_{param}']
        method_mape = calculate_mape(ground_truth_param, method_predictions)
        ax.scatter(ground_truth_param, method_predictions, color=colour, alpha=0.5,
                   label=f'{method_label} MAPE: {method_mape:.2f}%')

    # Perfect prediction line (y = x over the range of the ground truth)
    identity_limits = [ground_truth_param.min(), ground_truth_param.max()]
    ax.plot(identity_limits, identity_limits,
            color='black', linestyle='--', label='Perfect Prediction')

    ax.set_title(f'{param}')
    ax.set_xlabel(f'Ground Truth {param}')
    # Only the left column gets a y label because the y axis is shared
    if i % 2 == 0:
        ax.set_ylabel('Predicted Value')
    ax.legend()

# Adjust layout
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust for suptitle
plt.show()


## Postprocessing results

In [None]:
# Switch the working directory to the project root before importing the
# project-local postprocessing function.
# NOTE(review): presumably required so `snakemake_scripts` resolves from the
# working directory — confirm. The chdir also affects every later cell that
# uses relative paths.
os.chdir('/sietch_colab/akapoor/Demographic_Inference/')
from snakemake_scripts.postprocessing import postprocessing

In [None]:
# Inputs for postprocessing: the experiment config plus the four split CSVs
# (same paths the export cell writes training/validation features and targets to).
config_file = '/sietch_colab/akapoor/Demographic_Inference/experiment_config.json'
training_features_filepath = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/training_features.csv'
training_targets_filepath = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/training_targets.csv'
validation_features_filepath = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/validation_features.csv'
validation_targets_filepath = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/validation_targets.csv'

# Run the postprocessing function. Later cells read the result's
# 'training' / 'validation' entries ('predictions', 'targets',
# 'normalized_targets'); what the function does internally is not visible here.
postprocessing_dict = postprocessing(
    config_file,
    training_features_filepath,
    training_targets_filepath,
    validation_features_filepath,
    validation_targets_filepath
)

# Save the postprocessing results next to the CSVs so the feature-extraction
# step can reload them from disk.
with open('/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/postprocessing_results.pkl', 'wb') as f:
    pickle.dump(postprocessing_dict, f)



In [None]:
# Display the training targets returned by postprocessing.
postprocessing_dict['training']['targets']

In [None]:
# Display the training predictions returned by postprocessing.
postprocessing_dict['training']['predictions']

## Plotting results

In [None]:
# Calculate MAPE and standard errors for each parameter and method
params = ['N1', 'N2', 'Na', 't_split']
methods = ['Moments', 'Dadi', 'MomentsLD']

# Prediction column prefixes pooled for each method: both replicates for
# moments and dadi, the single estimate for momentsLD.
method_column_prefixes = {
    'Moments': ['moments_rep1', 'moments_rep2'],
    'Dadi': ['dadi_rep1', 'dadi_rep2'],
    'MomentsLD': ['momentsLD'],
}

training_predictions = postprocessing_dict['training']['predictions']
training_targets = postprocessing_dict['training']['targets']

mape_values = {}
mape_errors = {}

for param in params:
    mape_values[param] = {}
    mape_errors[param] = {}
    # Plain arrays so that every method is compared to the targets by row
    # position, with identical NaN handling for all three methods.
    target_values = np.asarray(training_targets[f'simulated_params_{param}'])

    for method in methods:
        column_prefixes = method_column_prefixes[method]

        # Stack the replicate columns end to end and repeat the targets once
        # per replicate so both arrays line up element by element.
        pooled_predictions = np.concatenate([
            np.asarray(training_predictions[f'{prefix}_{param}'])
            for prefix in column_prefixes
        ])
        pooled_targets = np.tile(target_values, len(column_prefixes))

        # Absolute percentage error of every pooled estimate
        percentage_errors = np.abs((pooled_predictions - pooled_targets) / pooled_targets) * 100

        # Mean absolute percentage error over all pooled estimates
        mape_values[param][method] = np.mean(percentage_errors)

        # Standard error of that mean (population std / sqrt(n)).
        # NOTE(review): pooled replicates of the same simulation are counted
        # as independent observations here — confirm that is acceptable.
        mape_errors[param][method] = np.std(percentage_errors) / np.sqrt(len(percentage_errors))

# Create DataFrame: one row per parameter, '<method>' and '<method>_std_err'
# columns for each method
result_columns = {}
for method in methods:
    result_columns[method] = [mape_values[p][method] for p in params]
    result_columns[f'{method}_std_err'] = [mape_errors[p][method] for p in params]
result_df = pd.DataFrame(result_columns, index=params)

print("MAPE Values and Standard Errors:")
print(result_df)

# Create grouped bar plot
x = np.arange(len(params))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# One bar group per parameter; the bars of a group are centred on its tick
for position, method in enumerate(methods):
    offset = (position - (len(methods) - 1) / 2) * width
    ax.bar(x + offset, result_df[method], width,
           yerr=result_df[f'{method}_std_err'],
           label=method, capsize=5)

ax.set_ylabel('MAPE (%)')
ax.set_title('Mean Absolute Percentage Error by Parameter and Method')
ax.set_xticks(x)
ax.set_xticklabels(params)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Rich display of the MAPE / standard-error table (rows: parameters).
result_df

In [None]:
# Persist the MAPE table in the experiment directory; the parameter names
# are written as the CSV index column.
result_df.to_csv('/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/mape_values_preprocess.csv')

In [None]:
# Column-wise average over the four parameters, for every MAPE and
# std-err column of the table.
np.mean(result_df, axis = 0)

## Extracting Features

In [None]:
def getting_the_features(postprocessing_results_filepath, sim_directory):
    """Repackage pickled postprocessing results into a features/targets pickle.

    Loads the pickle at ``postprocessing_results_filepath``, takes the
    'predictions' (as "features") and 'normalized_targets' (as "targets") of
    its 'training' and 'validation' entries, prints the loaded keys and the
    four shapes, and writes the resulting dict to
    ``<sim_directory>/features_and_targets.pkl``.

    Args:
        postprocessing_results_filepath (str): Pickle holding the postprocessing results.
        sim_directory (str): Directory that receives ``features_and_targets.pkl``.

    Returns:
        None
    """
    # NOTE(review): pickle.load can execute arbitrary code — trusted files only.
    with open(postprocessing_results_filepath, "rb") as results_handle:
        loaded_results = pickle.load(results_handle)

    print(loaded_results.keys())

    features = {
        split: {
            "features": loaded_results[split]['predictions'],
            "targets": loaded_results[split]['normalized_targets'],
        }
        for split in ("training", "validation")
    }

    # Report features first, then targets, each for training then validation.
    for field in ("features", "targets"):
        for split in ("training", "validation"):
            print(f'{split.capitalize()} {field} shape: {features[split][field].shape}')

    # Now save the dictionary as a pickle
    output_path = f"{sim_directory}/features_and_targets.pkl"
    with open(output_path, "wb") as output_handle:
        pickle.dump(features, output_handle)


In [None]:
# Input pickle (written by the postprocessing cell) and the experiment
# directory that will receive features_and_targets.pkl.
postprocessing_results_filepath = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/postprocessing_results.pkl'
sim_directory = '/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2'

In [None]:
# Write <sim_directory>/features_and_targets.pkl and print the table shapes.
getting_the_features(postprocessing_results_filepath, sim_directory)

In [None]:
# Reload the file just written to check its content round-trips.
# NOTE(review): pickle.load can execute arbitrary code — trusted files only.
with open('/sietch_colab/akapoor/Demographic_Inference/sims_pretrain_5000_sims_inference_1_seed_42_num_replicates_3_top_values_2/features_and_targets.pkl', "rb") as file:
    features = pickle.load(file)

In [None]:
# Display the training targets stored in the features file (taken from the
# 'normalized_targets' entry of the postprocessing results).
features['training']['targets']