# Postprocessing and Model Building

In [2]:
import os
import pickle
import json

In [3]:
main_filepath = '/sietch_colab/akapoor/Demographic_Inference'



CONFIG_FILEPATH = f'{main_filepath}/experiment_config.json'
MODEL_CONFIG_FILEPATH = f'{main_filepath}/model_config.json'

with open(CONFIG_FILEPATH, 'r') as f:
   experiment_config = json.load(f)

with open(MODEL_CONFIG_FILEPATH, 'r') as f:
   model_config = json.load(f)


CWD = os.getcwd()

# Use double quotes for the dictionary keys inside the f-string
EXPERIMENT_DIRECTORY = f"{experiment_config['demographic_model']}_dadi_analysis_{experiment_config['dadi_analysis']}_moments_analysis_{experiment_config['moments_analysis']}_momentsLD_analysis_{experiment_config['momentsLD_analysis']}_seed_{experiment_config['seed']}"
EXPERIMENT_NAME = f'sims_pretrain_{experiment_config["num_sims_pretrain"]}_sims_inference_{experiment_config["num_sims_inference"]}_seed_{experiment_config["seed"]}_num_replicates_{experiment_config["k"]}_top_values_{experiment_config["top_values_k"]}'
SIM_DIRECTORY = f"{EXPERIMENT_DIRECTORY}/sims/{EXPERIMENT_NAME}"

# Check if hidden_size is a list, and if so, join the elements with "_"
hidden_size = model_config['neural_net_hyperparameters']['hidden_size']
if isinstance(hidden_size, list):
    hidden_size_str = "_".join(map(str, hidden_size))  # Join list elements with "_"
else:
    hidden_size_str = str(hidden_size)  # Convert integer to string if not a list

# Build the MODEL_DIRECTORY string
MODEL_DIRECTORY = (
    f"{EXPERIMENT_DIRECTORY}/models/{EXPERIMENT_NAME}/"
    f"num_hidden_neurons_{hidden_size_str}_"
    f"num_hidden_layers_{model_config['neural_net_hyperparameters']['num_layers']}_"
    f"num_epochs_{model_config['neural_net_hyperparameters']['num_epochs']}_"
    f"dropout_value_{model_config['neural_net_hyperparameters']['dropout_rate']}_"
    f"weight_decay_{model_config['neural_net_hyperparameters']['weight_decay']}_"
    f"batch_size_{model_config['neural_net_hyperparameters']['batch_size']}_"
    f"EarlyStopping_{model_config['neural_net_hyperparameters']['EarlyStopping']}"
)


In [4]:
os.chdir('/sietch_colab/akapoor/Demographic_Inference/')
os.environ['PYTHONPATH'] = '/sietch_colab/akapoor/Demographic_Inference:' + os.environ.get('PYTHONPATH', '')

## Postprocessing

In [10]:
!python /sietch_colab/akapoor/Demographic_Inference/snakemake_scripts/postprocessing.py \
    --config_file $CONFIG_FILEPATH \
    --training_features_filepath $SIM_DIRECTORY/training_features.csv \
    --training_targets_filepath $SIM_DIRECTORY/training_targets.csv \
    --validation_features_filepath $SIM_DIRECTORY/validation_features.csv \
    --validation_targets_filepath $SIM_DIRECTORY/validation_targets.csv \
    --sim_directory $SIM_DIRECTORY



Processing training data:
===> Normalizing the data.

Processing validation data:
===> Normalizing the data.
Postprocessing dict keys: dict_keys(['normalization', 'predictions', 'normalized_predictions', 'targets', 'normalized_targets'])
Postprocessing complete!


## Extracting Features

In [12]:
!python /sietch_colab//akapoor/Demographic_Inference/snakemake_scripts/extracting_features.py \
 --postprocessing_results_filepath $SIM_DIRECTORY/postprocessing_results.pkl \
 --sim_directory $SIM_DIRECTORY

dict_keys(['parameter_names', 'target_names', 'training', 'validation'])
Training features shape: (375, 40)
Validation features shape: (94, 40)
Training targets shape: (375, 4)
Validation targets shape: (94, 4)


## Linear Evaluation

In [41]:
!python /sietch_colab/akapoor/Demographic_Inference/snakemake_scripts/linear_evaluation.py \
     --features_and_targets_filepath /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/features_and_targets.pkl \
     --model_config_path /sietch_colab/akapoor/Demographic_Inference/model_config.json \
     --color_shades_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/color_shades.pkl \
     --main_colors_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/main_colors.pkl \
     --experiment_config_filepath /sietch_colab/akapoor/Demographic_Inference/experiment_config.json \
     --regression_type standard

Model directory created/verified: split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/models/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/num_hidden_neurons_10_num_hidden_layers_2_num_epochs_500_dropout_value_0_weight_decay_0_batch_size_64_EarlyStopping_False
Initializing LinearRegression with kwargs={}
PREDICTIONS SHAPE TRAINING: (375, 4)
[2.17040379e-04 1.60429237e-04 1.06441734e-04 1.50368701e-03
 8.26248087e-04 1.89832255e-04 1.24870579e-03 1.76666686e-04
 4.68037115e-03 5.99968006e-05 3.64516289e-04 6.07166169e-04
 1.99164679e-05 1.89186152e-04 2.24410129e-03 4.44461757e-04
 8.30250839e-04 3.20050702e-04 6.67372049e-04 4.36815092e-03
 3.40148062e-05 2.65255535e-03 7.39937702e-04 4.93398717e-04
 3.78875481e-04 1.62686124e-03 8.04517662e-04 2.97266536e-04
 6.15999161e-04 1.40511056e-04 1.53534570e-02 2.21760329e-04
 1.80026333e-04 6.05329087e-04 1.14544154e-03 1.49647558e-04
 4.27424353e-04 5.11889835e-04 4.53690

## Random Forest

In [64]:
!python /sietch_colab/akapoor/Demographic_Inference/snakemake_scripts/random_forest_evaluation.py \
    --features_and_targets_filepath /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/features_and_targets.pkl \
    --model_config_path /sietch_colab/akapoor/Demographic_Inference/model_config.json \
    --color_shades_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/color_shades.pkl \
    --main_colors_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/main_colors.pkl \
    --experiment_config_filepath /sietch_colab/akapoor/Demographic_Inference/experiment_config.json

Model directory created/verified: split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/models/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/num_hidden_neurons_1000_num_hidden_layers_2_num_epochs_100_dropout_value_0_weight_decay_0_batch_size_64_EarlyStopping_False

No hyperparameters specified. Running RandomizedSearchCV to find best hyperparameters...

Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best hyperparameters found via RandomizedSearchCV: {'random_state': 2023, 'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 30}

Initializing RandomForestRegressor with kwargs={'n_estimators': 50, 'max_depth': 30, 'random_state': 2023, 'min_samples_split': 5}

Random Forest predictions shape (training): (375, 4)
Random Forest predictions shape (validation): (94, 4)

[1.40051926e-04 7.35507236e-05 1.76780185e-04 3.72394739e-04
 6.29923147e-04 2.30621027e-04 1.10078211e-05 2.70376461e-04
 2.41653069e-04 

## XGBoost

In [66]:
 !python /sietch_colab/akapoor/Demographic_Inference/snakemake_scripts/xgboost_evaluation.py \
    --features_and_targets_filepath /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/features_and_targets.pkl \
    --model_config_path /sietch_colab/akapoor/Demographic_Inference/model_config.json \
    --color_shades_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/color_shades.pkl \
    --main_colors_file /sietch_colab/akapoor/Demographic_Inference/split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/sims/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/main_colors.pkl \
    --experiment_config_filepath /sietch_colab/akapoor/Demographic_Inference/experiment_config.json

Model directory created/verified: split_isolation_model_dadi_analysis_True_moments_analysis_True_momentsLD_analysis_True_seed_42/models/sims_pretrain_500_sims_inference_1_seed_42_num_replicates_3_top_values_2/num_hidden_neurons_1000_num_hidden_layers_2_num_epochs_100_dropout_value_0_weight_decay_0_batch_size_64_EarlyStopping_False

No XGBoost hyperparameters specified. Running RandomizedSearchCV to find best hyperparameters...

Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best hyperparameters found via RandomizedSearchCV: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 20, 'learning_rate': 0.05, 'colsample_bytree': 0.6}

Initializing XGBRegressor with kwargs={'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.05, 'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 3, 'reg_lambda': 1, 'reg_alpha': 0}

XGBoost predictions shape (training): (375, 4)
XGBoost predictions shape (validation): (94, 4

## Neural Network

In [None]:
    # parser.add_argument("--experiment_directory", type=str, required=True)
    # parser.add_argument("--model_config_file", type=str, required=True)
    # parser.add_argument("--features_file", type=str, required=True)
    # parser.add_argument("--color_shades", type=str, required=True)
    # parser.add_argument("--main_colors", type=str, required=True)

In [50]:
!python /sietch_colab/akapoor/Demographic_Inference/snakemake_scripts/setup_trainer.py \
    --experiment_directory $EXPERIMENT_DIRECTORY \
    --model_config_file $MODEL_CONFIG_FILEPATH \
    --features_file $SIM_DIRECTORY/features_and_targets.pkl \
    --color_shades $SIM_DIRECTORY/color_shades.pkl \
    --main_colors $SIM_DIRECTORY/main_colors.pkl

Max Values in the dataset:
  Training features max: dadi_rep1_Na                  29463.672544
moments_rep1_Na               29856.497542
moments_rep1_FIM_element_0        0.254189
moments_rep1_FIM_element_1        0.287948
moments_rep1_FIM_element_2       17.340964
moments_rep1_FIM_element_3        0.653049
moments_rep1_FIM_element_4        0.120641
moments_rep1_FIM_element_5       18.464149
moments_rep1_FIM_element_6        0.658112
moments_rep1_FIM_element_7        0.343955
moments_rep1_FIM_element_8        2.814908
moments_rep1_FIM_element_9        4.545339
dadi_rep1_N1                  29554.623419
moments_rep1_N1               29885.956435
dadi_rep1_N2                  29582.069317
moments_rep1_N2               29868.185783
dadi_rep1_t_split             19930.419102
moments_rep1_t_split          19817.833040
dadi_rep2_Na                  29611.360053
moments_rep2_Na               29857.481577
moments_rep2_FIM_element_0        0.261474
moments_rep2_FIM_element_1        0.412274
mo

In [20]:
import torch
print("Is CUDA available?", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.current_device())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

Is CUDA available? False
Number of GPUs: 0




RuntimeError: No CUDA GPUs are available