In [1]:
from dustbi_simulator import *
from Functions import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd
# Simulated input table; it is later handed to make_simulator as `df`.
# The separator is a regex, so it must be a raw string ('\s' is an invalid
# escape sequence in a normal string literal).
df = pd.read_csv("INPUT_DES5YR_D2D.FITRES", comment="#", sep=r'\s+')

# Derived column: SIM_EBV = SIM_AV / SIM_RV.
df['SIM_EBV'] = df.SIM_AV/df.SIM_RV


# Table the posterior is conditioned on (passed around as `dfdata`).
dfdata = pd.read_csv("SIMS_FOR_TESTING/FITOPT000.FITRES.gz", 
                     comment="#", sep=r'\s+')

#dfdata = pd.read_csv("../INVERSE_H0/D5YR_DATA/FITOPT000_MUOPT000.FITRES.gz", comment="#", sep=r'\s+')

try:
    dfdata['SIM_EBV'] = dfdata.SIM_AV/dfdata.SIM_RV
except AttributeError:
    # Only a missing SIM_AV / SIM_RV column is tolerated here (attribute access
    # on a missing column raises AttributeError); any other failure should
    # surface instead of being swallowed by a bare `except`.
    print("eh.")

# Keep only rows with IDSURVEY == 10 and PROB_SNNV19 >= 0.5.
dfdata = dfdata.loc[dfdata.IDSURVEY == 10]
dfdata = dfdata.loc[dfdata.PROB_SNNV19 >= 0.5]

  from pandas.core import (


In [3]:
# One 2-tuple per simulated quantity.
# NOTE(review): presumably (lower, upper) bounds enforced by the simulator --
# confirm in dustbi_simulator.
bounds_dict = {
    "SIM_c"   : (-0.5, 0.5),
    "SIM_RV"  : (1.5, 5),
    "SIM_EBV" : (0,1),
    "SIM_beta": (0.5,4),
}

# Distribution class used for each simulated quantity.
function_dict = {
    "SIM_c"   : DistGaussian,
    "SIM_RV"  : DistGaussian,
    "SIM_EBV" : DistExponential,
    "SIM_beta": DistGaussian,
}

# Optional population split: maps a quantity to [column name, threshold].
# Splitting is currently DISABLED (empty dict). This cell used to build the
# dict below and then immediately overwrite it with {}; only the final value
# was ever used, so it is kept here as a reference configuration:
#
#   split_dict = {
#   #    "SIM_RV":["HOST_LOGMASS", 10],
#       "SIM_EBV":['HOST_LOGMASS', 10],
#       'SIM_c':['HOST_LOGMASS', 10]
#   }
split_dict = {}


# Priors: each entry is a list holding one (low, high) tuple per distribution
# parameter (two for DistGaussian entries, one for DistExponential), optionally
# followed by a boolean flag.
# NOTE(review): the meaning of the trailing boolean is not visible in this
# notebook -- confirm in prior_generator.
priors_dict = {
    
    "SIM_c"   : [(-0.2, 0), (0.0, 0.1), False],
    "SIM_RV"  : [(1.5,4), (0,2), True],
    "SIM_EBV" : [(0.05, 0.3)],
    "SIM_beta": [(0,3), (0,1), True],
    
}

# LaTeX strings for the distribution parameters (keyed by distribution class
# name) and for the simulated quantities (keyed by column name).
latex_dict = {
    
    'DistExponential':[r'$\tau$'],
    'DistGaussian':[r'$\mu$', r'$\sigma$'],
    'SIM_c':r"$c_{\rm int}$",
    'SIM_beta':r"$\beta_{\rm int}$",
    'SIM_RV':r"$R_V$",
    'SIM_EBV':r"$EBV$",
    
}


# Order matters: helpers receive this list positionally.
dicts = [bounds_dict, function_dict, split_dict, priors_dict]

In [4]:
# Quantities whose population distributions are inferred; every name must be a
# key of the configuration dicts bundled in `dicts`.
param_names = ['SIM_c', 'SIM_RV', 'SIM_beta', 'SIM_EBV']
#param_names = ['SIM_c']


# Build the fit-parameter list and the prior from the configuration dicts.
# The recorded output of this cell reports 7 BoxUniform priors.
params_to_fit = parameter_generation(param_names, dicts)
priors = prior_generator(param_names, dicts)

Total priors added: 7
[0] <class 'sbi.utils.torchutils.BoxUniform'>
[1] <class 'sbi.utils.torchutils.BoxUniform'>
[2] <class 'sbi.utils.torchutils.BoxUniform'>
[3] <class 'sbi.utils.torchutils.BoxUniform'>
[4] <class 'sbi.utils.torchutils.BoxUniform'>
[5] <class 'sbi.utils.torchutils.BoxUniform'>
[6] <class 'sbi.utils.torchutils.BoxUniform'>


In [5]:
# Layout object built from the fit parameters and config dicts; it is passed
# to make_simulator in the cells below.
layout = build_layout(params_to_fit, dicts)

In [6]:
# Column names the inference is conditioned on; len() of this list sets `ndim`.
parameters_to_condition_on = ['c', 'mB', 'x1']

In [7]:
# Simulator callable used for training: maps a parameter vector theta to simulated output.
# NOTE(review): is_split=True is passed although split_dict is currently empty -- confirm this is intended.
simulatinator = make_simulator(layout, df, param_names, parameters_to_condition_on, dicts, dfdata, is_split=True)


In [12]:
# One feature per conditioned-on column; the count is doubled when at least one
# fitted parameter has an entry in split_dict.
ndim = len(parameters_to_condition_on) * (
    2 if any(name in split_dict for name in param_names) else 1
)

In [8]:
def batched_simulator(theta_batch):
    """Run ``simulatinator`` on each element of ``theta_batch`` and stack the results.

    Args:
        theta_batch: iterable of parameter vectors; iterating a 2-D tensor
            yields one row per simulation.

    Returns:
        ``torch.stack`` of the per-theta simulator outputs, in input order.
    """
    outputs = []
    for single_theta in theta_batch:
        outputs.append(simulatinator(single_theta))
    return torch.stack(outputs)

In [9]:
from sbi import analysis as analysis

# sbi
from sbi import utils as utils
from sbi.inference import NPE, simulate_for_sbi
from sbi.utils.user_input_checks import (
    check_sbi_inputs,
    process_prior,
    process_simulator,
)

In [10]:
# Check prior, simulator, consistency
prior, num_parameters, prior_returns_numpy = process_prior(priors)
simulation_wrapper = process_simulator(simulatinator, prior, prior_returns_numpy)
check_sbi_inputs(simulation_wrapper, prior)

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F




In [14]:
from sbi.inference import SNPE
from sbi.utils import MultipleIndependent

from sbi.neural_nets import posterior_nn




# Potentially upgraded version: permutation-invariant population embedding network

In [15]:
class PopulationEmbeddingFull(nn.Module):
    """Permutation-invariant embedding of a set of objects.

    Each object is encoded independently by ``phi``, the encodings are averaged
    over the set dimension, and ``rho`` maps the pooled vector to the final
    embedding. Because of the mean pooling, the output does not depend on the
    order of the objects.

    Args:
        input_dim: number of features per object. When omitted, the notebook
            global ``ndim`` is read at construction time. (It used to be frozen
            into the signature when the class cell was executed, which goes
            stale if ``ndim`` is recomputed afterwards.)
        hidden_dim: width of the hidden layers of ``phi`` and ``rho``.
        output_dim: size of the returned embedding.
    """

    def __init__(self, input_dim=None, hidden_dim=64, output_dim=32):
        super().__init__()
        if input_dim is None:
            # Late-bind the default so it reflects the current value of `ndim`.
            input_dim = ndim
        # Per-object encoder, applied along the last dimension.
        self.phi = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        # Post-pooling network producing the embedding.
        self.rho = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """Embed ``x`` of shape (batch_size, N, input_dim) into (batch_size, output_dim)."""
        # x: (batch_size, N, input_dim)
        h = self.phi(x)           # (batch_size, N, hidden_dim)
        h = h.mean(dim=1)         # mean over N samples -> (batch_size, hidden_dim)
        return self.rho(h)        # (batch_size, output_dim)


In [16]:
from sbi.inference import SNPE
from sbi.utils import MultipleIndependent

from sbi.neural_nets import posterior_nn

# Density-estimator builder: the chosen flow model on top of the set-embedding network.
# NOTE(review): input_dim=4 is hard-coded here while `ndim` is computed from
# parameters_to_condition_on / split_dict in an earlier cell -- confirm it
# matches the last dimension of the simulator output.
density_estimator = posterior_nn(
    model="nsf", # neural spline flow is currently selected
    embedding_net=PopulationEmbeddingFull(input_dim=4)
)

# Inference object that collects (theta, x) pairs and trains the estimator.
inference = SNPE(
    prior=priors,
    density_estimator=density_estimator, 
)





In [None]:
import torch
import os

batch_size = 100
num_simulations = 3000
save_path = "simulations_v1.pt"

# If the file already exists, start fresh
if os.path.exists(save_path):
    os.remove(save_path)

# Keep every batch in memory so the checkpoint can be rewritten without first
# reloading the whole file from disk on each iteration (which made the I/O
# quadratic in the number of batches and triggered the torch.load warning).
theta_chunks = []
x_chunks = []

for start in range(0, num_simulations, batch_size):
    # The last batch may be smaller when batch_size does not divide num_simulations.
    current_bs = min(batch_size, num_simulations - start)

    # Sample and simulate
    theta_batch = priors.sample((current_bs,))
    x_batch = batched_simulator(theta_batch)

    # Append to SBI inference
    inference.append_simulations(theta_batch, x_batch)

    # Save incrementally: the file always holds everything simulated so far,
    # under the same 'theta' / 'x' keys as before.
    theta_chunks.append(theta_batch)
    x_chunks.append(x_batch)
    torch.save(
        {'theta': torch.cat(theta_chunks, dim=0), 'x': torch.cat(x_chunks, dim=0)},
        save_path,
    )

    print(f"Appended {start + current_bs}/{num_simulations} simulations and saved incrementally.")

print(f"All simulations saved incrementally to '{save_path}'")




Appended 100/3000 simulations and saved incrementally.


  data = torch.load(save_path)


Appended 200/3000 simulations and saved incrementally.




Appended 300/3000 simulations and saved incrementally.




Appended 400/3000 simulations and saved incrementally.


In [None]:
#inference.append_simulations(theta_batch, x_batch)

# Train on everything appended by the simulation loop. Note that this rebinds
# `density_estimator` from the builder returned by posterior_nn to the value
# returned by train().
density_estimator = inference.train()

print("\n inferred successfully")

posterior = inference.build_posterior(density_estimator)

# Persist the posterior object so it can be reloaded without retraining.
torch.save(posterior, "posterior.pt")


In [None]:
# Reload the checkpoint written by the simulation loop. The file only holds a
# dict of tensors, so restrict unpickling to tensor data (weights_only=True);
# this also avoids the torch.load FutureWarning seen in the loop's output.
# Note: this rebinds theta_batch / x_batch to the FULL saved set.
data = torch.load("simulations_v1.pt", weights_only=True)
theta_batch = data["theta"]
x_batch = data["x"]


In [None]:
# Observation the posterior is conditioned on, built from `dfdata`.
# NOTE(review): a preprocess_data is defined in the last cell of this notebook,
# i.e. after this call; confirm which definition is in scope here (star import
# vs. notebook cell) and that it handles the current empty split_dict.
x = preprocess_data(param_names, parameters_to_condition_on, split_dict, dfdata)

In [None]:
# Axis labels for the posterior parameters; indexed in the same order as theta_hat further down.
labels = unspool_labels(param_names, dicts, latex_dict, function_dict)

In [None]:
# Draw 50000 posterior samples conditioned on the observation x.
posterior_samples = posterior.sample((50000,), x=x)


In [None]:
# Corner plot of the posterior samples; the trailing `;` suppresses the repr output.
fig, axes = analysis.pairplot(
    posterior_samples,
    labels=labels

);

In [None]:
# Point estimate: posterior mean over the sample dimension (dim 0).
theta_hat = posterior_samples.mean(0)


In [None]:
theta_hat


In [None]:
# Posterior standard deviation per parameter, over the sample dimension (dim 0).
posterior_samples.std(0)

In [None]:
from IPython.display import display, Math


In [None]:
# Posterior standard deviation per parameter, computed once rather than being
# re-evaluated over all samples on every loop iteration.
theta_std = posterior_samples.std(0)

# Render "label = mean +/- std" for each inferred parameter.
for n, mean_n in enumerate(theta_hat):
    string = rf"{labels[n]} = {mean_n:.3f} +/- {theta_std[n]:.3f}"
    display(Math(string))


In [None]:
# Reference parameter vector with 7 entries, matching the 7 priors reported above.
# NOTE(review): not used by any other visible cell, and the source of these
# values is not recorded here -- confirm ordering and provenance.
true_params = torch.tensor([-0.07, 0.53, 2, 0.95, 2.07, 0.22, 0.14,])

In [None]:
simulatinator = make_simulator(layout, df, param_names, dicts, dfdata, debug=True)


In [None]:
dft = simulatinator(theta_hat)

#dft = simulatinator(torch.tensor([[-0.1006,  0.0507,  2.7590,  1.0042,  1.4923,  0.5086,  0.142]]))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Colour (c): debug-simulator output vs. data, on shared bins, normalised to unit area.
bins = np.linspace(-0.4, 0.4, 20)

for series, legend_label in ((dft.c.values, "sim output"), (dfdata.c.values, "data")):
    plt.hist(series, histtype='step', bins=bins, label=legend_label, density=True)

plt.legend()
plt.xlabel("c")

In [None]:
# mB: debug-simulator output vs. data, on shared bins, normalised to unit area.
bins = np.linspace(18, 26, 20)

for series, legend_label in ((dft.mB.values, "sim output"), (dfdata.mB.values, "data")):
    plt.hist(series, histtype='step', bins=bins, label=legend_label, density=True)

plt.legend()
plt.xlabel("mB")

In [None]:
# SIM_EBV: debug-simulator output vs. data, on shared bins, normalised to unit area.
# dfdata only has SIM_EBV when the loading cell could derive it from SIM_AV / SIM_RV.
bins = np.linspace(0, 0.6, 20)

for series, legend_label in ((dft.SIM_EBV.values, "sim output"), (dfdata.SIM_EBV.values, "data")):
    plt.hist(series, histtype='step', bins=bins, label=legend_label, density=True)


plt.legend()
plt.xlabel("E(B-V)")

# Posterior calibration check (simulation-based calibration, SBC)

In [None]:

num_calib = 100  # how many trials for calibration

ranks = []

# The posterior object does not depend on the trial, so build it once instead
# of rebuilding it on every loop iteration; the observation is supplied per
# call via `x=`.
posterior = inference.build_posterior(density_estimator, sample_with="mcmc")

for _ in range(num_calib):
    # Sample a "true" parameter from the prior
    theta_true = priors.sample((1,))  # shape (1, num_parameters)

    # Simulate data for that theta
    x_sim = simulatinator(theta_true)

    samples = posterior.sample((200,), x=x_sim)

    # Rank of the true parameter in the posterior samples: per parameter, the
    # fraction of samples that fall below the true value. theta_true broadcasts
    # against the (200, num_parameters) sample matrix.
    # Note: ranks of all parameters are pooled into a single list.
    ranks.extend((samples < theta_true).float().mean(dim=0).tolist())

# ranks should be ~Uniform[0,1] if well-calibrated
import matplotlib.pyplot as plt
plt.hist(ranks, bins=20)
plt.xlabel("Rank")
plt.ylabel("Frequency")
plt.title("SBC Histogram")
plt.show()


In [None]:
# How to read the SBC rank histogram above:
#   Flat histogram → well-calibrated.
#
#   U-shaped → posteriors too narrow (overconfident).
#
#   Bell-shaped → posteriors too wide (underconfident).

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Posterior predictive check: draw parameters from the posterior conditioned on
# the observation, push each draw through the simulator, and pool the outputs.
posterior = inference.build_posterior(density_estimator)
posterior_samples = posterior.sample((1000,), x=x)

# Each draw is passed as a (1, num_parameters) tensor; the outputs are joined
# along dim 0.
simulated_data = torch.cat(
    [simulatinator(theta_s.unsqueeze(0)) for theta_s in posterior_samples],
    dim=0,
)

# Compare histograms of observed vs simulated


In [None]:
# Observed vs. posterior-predictive values, all features flattened into one
# histogram each and normalised to unit area.
_hist_style = dict(bins=30, alpha=0.5, density=True)
plt.hist(x.numpy().flatten(), label="observed", **_hist_style)
plt.hist(
    simulated_data.numpy().flatten(),
    label="posterior predictive",
    histtype="step",
    **_hist_style,
)
plt.legend()
plt.show()


In [None]:
# Scratch version of the splitting logic that preprocess_data() (next cell) wraps.
# NOTE(review): `device` and `output_distribution` are not defined in any
# visible cell, and with the current empty split_dict `matching[0]` raises
# IndexError. This cell also overwrites the observation `x`.
# First fitted parameter that has a split configured.
matching = [p for p in param_names if p in split_dict]
name = matching[0]

# split_dict entries are [column name, threshold value].
split_param = split_dict[name][0]
split_val   = split_dict[name][1]


# Values of the split column, taken from the debug simulation output `dft`.
split_tensor = torch.tensor(
dft[split_param].to_numpy(),
dtype=torch.float32,
device=device
)

x = split_outputs(
    output_distribution,
    split_tensor,
    split_val,
    parameters_to_condition_on
)


In [None]:
def preprocess_data(param_names, parameters_to_condition_on, split_dict, dfdata, ):
    """Turn ``dfdata`` into the observation passed to the posterior.

    Args:
        param_names: names of the fitted quantities; used only to find out
            whether any of them has an entry in ``split_dict``.
        parameters_to_condition_on: column names forwarded to
            ``preprocess_input_distribution`` and ``split_outputs``.
        split_dict: maps a quantity name to ``[column name, threshold]``. May be
            empty, in which case no split is applied.
        dfdata: table to preprocess; it must contain the split column when a
            split is configured.

    Returns:
        The output of ``split_outputs`` when a split is configured, otherwise
        the unsplit output of ``preprocess_input_distribution``.
    """
    output_distribution = preprocess_input_distribution(dfdata, parameters_to_condition_on)

    matching = [p for p in param_names if p in split_dict]
    if not matching:
        # Nothing to split (e.g. split_dict == {}). This mirrors the `ndim`
        # logic, which only doubles the feature count when a split exists.
        # Previously this path raised IndexError on matching[0].
        return output_distribution

    # Only the first matching entry defines the split column and threshold.
    split_param, split_val = split_dict[matching[0]][0], split_dict[matching[0]][1]

    # The split column must come from the SAME table as output_distribution.
    # The earlier version read the notebook-global `df` here, which is a
    # different table from the `dfdata` argument.
    split_tensor = torch.tensor(
        dfdata[split_param].to_numpy(),
        dtype=torch.float32,
        )

    x = split_outputs(
        output_distribution,
        split_tensor,
        split_val,
        parameters_to_condition_on
        )

    return x