In [18]:
import json
import math
import os
import random
import traceback

import numpy as np
import pandas as pd
from distfit import distfit
from tqdm import notebook # Library for displaying progress bar

# Number of particle-size bins per snapshot. It determines how many
# Input_Bin_*/Output_Bin_* columns are written to the CSV and is the factor
# used to scale the normalised output distribution into integer counts.
bin_count = 151

def _normalize_bins(values, floor=1e-30):
    """Scale ``values`` so they sum to 1 and zero out entries smaller than ``floor``.

    A snapshot whose sum is not strictly positive (all zeros, or NaN) is returned
    as all zeros instead of propagating the NaNs that 0/0 would produce.
    """
    values = np.asarray(values, dtype=float)
    total = np.sum(values)
    if not total > 0:
        return np.zeros_like(values)
    normalized = values / total
    normalized[normalized < floor] = 0
    return normalized


def process_sample(i, snapshot_count, rhod, time, bin_size=15, num_bins=151, params=None):
    """Create one training row from two snapshots of the same model.

    Parameters
    ----------
    i : int
        Sample number. Sample 0 always pairs the first snapshot with the last one;
        every other sample draws two snapshot indices at random (they are drawn
        independently, so both may refer to the same snapshot).
    snapshot_count : int
        Number of snapshots available in ``rhod`` / ``time``.
    rhod : sequence of 1-D arrays
        Per-snapshot bin values, indexed as ``rhod[snapshot]``.
    time : sequence of float
        Time of each snapshot, indexed like ``rhod``.
    bin_size : int, optional
        Unused; kept so existing callers keep working.
    num_bins : int, optional
        Factor used to turn the normalised output distribution into integer
        pseudo-counts before fitting.
    params : sequence of float, optional
        Model parameters placed at the start of the row. When omitted, the
        module-level ``input_params`` is used (the historical behaviour).

    Returns
    -------
    numpy.ndarray
        ``params`` + normalised input bins + ``[t, delta_t]`` + normalised
        output bins + ``[expected value of the fitted exponnorm distribution]``.
        The expected value is 0 when no pseudo-samples could be generated.
    """
    if params is None:
        # Backward compatibility: earlier versions always read this global.
        params = input_params

    # First sample will always be the first and last element
    if i == 0:
        first, second = 0, snapshot_count - 1
    else:
        # Lowest index is the input snapshot, highest the output snapshot
        first, second = sorted(random.randint(0, snapshot_count - 1) for _ in range(2))

    input_bins = _normalize_bins(rhod[first])
    # Normalised so the fitted probability distribution can be compared to it
    output_bins = _normalize_bins(rhod[second])

    # Turn the output distribution into pseudo-samples: bin index k is repeated
    # floor(p_k * num_bins) times.
    counts = np.floor(output_bins * num_bins).astype(int)
    pseudo_sample = np.repeat(np.arange(len(output_bins)), counts)

    if pseudo_sample.size == 0:
        exgauss_expected_value = 0
    else:
        # Fit the exponentially modified Gaussian only when there is data to fit
        dist = distfit(distr='exponnorm')
        dist.fit_transform(pseudo_sample, verbose=1)
        K, loc, scale = dist.model['params']
        # Expected value of the fitted distribution
        exgauss_expected_value = loc + (K * scale)

    # Time of the input snapshot and the gap to the output snapshot
    input_time = time[first]
    delta_t = time[second] - input_time

    return np.concatenate([params, input_bins, [input_time, delta_t], output_bins, [exgauss_expected_value]])

def write_to_file(data, header=True, batch=False):
    """Write training rows to the CSV at the module-level ``filename``.

    Parameters
    ----------
    data : sequence of rows
        Rows to write; each row must have one value per column listed below.
    header : bool, optional
        Whether to emit the column names as the first line of this write.
    batch : bool, optional
        When True the rows are appended to the file; otherwise the file is
        overwritten.

    The number of ``Input_Bin_*`` / ``Output_Bin_*`` columns comes from the
    module-level ``bin_count``.
    """
    columns = (
        ['R', 'Mstar', 'alpha', 'd2g', 'sigma', 'Tgas']
        + [f'Input_Bin_{i}' for i in range(bin_count)]
        + ['t', 'Delta_t']
        + [f'Output_Bin_{i}' for i in range(bin_count)]
        + ["Output_Exgauss_Mean"]
    )
    # Use the rows that were passed in, not whatever the global buffer holds
    df = pd.DataFrame(data, columns=columns)

    # If writing in batch set the file mode to append
    mode = 'a' if batch else 'w'
    df.to_csv(filename, chunksize=100000, mode=mode, header=header, index=False)
    
filename = '/scratch/keh4nb/dust_training_data_all_bins_bayes_100.csv'
root_data_path = "/project/SDS-capstones-kropko21/uva-astronomy/data/dust_coag_data_v1"
data_group = "combined_v1"

# Seed the snapshot-pair sampling so a re-run produces the same training set
random_seed = 42
random.seed(random_seed)

# Store formatted data for training
res = []

chunk_size = 100
# Set this to a smaller number to get a smaller training set
model_count = 100
writes = 0

# The parameter dictionary is identical for every model, so read it once
with open(os.path.join(root_data_path, "model_dict_v1.json")) as f:
    model_dict = json.load(f)

for d in notebook.tqdm(range(model_count)):
    data_set = str(d).zfill(5)

    data_dir = f"{root_data_path}/{data_group}/data_{data_set}"

    # Extract the input parameters.
    # NOTE: process_sample reads the module-level name `input_params`.
    input_dict = model_dict[data_set]
    input_params = [input_dict[key] for key in ('R', 'Mstar', 'alpha', 'd2g', 'sigma', 'Tgas')]

    try:
        # `rho_dat`: The dust mass density (in g/cm^3) in each particle size/bin at a given snapshot in time. This is the main "output", i.e., the primary result, of any given model.
        # ndmin=2 keeps a single-snapshot file as one row instead of a flat vector
        rhod = np.loadtxt(os.path.join(data_dir, "rho_d.dat"), ndmin=2)
        # Replace NaNs with 0s
        rhod = np.nan_to_num(rhod)
        # Replace negative values with 0s
        rhod = np.where(rhod < 0, 0, rhod)

        # `a_grid.dat`: The dust particle size in each "bin" in centimeters.
        # Not used below, but a missing/unreadable file still skips the model.
        a_grid = np.loadtxt(os.path.join(data_dir, 'a_grid.dat'))

        # `time.dat`: The time of each snapshot (in seconds).
        time = np.loadtxt(os.path.join(data_dir, "time.dat"), ndmin=1)
    except Exception:
        print(f'model {d} skipped')
        traceback.print_exc()
        snapshot_count = 0
    else:
        snapshot_count = len(rhod)
        if snapshot_count == 0:
            print(f'model {d} skipped')

    if snapshot_count > 0:
        # Set the number of samples
        if snapshot_count > 15:
            # Cap at 100 random pairs; C(15, 2) = 105 is about 100
            samples = 100
        else:
            # The number of distinct snapshot pairs, C(n, 2)
            samples = snapshot_count * (snapshot_count - 1) // 2

        # One extra sample for the fixed (first, last) pair
        samples += 1
        for i in range(samples):
            row = process_sample(i, snapshot_count, rhod, time)
            res.append(row)

    # Write to csv every x models to avoid oom. This check runs even when the
    # current model was skipped, and always on the final model, so buffered rows
    # are never left unwritten.
    on_chunk_boundary = d != 0 and d % chunk_size == (model_count - 1) % chunk_size
    is_last_model = d == model_count - 1
    if res and (on_chunk_boundary or is_last_model):
        writes += 1
        # Only write the header on first chunk
        header = writes == 1
        # The first chunk overwrites any file left by a previous run; later chunks append
        write_to_file(res, header, batch=not header)
        res = []

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [24]:
import pandas as pd
# Sanity check: show the first ten rows of the generated training file
pd.read_csv(filepath_or_buffer='/scratch/keh4nb/dust_training_data_all_bins_bayes_100.csv', nrows=10).head(n=10)


Unnamed: 0,R,Mstar,alpha,d2g,sigma,Tgas,Input_Bin_0,Input_Bin_1,Input_Bin_2,Input_Bin_3,...,Output_Bin_142,Output_Bin_143,Output_Bin_144,Output_Bin_145,Output_Bin_146,Output_Bin_147,Output_Bin_148,Output_Bin_149,Output_Bin_150,Output_Exgauss_Mean
0,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.004347,0.004517,0.004694,0.004877,...,0.0,0.0,7.363925e-08,2e-06,0.007944,0.005785,0.026953,0.116548,0.842768,149.805268
1,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,4.685381e-16,1.51941e-15,3.093949e-05,0.000222,0.012034,0.010959,0.027267,0.112318,0.837169,149.790588
2,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,7.144664000000001e-18,2.488595e-17,1.631775e-05,0.000136,0.011395,0.010144,0.027147,0.113038,0.838123,149.785895
3,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,6.891912999999999e-19,2.494373e-18,1.149224e-05,0.000104,0.011075,0.009736,0.027101,0.113386,0.838586,149.785895
4,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,1.377272e-05,2.514533e-05,0.002410983,0.005786,0.020473,0.022946,0.034801,0.095616,0.817898,149.696024
5,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,0.00127566,0.001867369,0.00876471,0.015132,0.028372,0.035028,0.047348,0.080027,0.7783,149.506726
6,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,3.6561060000000005e-23,1.538793e-22,2.747402e-06,3.5e-05,0.009941,0.008302,0.027001,0.114565,0.840152,149.785895
7,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,3.4773369999999996e-19,1.272484e-18,1.038112e-05,9.7e-05,0.010985,0.009623,0.02709,0.113482,0.838713,149.785895
8,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,5.850053e-15,1.813233e-14,4.593136e-05,0.000299,0.012468,0.011516,0.027376,0.111804,0.836491,149.790588
9,0.316228,1.0,1e-05,0.0001,9559.802528,177.827941,0.0,0.0,0.0,0.0,...,3.32442e-07,6.889823e-07,0.001059487,0.003145,0.017846,0.0189,0.03125,0.102463,0.825335,149.748239
