The code below goes through each model of the simulated data, and created training data from the output distributions. It will sample up to 101 pairs of entry in each model always including the first and last pair. It calculates delta_t (time in seconds) between the pairs and the mean of the second pair which is what we want to predict. 

First we did not do any transformations.

In [3]:
import pandas as pd
import numpy as np
import os
import json
import random
import math
from tqdm import notebook # Library for displaying progress bar

These are helper functions pulled out from the main code to make it easier to modify.

In [1]:
def process_sample(i, snapshot_count, rhod, time):
    """ Creates a training sample from two points in time and calculates the mean of the output"""
    # First sample will always be the first and last element
    if i == 0:
        idxs = [0, snapshot_count-1]
    else:
        # Pick two indexes for snapshots (lowest = input, highest = output)
        idxs = sorted([random.randint(0,snapshot_count-1) for _ in range(2)])
        
    input_a = rhod[idxs[0]]
    
    # Time of the input
    t = time[idxs[0]]
        
    # Difference of time in seconds between two snapshots
    delta_t = time[idxs[1]] - t
    
    # Target variable is the average density of another snapshot in time
    output_mean = np.mean(rhod[idxs[1]])
    
    row = np.concatenate([input_params,input_a,[t, delta_t, output_mean]])
    return row

def write_to_file(data, header=True, batch=False):
    """ Helper method to write training data to a file"""
    columns = ['R', 'Mstar', 'alpha', 'd2g', 'sigma', 'Tgas'] + [f'Bin_{i}' for i in range(rhod.shape[1])] + ['t','Delta_t', 'y']
    df = pd.DataFrame(res, columns=columns)

    # If writing in batch set the file mode to append
    mode = 'a' if batch else 'w'
    df.to_csv(filename, chunksize=100000, mode=mode, header=header, index=False)

This code will loop through each model, create a training row with y (average of second distribution).

In [None]:
filename = 'dust_training_data_small_20.csv'
root_data_path = "/project/SDS-capstones-kropko21/uva-astronomy/data/dust_coag_data_v1"
data_group = "combined_v1"

# Store formatted data for training
res = []

chunk_size = 1000
model_count = 10000
writes = 0
for d in notebook.tqdm(range(model_count)):
    data_set = data_set = str(d).zfill(5)

    data_dir = f"{root_data_path}/{data_group}/data_{data_set}"

    input_params = None
    # Open and extract the input parameters
    with open(os.path.join(root_data_path, "model_dict_v1.json")) as f:
        model_dict = json.load(f)
        input_dict = model_dict[data_set]
        input_params = [input_dict['R'], input_dict['Mstar'], input_dict['alpha'],input_dict['d2g'], input_dict['sigma'], input_dict['Tgas']]

    try:
        # `rho_dat`: The dust mass density (in g/cm^3) in each particle size/bin at a given snapshot in time. This is the main "output", i.e., the primary result, of any given model.
        rhod = np.loadtxt(os.path.join(data_dir,"rho_d.dat"))
        # Replace NaNs with 0s
        rhod = np.nan_to_num(rhod)
        
        # `a_grid.dat`: The dust particle size in each "bin" in centimeters.
        a_grid = np.loadtxt(os.path.join(data_dir, 'a_grid.dat'))

        # `time.dat`: The time of each snapshot (in seconds).
        time = np.loadtxt(os.path.join(data_dir, "time.dat"))
    except Exception as e:
        print(f'model {d} skipped')
        import traceback
        print(traceback.print_exc())
        continue

    snapshot_count = len(rhod)

    # Set the number of samples (tried max of 10000 but jupyter kernel kept crashing)
    if snapshot_count > 15:
        # Set the max to 10000 for time as 150 cHr 2 is about 11000
       samples = 100
    else:
        # The number of pairs
       samples = int(math.factorial(snapshot_count) / math.factorial(2) / math.factorial(snapshot_count-2))
    
    samples += 1
    for i in range(samples):
        row = process_sample(i, snapshot_count, rhod, time)
        res.append(row)
        
    # Write to csv every x models to avoid oom
    #if d != 0 and d % chunk_size == (model_count - 1) % chunk_size:
    #    writes += 1
    #    # Only write the header on first chunk
    #    header = writes == 1
    
# Write out the entire file at the end
write_to_file(res)


In [22]:
read_df = pd.read_csv(filename)
read_df.describe()

Unnamed: 0,R,Mstar,alpha,d2g,sigma,Tgas,Bin_0,Bin_1,Bin_2,Bin_3,...,Bin_145,Bin_146,Bin_147,Bin_148,Bin_149,Bin_150,Bin_151,t,Delta_t,y
count,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,...,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0,800131.0
mean,40714.883732,69.942751,1.0,0.016301,0.185592,698.739173,42.074054,4.327434e+48,4.558752e+48,4.7705850000000005e+48,...,1.824384e-14,1.430676e-14,2.317453e-13,1.673062e-13,1.617644e-12,8.228013e-12,2.907163e-10,3237179000000.0,5688001000000.0,-2.268093e+40
std,24265.921448,108.697432,0.0,0.034412,0.362841,1657.785735,42.523274,3.870892e+51,4.077805e+51,4.267291e+51,...,3.886467e-13,2.80319e-13,5.398921e-12,1.861797e-12,3.240495e-11,1.522909e-10,2.550048e-09,5482230000000.0,7769287000000.0,1.171337e+43
min,0.0,0.316228,1.0,1e-05,0.0001,0.152053,4.472136,-1.435424e-13,-1.036626e-14,-9.229458e-15,...,-1.509965e-248,9.18254e-198,-8.151477e-30,-1.275597e-22,-1.267761e-14,1.6099180000000002e-120,1.6099180000000002e-120,0.0,0.0,-6.049262e+45
25%,20003.0,2.792938,1.0,0.0001,0.001,2.110976,10.748423,1.459631e-40,4.8766620000000005e-39,2.981931e-37,...,1.935889e-116,1.964865e-116,1.935889e-116,1.964865e-116,1.935889e-116,1.964865e-116,1.964865e-116,94743000000.0,173553500000.0,6.894017999999999e-19
50%,40006.0,17.693273,1.0,0.001,0.01,22.842133,23.773653,2.077302e-20,2.1223959999999998e-20,2.2636169999999998e-20,...,1.1241600000000001e-114,1.140858e-114,1.1241600000000001e-114,1.140858e-114,1.1241600000000001e-114,1.140858e-114,1.1241600000000001e-114,668099000000.0,1825145000000.0,3.9316390000000006e-17
75%,60009.0,86.558651,1.0,0.01,0.1,364.214276,59.836936,3.742349e-18,3.567534e-18,3.546004e-18,...,7.191572e-76,9.738442e-92,2.607671e-109,4.194422e-110,5.8761329999999996e-111,2.794099e-111,1.4656759999999999e-111,3657707000000.0,8563278000000.0,3.931639e-15
max,95196.0,500.0,1.0,0.1,1.0,9559.802528,177.827941,3.462514e+54,3.6475990000000004e+54,3.8170930000000004e+54,...,2.83871e-11,4.156829e-11,4.237247e-10,3.257025e-10,1.441383e-09,6.47401e-09,5.333178e-08,31558150000000.0,31558150000000.0,1.3845659999999999e+40


Then we tried doing a log transformation on all of the bins and the resulting mean. The data had NaN and values less than 0, so we replaced these with 0. We also added a tiny constant to each one so we didn't take the log of 0.

In [None]:
def process_sample(i, sample_count, rhod, time):
    """ Creates a training sample from two points in time and calculates the mean of the output"""
    # First sample will always be the first and last element
    if i == 0:
        idxs = [0, snapshot_count-1]
    else:
        # Pick two indexes for snapshots (lowest = input, highest = output)
        idxs = sorted([random.randint(0,snapshot_count-1) for _ in range(2)])
        
    # Get the input distribution of dust mass density
    # Add a very small number to avoid taking the log of 0
    input_a = np.log10(rhod[idxs[0]] + np.finfo(np.float64).tiny)

    # Time of the input
    t = time[idxs[0]]
        
    # Difference of time in seconds between two snapshots
    delta_t = time[idxs[1]] - t
    
    # Target variable is the average density of another snapshot in time
    # Add a very small number to avoid taking the log of 0
    output_mean = np.mean(np.log10(rhod[idxs[1]] + np.finfo(np.float64).tiny))

    
    row = np.concatenate([input_params,input_a,[t, delta_t, output_mean]])
    return row
    
filename = 'dust_training_data_log_v2.csv'
root_data_path = "/project/SDS-capstones-kropko21/uva-astronomy/data/dust_coag_data_v1"
data_group = "combined_v1"

# Store formatted data for training
res = []

chunk_size = 1000
model_count = 10000
writes = 0
for d in notebook.tqdm(range(model_count)):
    data_set = data_set = str(d).zfill(5)

    data_dir = f"{root_data_path}/{data_group}/data_{data_set}"

    input_params = None
    # Open and extract the input parameters
    with open(os.path.join(root_data_path, "model_dict_v1.json")) as f:
        model_dict = json.load(f)
        input_dict = model_dict[data_set]
        input_params = [input_dict['R'], input_dict['Mstar'], input_dict['alpha'],input_dict['d2g'], input_dict['sigma'], input_dict['Tgas']]

    try:
        # `rho_dat`: The dust mass density (in g/cm^3) in each particle size/bin at a given snapshot in time. This is the main "output", i.e., the primary result, of any given model.
        rhod = np.loadtxt(os.path.join(data_dir,"rho_d.dat"))
        # Replace NaNs with 0s
        rhod = np.nan_to_num(rhod)
        # Replace negative values with 0s.. is this right?
        rhod = np.where(rhod<0, 0, rhod) 
        
        # `a_grid.dat`: The dust particle size in each "bin" in centimeters.
        a_grid = np.loadtxt(os.path.join(data_dir, 'a_grid.dat'))

        # `time.dat`: The time of each snapshot (in seconds).
        time = np.loadtxt(os.path.join(data_dir, "time.dat"))
    except Exception as e:
        print(f'model {d} skipped')
        import traceback
        print(traceback.print_exc())
        continue

    snapshot_count = len(rhod)

    # Set the number of samples (tried max of 10000 but jupyter kernel kept crashing)
    if snapshot_count > 15:
        # Set the max to 10000 for time as 150 cHr 2 is about 11000
       samples = 100
    else:
        # The number of pairs
       samples = int(math.factorial(snapshot_count) / math.factorial(2) / math.factorial(snapshot_count-2))
    
    #OR select each pair of rows in order
    #samples = len(rhod) - 1
    
    samples += 1
    for i in range(samples):
        row = process_sample(i, snapshot_count, rhod, time)
        res.append(row)
        #print(row)
        
    # Write to csv every x models
    if d != 0 and d % chunk_size == (model_count - 1) % chunk_size:
        writes += 1
        # Only write the header on first chunk
        header = writes == 1
        res.clear()
    
# Write out the entire file at the end
#write_to_file(res)

    

In [3]:
read_df = pd.read_csv('dust_training_data_log_v2.csv')
read_df.describe()

Unnamed: 0,R,Mstar,alpha,d2g,sigma,Tgas,Bin_0,Bin_1,Bin_2,Bin_3,...,Bin_144,Bin_145,Bin_146,Bin_147,Bin_148,Bin_149,Bin_150,t,Delta_t,y
count,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,...,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0,800338.0
mean,69.939112,1.0,0.016297,0.185802,698.571593,42.066924,-65.815133,-63.954152,-62.047036,-59.867839,...,-91.893412,-92.115311,-90.82278,-90.380698,-90.672768,-90.707993,-90.583315,3230440000000.0,5701342000000.0,-55.824754
std,108.683875,0.0,0.034409,0.36303,1657.638323,42.52049,94.410072,91.939246,89.493226,86.777187,...,40.341564,40.038341,42.617556,43.17369,43.455311,43.148602,43.713958,5474076000000.0,7782881000000.0,26.797276
min,0.316228,1.0,1e-05,0.0001,0.152053,4.472136,-307.652656,-307.652656,-307.652656,-307.652656,...,-307.652656,-192.615873,-307.652656,-307.652656,-307.652656,-119.793196,-119.793196,0.0,0.0,-294.263792
25%,2.792938,1.0,0.0001,0.001,2.110976,10.748423,-39.911313,-38.353616,-36.527678,-34.629517,...,-115.713119,-115.706667,-115.713119,-115.706667,-115.706667,-115.705023,-115.705023,94689490000.0,173498400000.0,-70.106592
50%,17.693273,1.0,0.001,0.01,22.842133,23.773653,-19.681661,-19.670043,-19.641722,-19.537965,...,-113.942768,-113.942768,-113.949172,-113.942768,-113.949172,-113.942768,-113.942768,665729900000.0,1825147000000.0,-54.455769
75%,86.558651,1.0,0.01,0.1,364.214276,59.836936,-17.427284,-17.450572,-17.45101,-17.373018,...,-75.994501,-96.571258,-108.635613,-109.443423,-110.260692,-110.589844,-110.856092,3651732000000.0,8583024000000.0,-37.795473
max,500.0,1.0,0.1,1.0,9559.802528,177.827941,-9.633772,-9.617105,-9.600438,-9.583772,...,-10.454954,-10.381238,-9.372916,-9.475613,-8.841221,-8.188826,-7.273014,31558150000000.0,31558150000000.0,-11.760821
