In [2]:
import tensorflow as tf
import numpy as np
import mdn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
from timeit import default_timer as timer

# Number of output bins: used to build the Output_Bin_0 .. Output_Bin_{bin_count - 1}
# column names and as the output dimension when sampling from the MDN predictions.
bin_count = 171

In [3]:
def create_test_train(data_set_path, test_size=0.10):
    """Load a CSV file and split it into training and testing features/targets.

    The target columns are ``Output_Bin_0`` .. ``Output_Bin_{bin_count - 1}``
    (``bin_count`` is the notebook-level constant); every other column in the
    file is kept as a feature.

    Parameters
    ----------
    data_set_path : path handed to ``pd.read_csv``.
    test_size : forwarded to ``train_test_split`` (default 0.10).

    Returns
    -------
    The result of ``train_test_split(features, targets, ...)``, unpacked by the
    caller as ``X_train, X_test, y_train, y_test``.
    """
    output_columns = [f'Output_Bin_{bin_index}' for bin_index in range(bin_count)]

    # Read the file and shuffle every row with a fixed seed
    shuffled = pd.read_csv(data_set_path).sample(frac=1, random_state=0)

    # Features are everything except the output bins; targets are only the output bins
    features = shuffled.drop(output_columns, axis=1)
    targets = shuffled[output_columns]

    # Fixed random_state keeps the split identical between runs
    return train_test_split(features, targets, test_size=test_size, random_state=300)

# The dataset location is a hard-coded absolute path; the commented-out line is an alternative file.
#filename = "/scratch/keh4nb/dust_training_data_all_bins_large.csv"
filename= "/project/SDS-capstones-kropko21/uva-astronomy/dust_training_data_all_bins_v2.csv"
# Hold out 10% of the rows for testing
X_train, X_test, y_train, y_test = create_test_train(filename, test_size=0.10)
# Summary statistics of the held-out output bins
display(y_test.describe())

Unnamed: 0,Output_Bin_0,Output_Bin_1,Output_Bin_2,Output_Bin_3,Output_Bin_4,Output_Bin_5,Output_Bin_6,Output_Bin_7,Output_Bin_8,Output_Bin_9,...,Output_Bin_161,Output_Bin_162,Output_Bin_163,Output_Bin_164,Output_Bin_165,Output_Bin_166,Output_Bin_167,Output_Bin_168,Output_Bin_169,Output_Bin_170
count,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,...,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0,142330.0
mean,0.004788404,0.003511234,0.003102781,0.003476701,0.003457728,0.003501525,0.003642096,0.003778553,0.003935451,0.004021584,...,0.001428,0.001252,0.001058,0.000883,0.000728,0.000676,0.001939,0.005278,0.01627,0.151275
std,0.01166852,0.00780303,0.006573299,0.007853398,0.006781768,0.006542634,0.006904298,0.006660142,0.007485042,0.00683656,...,0.007746,0.007059,0.006218,0.005424,0.00479,0.004623,0.006232,0.013469,0.040799,0.326542
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.413419e-11,5.957408e-11,2.004835e-10,7.735218e-10,2.20372e-09,4.972974e-09,1.233129e-08,2.64102e-08,5.567167e-08,1.080826e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0001091139,0.0001224256,0.0001354748,0.000179149,0.0002160863,0.0002491617,0.0002911052,0.0003537974,0.0004251557,0.000501017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.002666961,0.002534129,0.002509944,0.002984808,0.003315966,0.003667415,0.004060019,0.00450234,0.004900773,0.005306905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0935414,0.05693502,0.04501455,0.3978454,0.1493864,0.03745879,0.2716096,0.03748155,0.3097229,0.1159227,...,0.086597,0.087307,0.087176,0.087361,0.087655,0.09135,0.10538,0.133252,0.245536,0.999991


Load the first xgboost model and predict whether the output is all at the end or not

In [5]:
import xgboost as xgb

# Load the saved XGBoost classifier and predict a 0/1 label for every test row.
# NOTE(review): the saved output of this cell is `KeyError: 'Output_Is_End'`, a name that
# nothing in the current cell references — presumably stale output from an earlier version
# of the cell; re-run on a fresh kernel to confirm.
xgb_model_at_end = xgb.XGBClassifier()
xgb_model_at_end.load_model("xgb_model_is_end.model")
preds_at_end = xgb_model_at_end.predict(X_test)

# Index labels of the test rows predicted as class 1 (output at end)
output_is_end_idx = y_test.index[preds_at_end == 1]

KeyError: 'Output_Is_End'

Predict the output distributions for the rows classified as a spike at the end

In [None]:
# Test rows that the first classifier flagged as "output at end"
X_test_spike = X_test.loc[output_is_end_idx]
y_test_spike = y_test.loc[output_is_end_idx]

# k is passed both to the loss factory and to the sampler, so it has to be recorded
# (presumably the number of mixture components — confirm)
k = 32

# The custom MDN layer and loss are supplied through custom_objects when loading.
# NOTE(review): the loss is built with output dimension 1 while the sampling below
# uses bin_count — confirm this matches how the model was trained.
spiked_model = tf.keras.models.load_model(
    'spiked_mdn_model/model',
    custom_objects={
        'MDN': mdn.MDN,
        'mdn_loss_func': mdn.get_mixture_loss_func(1, k),
    },
)

# Model output: the predicted distribution parameters and weights for each row
preds_spike = spiked_model.predict(X_test_spike)

# Draw 5 samples per prediction (one sampler call per row of preds_spike, repeated 5 times)
samples_list_spike = [
    np.apply_along_axis(mdn.sample_from_output, 1, preds_spike, bin_count, k, temp=1.0, sigma_temp=1.0)
    for _ in range(5)
]

# Average the draws to obtain the predictions, one row of bin_count values per test row
y_samples_spike = np.mean(np.array(samples_list_spike), axis=0)
y_samples_spike = y_samples_spike.reshape(len(X_test_spike), bin_count)
# Negative values are replaced by 0
y_samples_spike = np.clip(y_samples_spike, 0, a_max=None)

In [None]:
from scipy.stats import entropy

# Rescale every predicted row by its own total so that it sums to 1
y_samples_spike_normalized = [row / np.sum(row) for row in y_samples_spike]

def obs_from_bins(bins, n=200):
    """Turn per-bin weights into a list of ``n`` bin-index observations.

    Bin ``i`` contributes ``floor(bins[i] * n)`` copies of ``i``. Because of the
    flooring the list is usually shorter than ``n``; the shortfall is filled with
    the highest bin index already present. If no bin reaches a single observation
    (every ``bins[i] * n < 1``) the index of the largest weight is used as the
    fill value instead of failing on ``max()`` of an empty list.

    Parameters
    ----------
    bins : sequence of float
        Per-bin weights. They are not normalized here, so weights summing to
        more than 1 can produce more than ``n`` observations.
    n : int, optional
        Target number of observations (default 200, the previously hard-coded value).

    Returns
    -------
    list of int
        Bin indices in ascending order, padded up to length ``n``.

    Raises
    ------
    ValueError
        If ``bins`` is empty (there is no bin index to observe).
    """
    # Scale the weights to whole observation counts
    counts = np.floor(np.multiply(bins, n))
    if len(counts) == 0:
        raise ValueError("obs_from_bins() needs at least one bin")

    new_samp = []
    for idx, count in enumerate(counts):
        if count:
            # Add the bin number `count` times
            new_samp.extend([idx] * int(count))

    shortfall = n - len(new_samp)
    if shortfall > 0:
        if new_samp:
            # Every padded value equals the current maximum, so compute it once
            fill = max(new_samp)
        else:
            # Nothing survived the flooring: fall back to the heaviest bin
            fill = int(np.argmax(np.asarray(bins)))
        new_samp.extend([fill] * shortfall)
    return new_samp


def evaluate_fit(y_samples, y_test):
    """Display summary statistics of the per-row divergence between truth and prediction.

    Each row of ``y_test`` (a DataFrame) and of ``y_samples`` (array-like of
    predicted bins) is converted with ``obs_from_bins``; the two results are then
    compared row by row with ``scipy.stats.entropy(true, predicted)``. Infinite
    values are replaced by 1000 before the ``describe()`` table is displayed.
    Nothing is returned.

    NOTE(review): ``entropy`` receives the observation lists of bin indices, not
    per-bin probabilities — confirm this is the intended comparison.
    """
    # Negative predictions count as 0
    non_negative_samples = np.clip(y_samples, 0, a_max=None)

    true_obs = [obs_from_bins(row.array) for _, row in y_test.iterrows()]
    pred_obs = [obs_from_bins(row) for row in non_negative_samples]

    divergences = [
        entropy(true_obs[row_number], pred_obs[row_number])
        for row_number in range(len(y_test))
    ]
    # Cap infinite divergences at 1000 so they do not swamp the summary
    capped = [1000 if value == np.inf else value for value in divergences]
    display(pd.DataFrame(capped).describe())

Evaluate the spike model

In [None]:
# Compare the renormalized spike predictions with the true spike rows and display the summary
evaluate_fit(y_samples_spike_normalized, y_test_spike)

In [None]:
from matplotlib import pyplot as plt
# 2x2 grid: predicted vs. true output bins for four randomly chosen spike rows
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

for ax in axes.flatten():
    # Random positional index into the spike test rows
    i = np.random.randint(len(y_test_spike))
    test_pred = y_samples_spike[i]
    # Plain array so both curves are drawn against the same integer bin positions
    # instead of mixing the Series' column-name labels with a positional array
    true = y_test_spike.iloc[i].to_numpy()
    ax.plot(test_pred, color='blue', label='pred')
    ax.plot(true, color='orange', label='true')
    ax.xaxis.set_visible(False)
    ax.legend(loc="upper left")
    # Same argument order as evaluate_fit: entropy(true, predicted)
    e = entropy(true, test_pred)
    ax.set_title(f"entropy(true, pred) = {e:.4f}")

plt.show()

For the non-spiked outputs, run a separate model (an XGBoost regressor) to predict where the first 0 occurs, counting from the right

In [10]:
import xgboost as xgb
# Load first zero model
xgb_model_first_zero = xgb.XGBRegressor()
xgb_model_first_zero.load_model('xgb_model_first_zero.model')

# Only predict on output not at end (rows the first classifier labelled 0)
output_not_end_test_idx = y_test.index[preds_at_end == 0]
# .copy() makes this an independent frame, so adding a column below never writes
# through to (or warns about) a slice of X_test
X_test_not_end = X_test.loc[output_not_end_test_idx].copy()
y_test_not_end = y_test.loc[output_not_end_test_idx]

# Predict the first zero bins. Round because bins are whole numbers
preds_first_zero = np.rint(xgb_model_first_zero.predict(X_test_not_end))

# Add these as a predictor to the mdn model
X_test_not_end['Output_First_Zero'] = preds_first_zero
display(X_test_not_end.describe())

Unnamed: 0,R,Mstar,alpha,d2g,sigma,Tgas,Input_Bin_0,Input_Bin_1,Input_Bin_2,Input_Bin_3,...,Input_Bin_144,Input_Bin_145,Input_Bin_146,Input_Bin_147,Input_Bin_148,Input_Bin_149,Input_Bin_150,t,Delta_t,Output_First_Zero
count,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,...,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0,60409.0
mean,90.123679,1.0,0.021587,0.145886,221.723665,26.887871,0.005833,0.004432,0.003996,0.004463,...,0.000655,0.000457,0.000301,0.0002,0.000135,8.8e-05,0.00304,3391656000000.0,5254239000000.0,97.089371
std,116.789323,0.0,0.038233,0.326546,816.037189,27.815883,0.012172,0.00811,0.006812,0.008631,...,0.004715,0.003634,0.002807,0.002415,0.002258,0.002067,0.037897,5676912000000.0,7436204000000.0,30.359917
min,0.316228,1.0,1e-05,0.0001,0.152053,4.472136,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
25%,9.272498,1.0,0.0001,0.0001,1.213275,8.936554,3.5e-05,4.5e-05,5.3e-05,7.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79088680000.0,125835100000.0,74.0
50%,37.026011,1.0,0.001,0.01,7.545509,16.434123,0.000995,0.001008,0.001041,0.001288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,666350000000.0,1460484000000.0,95.0
75%,125.216008,1.0,0.01,0.1,60.20794,32.839884,0.004347,0.004517,0.004694,0.004877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3971115000000.0,7779626000000.0,121.0
max,500.0,1.0,0.1,1.0,9559.802528,177.827941,0.093878,0.056898,0.045007,0.389976,...,0.075148,0.076681,0.077753,0.092316,0.128576,0.165617,0.840638,31558150000000.0,31558150000000.0,161.0


In [20]:
# NOTE(review): `spike_preds` is not assigned in any visible cell (the spike cell produces
# `y_samples_spike` / `y_samples_spike_normalized`) — confirm where it comes from, otherwise
# this call fails on a fresh kernel.
evaluate_fit(spike_preds, y_test_spike)

Unnamed: 0,0
count,19660.0
mean,3.733988
std,1.047191
min,0.414306
25%,3.325009
50%,3.963877
75%,4.417032
max,7.878138


Create a separate MDN model to use on the rest.

In [None]:
# Note: Need to record the k used for sampling
k = 16
# Returns the predictions of the parameters of the distributions and weights
# TODO: Get non-spiked model (the path below is a placeholder)
# NOTE(review): the spiked model is loaded with custom_objects for the MDN layer and
# loss; this load may need the same — confirm once the real model is available.
non_spiked_model = tf.keras.models.load_model('saved_model/my_model')

# Predict with the model loaded above, on the non-spiked rows (X_test_not_end, which
# carries the Output_First_Zero predictor added for the MDN model)
preds_non_spike = non_spiked_model.predict(X_test_not_end)

# Draw 10 samples per prediction into the non-spike list
samples_list_non_spike = [
    np.apply_along_axis(mdn.sample_from_output, 1, preds_non_spike, bin_count, k, temp=1.0, sigma_temp=1.0)
    for _ in range(10)
]

# Average the samples for our predictions
y_samples_non_spike = np.mean(np.array(samples_list_non_spike), axis=0)
y_samples_non_spike = y_samples_non_spike.reshape(len(X_test_not_end), bin_count)
# Convert negatives to 0
y_samples_non_spike = np.clip(y_samples_non_spike, 0, a_max=None)

##TODO Apply 0s to preds from previous classifier

In [25]:
# NOTE(review): neither `non_spike_preds` nor `y_test_non_spike` is assigned in any visible
# cell (the visible non-spiked split is `y_test_not_end`) — confirm where they come from,
# otherwise this call fails on a fresh kernel.
evaluate_fit(non_spike_preds, y_test_non_spike)

Unnamed: 0,0
count,60374.0
mean,59.828494
std,40.482187
min,0.875193
25%,31.162486
50%,51.231835
75%,81.2418
max,200.490032
