In [2]:
import tensorflow as tf
import numpy as np
import mdn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
from timeit import default_timer as timer

bin_count = 151

In [3]:
def create_test_train(data_set_path, test_size=0.10):
    """ Splits a given csv file into testing and training. Target column is all the bins. Add a c """
    # Make sure the columns are set
    data_set = pd.read_csv(data_set_path)

    # Add column for classifying whether the output has most of the data in the last 10 bins.
    end_average = data_set[[f'Output_Bin_{i}' for i in range(bin_count-10, bin_count)]].sum(axis=1) > 0.9

    data_set['Output_Is_End'] = end_average
    data_set['Output_Is_End'] = data_set['Output_Is_End'].astype(int)
    #data_set['Output_Is_Not_End'] = ~data_set['Output_Is_End']

    # Shuffle the data
    data_set = data_set.sample(frac=1, random_state=0)
 
    # Select all except output bins
    data_set_X = data_set.drop([f'Output_Bin_{i}' for i in range(bin_count)] + ['Output_Is_End'], axis=1)
    # Select only the output bins
    data_set_Y = data_set[['Output_Is_End']]

    #Split into training and test data
    return train_test_split(data_set_X,
                            data_set_Y,
                            test_size=test_size, 
                            random_state=300)

#filename = "/scratch/keh4nb/dust_training_data_all_bins_large.csv"
filename= "/project/SDS-capstones-kropko21/uva-astronomy/dust_training_data_all_bins.csv"
X_train, X_test, y_train, y_test = create_test_train(filename, test_size=0.05)


Try xgboost for predicting whether all the particles are at the end of the distribution. Ended up getting over 99% test accuracy.

In [None]:
#%pip install xgboost
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", 
                              random_state=42)
xgb_model.fit(X_train, y_train,
             eval_set=[(X_train, y_train), (X_test, y_test)],
             eval_metric='error',verbose=True)

evals_result = xgb_model.evals_result()
#test_loss, test_acc = xgb_model.evaluate(X_test, y_test)

#print(test_acc)

In [13]:
preds = xgb_model.predict(X_test)

# Count the number of matches between predictions and labels
correct = np.sum(preds == y_test['Output_Is_End'])

# Calculate accuracy
accuracy = correct / len(y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9989254566809106


Example of hyperparameter tuning of Xgboost. Xgboost worked well on its own so this is not needed.
From: https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [None]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

Try to predict whether all the dust is in the last bins with a neural network

In [16]:
# Number of columns (bin count plus 8 inputs)
l = bin_count + 8

activation = 'selu'

# Network
input = tf.keras.Input(shape=(l,))

layer = tf.keras.layers.Dense(1024, activation='relu', name='baselayer')(input)
#layer_2 = tf.keras.layers.Dense(1024, activation='tanh', name='baselayer2')(layer)
#layer_3 = tf.keras.layers.Dense(16, activation='tanh', name='baselayer3')(layer_2)
#layer_4 = tf.keras.layers.Dense(64, activation=activation, name='baselayer4')(layer_3)
#layer_5 = tf.keras.layers.Dense(32, activation=activation, name='baselayer5')(layer_4)
# Connect the mdn layer to the output of our neural network
#mdn_layer = mdn.MDN(bin_count,k, name='mdn')(layer_5)
#model = tf.keras.models.Model(input, [mdn_layer])

# 2 classes
outputs = tf.keras.layers.Dense(1, activation='sigmoid', name="predictions")(layer)
model = tf.keras.models.Model(input, [outputs])


opt = tf.keras.optimizers.Adam(learning_rate=1e4)
#model.compile(loss=mdn.get_mixture_loss_func(bin_count,k), optimizer=tf.keras.optimizers.Adam())
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 159)]             0         
_________________________________________________________________
baselayer (Dense)            (None, 1024)              163840    
_________________________________________________________________
predictions (Dense)          (None, 1)                 1025      
Total params: 164,865
Trainable params: 164,865
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Add class weights for imbalanced data (~25% is at the end, 75% are not)
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced',
                                            np.unique(y_train),
                                            y_train["Output_Is_End"].tolist())

cweights = {0: weights[0], 1: weights[1]}
print(cweights)


fit = model.fit(x=X_train, y=y_train, batch_size=128, epochs=25, validation_split=0.1, class_weight=cweights, callbacks=[tf.keras.callbacks.TerminateOnNaN()])

{0: 0.6621194202979325, 1: 2.0420731183257774}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


Try to find the best tuning parameters for the neural network. The network gets stuck predicting everything as 0 (75% accuracy), but cannot improve on it.

In [None]:
#%pip install -U keras-tuner
import kerastuner as kt

def build_model(hp):
        # Number of columns (bin count plus 8 inputs)
    l = bin_count + 8

    activation = 'selu'

    # Network
    input = tf.keras.Input(shape=(l,))

    layer =  tf.keras.layers.Dense(hp.Int('hidden_size', 16, 256, step=16, default=128), activation=hp.Choice('activation', values=['relu', 'selu', 'tanh', 'swish'], default='relu'), name='baselayer')(input)
    #layer = tf.keras.layers.Dense(1024, activation='relu', name='baselayer')(input)
    #layer_2 = tf.keras.layers.Dense(1024, activation='tanh', name='baselayer2')(layer)
    #layer_3 = tf.keras.layers.Dense(16, activation='tanh', name='baselayer3')(layer_2)
    #layer_4 = tf.keras.layers.Dense(64, activation=activation, name='baselayer4')(layer_3)
    #layer_5 = tf.keras.layers.Dense(32, activation=activation, name='baselayer5')(layer_4)
    # Connect the mdn layer to the output of our neural network
    #mdn_layer = mdn.MDN(bin_count,k, name='mdn')(layer_5)
    #model = tf.keras.models.Model(input, [mdn_layer])

    # 2 classes
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name="predictions")(layer)
    model = tf.keras.models.Model(input, [outputs])


    #opt = tf.keras.optimizers.Adam(learning_rate=1e4)
    opt = tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', 1e-5, 1e-2, sampling='log'))
    #model.compile(loss=mdn.get_mixture_loss_func(bin_count,k), optimizer=tf.keras.optimizers.Adam())
    model.compile(optimizer=opt,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model

tuner = kt.tuners.BayesianOptimization(
  build_model,
  objective='val_accuracy',
  max_trials=50)

tuner.search(x=X_train, y=y_train, batch_size=128, epochs=30, validation_split=0.1, class_weight=cweights, callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)])

In [5]:
preds = model.predict(X_test)
display(pd.DataFrame(preds).describe())
display(y_test.describe())

Unnamed: 0,0
count,40017.0
mean,0.502865
std,0.064357
min,0.042015
25%,0.452332
50%,0.50993
75%,0.547307
max,0.598693


Unnamed: 0,Output_Is_End
count,40017.0
mean,0.243771
std,0.429362
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [93]:
tuner.results_summary()

In [95]:
# Retrieve the best model.
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model.
loss, accuracy = best_model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

























Test accuracy: 0.5297748446464539


In [6]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.5297748446464539
