# Galaxy Zoo as a training set for a Galaxy Classification neural network

In [None]:
import h5py
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
    
# this package
from astronn.data import fetch_GalaxyZoo

%matplotlib inline

### Load up Galaxy Zoo data - contains train and validation data

In [None]:
# Location of the Galaxy Zoo HDF5 file; the cells below open it with h5py.File(cache_file, 'r').
# NOTE(review): presumably fetch_GalaxyZoo downloads and caches the file on first use - confirm in astronn.data.
cache_file = fetch_GalaxyZoo()

In [None]:
def randomize(data, labels):
    """Shuffle ``data`` and ``labels`` with one shared random permutation.

    The permutation length is taken from ``labels.shape[0]``; both inputs are
    indexed along their first axis, so row ``k`` of the returned data still
    belongs to row ``k`` of the returned labels.

    Returns the tuple ``(shuffled_data, shuffled_labels)``.
    """
    order = np.random.permutation(labels.shape[0])
    return data[order], labels[order]

with h5py.File(cache_file, 'r') as f:
    train_group, test_group = f['train'], f['test']

    # Full training split, shuffled.
    train_dataset, train_labels = randomize(train_group['images'][:], train_group['labels'][:])

    # Validation and test sets are two disjoint 5000-row slices of the 'test' group
    # (rows 0-4999 and 5000-9999); the file's 'validate' group is not used here.
    valid_dataset, valid_labels = randomize(test_group['images'][:5000], test_group['labels'][:5000])
    test_dataset, test_labels = randomize(test_group['images'][5000:10000], test_group['labels'][5000:10000])


### Let's explore this awesome data

In [None]:
# Show a 4x6 grid of test-set images, each drawn at random (repeats are possible).
fig, axes = plt.subplots(4, 6, figsize=(8, 5.3), sharex=True, sharey=True)
with h5py.File(cache_file, 'r') as f:
    test_images = f['test']['images']
    n_test = test_images.shape[0]
    for panel in axes.flat:
        pick = np.random.randint(n_test)
        panel.imshow(test_images[pick], interpolation='nearest')
        

In [None]:
# hdf5 data structure
#
# Every print() below takes exactly one argument, so the output is the same
# whether or not print is a function (Python 3) or a statement (Python 2).

with h5py.File(cache_file, 'r') as f:
    # File-level information and top-level groups
    print(f.filename)
    print(f.name)
    print(f.keys())

    # Number of images in each split
    print("%s images in the training set" % f['train']['images'].shape[0])
    print("%s images in the testing set" % f['test']['images'].shape[0])
    print("%s images in the validation set" % f['validate']['images'].shape[0])

    # Layout of one split, using 'test' as the example
    print(f['test'].name)
    print(f['test'].keys())
    print(f['test']['metadata'].dtype.names)
    print(f['test']['images'].shape)
    print(f['test']['labels'].shape)

In [None]:
# One panel per class: the training-set label values of that class, sorted in
# ascending order, with counts of examples above 0.5 / 0.75 / 0.9 in the title.
fig, axes = plt.subplots(1, 6, figsize=(12, 2), sharex=True, sharey=True)

with h5py.File(cache_file, 'r') as f:
    # Read the whole label table from disk once, up front, instead of once per
    # class.  float64 keeps the threshold comparisons below in double precision.
    train_label_table = np.array(f['train']['labels'], dtype=np.float64)

n_images = train_label_table.shape[0]

# Row i holds the sorted label values of class i (first 6 label columns).
gal_class = np.sort(train_label_table.T[:6], axis=1)

# Common x axis: rank of each example expressed as a fraction of the set.
rank_fraction = np.arange(n_images) / float(n_images)

for i in range(6):
    ax = axes.flat[i]
    ax.set_xticks([0.2, 0.5, 0.8])

    class_values = gal_class[i]
    n_50pct = int(np.count_nonzero(class_values > 0.5))
    n_75pct = int(np.count_nonzero(class_values > 0.75))
    n_90pct = int(np.count_nonzero(class_values > 0.9))

    ax.plot(rank_fraction, class_values)
    ax.set_title(str(i) + "\nP>50%:" + str(n_50pct) + "\n" + "P>75%:" + str(n_75pct) + "\n" + "P>90%:" + str(n_90pct))

plt.show()

### Let's create a refined training set that includes only galaxies with a classification > 50%

In [None]:
# Grid of 6 rows (one per class) by 5 columns: for each class, the five
# test-set images whose label value for that class is largest.
n_classes, n_top = 6, 5
fig, axes = plt.subplots(n_classes, n_top, figsize=(8, 9), sharex=True, sharey=True)
tick_positions = [0, 10, 20, 30]

for cls in range(n_classes):
    # argsort is ascending, so the last n_top indices are the highest-valued ones
    best = np.argsort(test_labels[:, cls])[-n_top:]

    for col in range(n_top):
        panel = axes[cls, col]
        panel.imshow(test_dataset[best[col]], interpolation='nearest')
        panel.set_xticks(tick_positions)
        panel.set_yticks(tick_positions)
        # Label each row with its class index on the left-most panel only
        if col == 0:
            panel.set_ylabel(str(cls))


In [None]:
# Build the training subset from the 1500 highest-valued examples of each of
# the 6 classes.  No de-duplication is done: an example that ranks in the top
# 1500 of several classes is included once for each of them.
per_class_top = [np.argsort(train_labels[:, k])[-1500:] for k in range(6)]
args = np.concatenate(per_class_top)

# Shuffle so that the subset is no longer grouped class by class.
train_subdata, train_sublabels = randomize(train_dataset[args], train_labels[args])

# Single-argument print() calls give the same output on Python 2 and Python 3.
print(train_subdata.shape)
print(train_sublabels.shape)

### Set all labels to highest likelihood class

In [None]:
def orthogonal_basis(array_in):
    """Replace every row of a 2-D array by a one-hot vector at its maximum.

    For each row, the column holding the largest value (the first one on
    ties, as returned by ``np.argmax``) is set to 1 and all others to 0.
    The result is a new float array with the same shape as ``array_in``.
    """
    winners = np.argmax(array_in, axis=1)
    columns = np.arange(array_in.shape[1])
    # Broadcast (n_rows, 1) against (n_cols,) to mark the winning column per row
    return (columns == winners[:, None]).astype(float)
    

# Collapse the label rows of all three sets to one-hot vectors at their
# highest-valued class (processed in the order: training subset, test, validation).
train_sublabels, test_labels, valid_labels = [
    orthogonal_basis(label_set)
    for label_set in (train_sublabels, test_labels, valid_labels)
]



### Now we reformat: flatten the images (the labels are already one-hot)

In [None]:
# Each flattened image row holds image_size * image_size values.
image_size = 32
# Width of a label row (number of classes).
num_labels = 6


def reformat(dataset, labels):
    """Flatten the images to float32 rows; hand the labels back untouched.

    ``dataset`` is reshaped to ``(-1, image_size * image_size)`` and cast to
    ``np.float32``.  ``labels`` is returned as-is: no one-hot conversion
    happens here (the notebook does that beforehand with ``orthogonal_basis``).

    Returns the tuple ``(flattened_dataset, labels)``.
    """
    row_length = image_size * image_size
    flattened = dataset.reshape((-1, row_length)).astype(np.float32)
    return flattened, labels

# Flatten the images of all three sets (labels pass through reformat unchanged).
train_subdata, train_sublabels = reformat(train_subdata, train_sublabels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the resulting shapes.  Each print() gets one pre-formatted string:
# a multi-argument print(...) would show a tuple repr when print is a statement.
print('Training set %s %s' % (train_subdata.shape, train_sublabels.shape))
print('Validation set %s %s' % (valid_dataset.shape, valid_labels.shape))
print('Test set %s %s' % (test_dataset.shape, test_labels.shape))

### Let's now build our neural network

In [None]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max class matches the label's.

    Both arguments are 2-D with one row per example; a row counts as correct
    when ``np.argmax`` along axis 1 gives the same column for the prediction
    and for the label.  The count is divided by ``predictions.shape[0]``.
    """
    predicted_class = np.argmax(predictions, 1)
    true_class = np.argmax(labels, 1)
    n_correct = np.sum(predicted_class == true_class)
    return 100.0 * n_correct / predictions.shape[0]

In [None]:
# Number of examples per training minibatch; run_graph uses it as the first
# dimension of its training placeholders and as the slice length per step.
batch_size = 512
# Same values as set earlier next to reformat(); run_graph reads these globals.
image_size = 32
num_labels = 6


def run_graph(num_steps=1001, beta=1.0e-4, offset_flag=False, step_graph=False, dropout_flag=False):
    """Train a one-hidden-layer network on the training subset and return its test accuracy.

    Architecture: flattened image -> 1024 ReLU units (optionally followed by
    dropout) -> ``num_labels`` logits.  The loss is the mean softmax
    cross-entropy plus ``beta * tf.nn.l2_loss`` of the output-layer weights,
    minimised by gradient descent with a fixed learning rate of 0.5.

    Reads the module-level arrays ``train_subdata``/``train_sublabels``,
    ``valid_dataset``/``valid_labels`` and ``test_dataset``/``test_labels``,
    and the constants ``batch_size``, ``image_size`` and ``num_labels``.
    Every 100 steps the minibatch loss, minibatch accuracy and validation
    accuracy are printed.

    Parameters
    ----------
    num_steps : int
        Number of minibatch gradient steps to run.
    beta : float
        Weight of the L2 penalty on the output-layer weights.
    offset_flag : bool
        If True, every minibatch starts at a randomly drawn multiple of
        ``batch_size``; otherwise minibatches step through the training
        subset in order and wrap around.
    step_graph : bool
        If True, plot minibatch accuracy (black) and validation accuracy
        (red) against the training step once training has finished.
    dropout_flag : bool
        If True, apply dropout with keep probability 0.5 to the hidden
        layer while training.

    Returns
    -------
    float
        Accuracy on the test set, in percent.
    """
    n_pixels = image_size * image_size
    n_hidden = 1024

    graph = tf.Graph()
    with graph.as_default():

        # Input data. The training minibatch is fed at run time through
        # placeholders; validation and test data are baked in as constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, n_pixels))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)
        # Single keep-probability placeholder; it is always fed, but only
        # wired into the graph when dropout_flag is set.
        keep_prob = tf.placeholder(tf.float32)

        # Image -> hidden layer
        W_fc1 = tf.Variable(tf.truncated_normal([n_pixels, n_hidden], stddev=0.1))
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[n_hidden]))
        h_fc1 = tf.nn.relu(tf.matmul(tf_train_dataset, W_fc1) + b_fc1)

        # Optional dropout on the hidden activations (training path only)
        h_fc1_out = tf.nn.dropout(h_fc1, keep_prob) if dropout_flag else h_fc1

        # Hidden layer -> class logits
        W_fc2 = tf.Variable(tf.truncated_normal([n_hidden, num_labels], stddev=0.1))
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[num_labels]))
        logits = tf.matmul(h_fc1_out, W_fc2) + b_fc2

        # Loss: cross-entropy plus L2 penalty on the output weights.
        # Keyword arguments are used because TensorFlow >= 1.0 refuses
        # positional arguments for this op.
        loss_xentropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
        loss_l2 = tf.nn.l2_loss(W_fc2)
        loss = loss_xentropy + beta * loss_l2

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Predictions for the training minibatch
        train_prediction = tf.nn.softmax(logits)

        # Predictions for the validation set (no dropout, no placeholders involved)
        h_valid1 = tf.nn.relu(tf.matmul(tf_valid_dataset, W_fc1) + b_fc1)
        valid_prediction = tf.nn.softmax(tf.matmul(h_valid1, W_fc2) + b_fc2)

        # Predictions for the test set (no dropout, no placeholders involved)
        h_test1 = tf.nn.relu(tf.matmul(tf_test_dataset, W_fc1) + b_fc1)
        test_prediction = tf.nn.softmax(tf.matmul(h_test1, W_fc2) + b_fc2)

    # Accuracy history, one entry per logged step
    step_x = []
    minibatch_accuracy = []
    validation_accuracy = []

    # Offsets are wrapped so that a full minibatch always fits
    offset_modulus = train_sublabels.shape[0] - batch_size

    with tf.Session(graph=graph) as session:
        # NOTE(review): pre-1.0 initializer name, kept for the TF version this
        # notebook targets; later 1.x releases offer tf.global_variables_initializer.
        tf.initialize_all_variables().run()
        print("Initialized, beta = {}".format(beta))

        for step in range(num_steps):

            # Pick an offset within the (already shuffled) training subset.
            if offset_flag:
                # Draw a scalar, not a 1-element array: arrays are not valid
                # slice bounds on current NumPy.
                offset = (int(np.random.randint(20)) * batch_size) % offset_modulus
            else:
                offset = (step * batch_size) % offset_modulus

            # Generate a minibatch.
            batch_data = train_subdata[offset:(offset + batch_size), :]
            batch_labels = train_sublabels[offset:(offset + batch_size), :]

            # Feed the minibatch and the training keep probability.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, keep_prob: 0.5}
            _, batch_loss, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

            if step % 100 == 0:
                batch_acc = accuracy(predictions, batch_labels)
                # The validation path depends only on constants and variables,
                # so it needs no feed and is evaluated once per logged step.
                valid_acc = accuracy(valid_prediction.eval(), valid_labels)

                print("Minibatch loss at step %d: %f" % (step, batch_loss))
                print("Minibatch accuracy: %.1f%%" % batch_acc)
                print("Validation accuracy: %.1f%%" % valid_acc)

                step_x.append(step)
                minibatch_accuracy.append(batch_acc)
                validation_accuracy.append(valid_acc)

        # Like validation, the test path needs no feed; this also works when
        # num_steps is 0 and the loop body never ran.
        acc_out = accuracy(test_prediction.eval(), test_labels)

        if step_graph:
            plt.plot(step_x, minibatch_accuracy, color='k')
            plt.plot(step_x, validation_accuracy, color='r')
            plt.show()

    return acc_out

In [None]:
# Single training run with beta=0.01 (run_graph's default is 1e-4); the returned
# value is the test-set accuracy in percent.
run_graph(beta=0.01)

In [None]:
# Sweep the L2 weight over 20 logarithmically spaced values from 1e-4 to 1,
# training a fresh network for each one and recording its test accuracy.
beta = np.power(10.0, np.linspace(-4.0, 0.0, 20))
acc = np.array([run_graph(beta=b) for b in beta])

# Test accuracy as a function of beta, on a logarithmic beta axis
plt.plot(beta, acc, color='k')
plt.xscale('log')
plt.ylabel("test set accuracy")
plt.xlabel(r"$\beta$")
plt.show()
    
