In [322]:
import numpy as np
import matplotlib.pyplot as plt
from __future__ import absolute_import, division, print_function

# TensorFlow and tf.keras
import tensorflow as tf
processed_data = np.load('Full_array.npy')

from tensorflow import keras

import pandas as pd

#The size of the dataset is 10,000; but I only filled the first 9166 with values, the rest are zeros. (shouldn't be counted)
number = 9166
processed_data = processed_data[0:number]

np.random.seed(1)
tf.set_random_seed(1)

In [323]:
#we need to push accuracy to over 90%

#idea number 1: run a set of high percentage accuracy classifications through the NN

#idea number 2: clean the training set by going through it and deleting the bad examples from my data acquisition. We're going to
#set all arrays of zeros to be class A. If I feed this NN nothing, I want it to tell me it is odd.

#first lets see what doing number 2 will get us.

# Scratch check of the "is this array entirely zero?" test used on the images:
# the message is printed only when no element of `what` is non-zero.
what = np.array([0, 0, 1])
if not what.any():
    print('This works')

In [324]:
# ---- hyperparameters ----

# Data selection
HP_allowed_confidence = 0.85  # minimum confidence for the high-confidence subset
test_train_split = 0.85       # fraction of the examples placed in the training set

# Optimisation
batch_size, epoch_number = 64, 15
learning_rate, dropout_rate = 0.0025, 0.3

# First convolution / max-pool stage
CNL1_filters, CNL1_kernal_size = 32, 5
MPL1_pool_size, MPL1_strides = (2, 2), 2

# Second convolution / max-pool stage
CNL2_filters, CNL2_kernal_size = 64, 5
MPL2_pool_size, MPL2_strides = (2, 2), 2

# The example count is not a round number, so round the split point to a whole index.
train_split_indice = int(np.round(number * test_train_split))

In [325]:
# Read the first `number` rows of the target table; the targets are the
# string labels stored under "gz2class".
galaxyzoo = pd.read_csv(
    "zoo2MainSpecz.csv/zoo2MainSpecz.csv",
    usecols=[2, 3, 4, 8, 15, 21, 27],
    nrows=number,
)

# Class label and sky coordinates, one entry per row.
Class, RA, DEC = (
    galaxyzoo[column].values for column in ("gz2class", 'ra', 'dec')
)

# Debiased t01 columns: features-or-disk, smooth, and star-or-artifact.
Spiral, Elliptical, Anythingelse = (
    galaxyzoo[column].values
    for column in (
        't01_smooth_or_features_a02_features_or_disk_debiased',
        't01_smooth_or_features_a01_smooth_debiased',
        't01_smooth_or_features_a03_star_or_artifact_debiased',
    )
)

In [326]:
# Integer class id for each possible first character of a Class label
# (integer targets, as in the MNIST example code).
dictionary = dict(A=2, E=1, S=0)

In [327]:
# Encode every label as the integer id of its first character; the ids are
# stored in a float array, one entry per label.
target = np.array([dictionary[label[0]] for label in Class], dtype=float)

In [328]:
# Any image that is entirely zero gets its target overwritten with 'A' = 2.
blank_rows = [row for row in range(len(target)) if not processed_data[row].any()]
target[np.asarray(blank_rows, dtype=int)] = 2

In [329]:
# Cut targets and images at the same index: rows before `train_split_indice`
# form the training set, rows from there up to `number` form the test set.
train_target, test_target = target[:train_split_indice], target[train_split_indice:number]
train_images, test_images = processed_data[:train_split_indice], processed_data[train_split_indice:number]

# Shape of a single training image.
train_images[1].shape

(28, 28)

In [330]:
# Build the high-confidence ("HP") subset of the training examples: those whose
# confidence in their assigned class is at least HP_allowed_confidence. The
# subset is fitted separately before the main fit, and the per-example
# confidences are reused as sample weights.

# Confidence column to consult for each integer class id (0 = 'S', 1 = 'E', 2 = 'A').
# Class 0 reads `Spiral`: the name `Spherical` used previously is never defined
# in this notebook and raises NameError on a fresh kernel.
confidence_by_class = {0: Spiral, 1: Elliptical, 2: Anythingelse}

# NOTE(review): examples relabelled to class 2 because their image is all zeros
# take their weight from `Anythingelse` -- confirm that is the intended weight.
confidence = [
    confidence_by_class[int(train_target[i])][i]
    for i in range(len(train_images))
]
HP_indice = [i for i, value in enumerate(confidence) if value >= HP_allowed_confidence]
train_weights = np.asarray(confidence)

# Gather the selected rows as float arrays (an explicit integer index array
# keeps this valid even when no example passes the threshold).
hp_rows = np.asarray(HP_indice, dtype=int)
HP_target = train_target[hp_rows].astype(float)
HP_images = train_images[hp_rows].astype(float)

# Add the trailing channel axis expected by the Conv2D input layer. The test
# set has (number - train_split_indice) rows; the previous expression had the
# operands swapped, which produced a negative row count.
HP_images = HP_images.reshape(len(HP_indice), 28, 28, 1)
train_images = train_images.reshape(train_split_indice, 28, 28, 1)
test_images = test_images.reshape(number - train_split_indice, 28, 28, 1)
print(np.shape(train_images))
print(np.shape(HP_images))

(7791, 28, 28, 1)
(865, 28, 28, 1)


In [331]:

#define my model, using a CNN with 2 Convolutional layers, 2 max pool layers, 1 dense layer, 1 drop out layer, and another dense layer 
def create_model(dropout_rate, learning_rate):
    """Build and compile the CNN classifier.

    The network is two Conv2D + MaxPool2D stages, a flatten step, a
    1024-unit ReLU dense layer, dropout, and a 3-unit softmax output.
    Filter counts, kernel sizes, pool sizes and strides are read from the
    module-level hyperparameters (CNL1_*, MPL1_*, CNL2_*, MPL2_*).

    Args:
        dropout_rate: rate passed to the Dropout layer.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        The compiled keras.Sequential model (sparse categorical
        cross-entropy loss, accuracy metric).
    """
    model = keras.Sequential([
        keras.layers.Conv2D(input_shape=(28, 28, 1), filters=CNL1_filters,
                            kernel_size=CNL1_kernal_size, padding="same",
                            activation=tf.nn.relu),
        keras.layers.MaxPool2D(pool_size=MPL1_pool_size, strides=MPL1_strides),
        keras.layers.Conv2D(filters=CNL2_filters, kernel_size=CNL2_kernal_size,
                            padding="same", activation=tf.nn.relu),
        keras.layers.MaxPool2D(pool_size=MPL2_pool_size, strides=MPL2_strides),
        # Flatten instead of a hard-coded Reshape([7*7*64]): the flattened size
        # then follows the filter/pool hyperparameters instead of silently
        # assuming 64 filters on a 7x7 feature map.
        keras.layers.Flatten(),
        keras.layers.Dense(units=1024, activation=tf.nn.relu),
        keras.layers.Dropout(rate=dropout_rate),
        keras.layers.Dense(units=3, activation=tf.nn.softmax),
    ])
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model



In [332]:
# Build the network and fit it on the high-confidence subset first
# (currently 2 epochs; the epoch count here is a guess to be tuned).
model = create_model(dropout_rate=dropout_rate, learning_rate=learning_rate)
model.fit(
    HP_images,
    HP_target,
    epochs=2,
    batch_size=batch_size,
    verbose=1,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2b5a1a76ef0>

In [333]:
# Continue training the same model on the full training set, weighting each
# example by its confidence (sample_weight = train_weights). Note: using the
# weights did not increase accuracy.
model.fit(
    train_images,
    train_target,
    epochs=epoch_number,
    batch_size=batch_size,
    verbose=1,
    shuffle=True,
    sample_weight=train_weights,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2b5a1a76c50>

In [334]:
# Score the trained model on the held-out test examples.
held_out_images = test_images.reshape(number - train_split_indice, 28, 28, 1)
test_loss, test_acc = model.evaluate(held_out_images, test_target)
print('Test accuracy:', test_acc)
# Previously recorded results:
#   0.848 with the all-zero images fixed
#   0.850 with the HP dataset implemented


Test accuracy: 0.718545454675501
