In [1]:
import numpy as np
import pandas as pd
import keras.backend as K
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Reshape, Flatten, Dropout, Activation
from keras.models import Model, Sequential, load_model
from keras.layers.normalization import BatchNormalization
from keras import optimizers, callbacks
from skimage.transform import resize
from imageio import imread
from imgaug import augmenters as iaa

  return f(*args, **kwds)
  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path_data = '../all/'

In [3]:
def preprocess_input(np_img):
    np_img = np_img / 255
    
    # randome data augmentation
    aug = np.random.choice([0, 1])
    if aug == True:
        seq = iaa.Sequential([iaa.Fliplr(0.5), iaa.Flipud(0.5)])
        np_img = seq.augment_image(np_img)
        list_zoom = [1.4, 1.3, 1.2, 1.5]
        # Zoom out randomly
        if np.random.choice([True, False]):
            random_x_f = np.random.choice(list_zoom)
            random_x_s = np.random.choice(list_zoom)
            random_y_f = np.random.choice(list_zoom)
            random_y_s = np.random.choice(list_zoom)
            seq = iaa.Sequential([iaa.Affine(scale={"x": (random_x_f, random_x_s), "y": (random_y_f, random_y_s)})])
            np_img = seq.augment_image(np_img)
        
    return np_img

In [4]:
def generator(path_images, labels, batch_size, val=False):
    count = 0
    while True:
        batch_features = []
        batch_labels = []
        for i in range(batch_size):
            # Read image
            name_img = labels[count][0] + '.jpg'
            img = imread(path_images + '/' + name_img)
            
            # Image preprocessing
            if val:
                img = img / 255
            else:
                img = preprocess_input(img)

            label = labels[count][1:]

            batch_features.append(img)
            batch_labels.append(label)
            count += 1
            
            # Restart counter when it has reached the size 
            # of the data set
            if count == labels.shape[0] - 1:
                count = 0
                break
            
        yield np.array(batch_features), np.array(batch_labels)
            

In [5]:
def generator_predictions(path_images, list_names, batch_size):
    size_list = list_names.shape[0] - 1
    count = 0
    while True:
        batch_features = []
        for i in range(batch_size):
            # Read image
            name_img = list_names[count]
            img = imread(path_images + '/' + name_img)
            
            # Image preprocessing
            img = img / 255

            batch_features.append(img)
            
            # Restart counter when it has reached the size 
            # of the data set
            if count == size_list:
                count = 0
                yield np.array(batch_features)
            count += 1
            
        yield np.array(batch_features)

In [6]:
def galaxy_cnn(input_size, output_size):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',input_shape=input_size))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(128, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(256, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(512, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(2048, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2048, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation='sigmoid'))
    return model

In [33]:
def galaxy_cnn_2(input_size, output_size):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',input_shape=input_size))
    model.add(Conv2D(32, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(3, 3)))
    model.add(Conv2D(64, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(128, kernel_size=(3, 3)))
    model.add(Conv2D(128, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(256, kernel_size=(3, 3)))
    model.add(Conv2D(256, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(512, kernel_size=(3, 3)))
    model.add(Conv2D(512, kernel_size=(3, 3)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(2048, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2048, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation=None))
    return model

In [7]:
def rmse (y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [9]:
labels_train = np.load(path_data + 'y_train.npy')
labels_test = np.load(path_data + 'y_test.npy')
labels_val = np.load(path_data + 'y_val.npy')

In [10]:
batch_size = 32
steps = np.ceil(43104 // batch_size + 1)
steps_va = np.ceil(9237 // batch_size + 1)
epochs = 50

In [22]:
# Set architecture
learning_rate = 0.0001
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn((160, 160, 3), 37)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])
tbCallBack = callbacks.TensorBoard(log_dir=path_data + './Graph/model_1', histogram_freq=0, write_graph=True, write_images=False)
# Train the model
model.fit_generator(generator(path_data + 'preprocess_img_train', labels_train, batch_size=batch_size), 
        steps_per_epoch=steps, 
        epochs=epochs, 
        validation_data= generator(path_data + 'preprocess_img_val', labels_val, batch_size=batch_size),
        validation_steps=steps_va, callbacks=[tbCallBack])
# Evaluation over validation test
evaluation = model.evaluate_generator(generator(path_data + 'preprocess_img_test', labels_test, batch_size=batch_size), steps=steps)
print('Evaluation:', evaluation)
# Save model after training
model.save(path_data + 'weights/model_1.h5')
# Clear session to free memory
K.clear_session()

Evaluation: [0.008220546006904953, 0.09034255245887919]


In [24]:
# Set architecture
learning_rate = 0.0001
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn_2((160, 160, 3), 37)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])
tbCallBack = callbacks.TensorBoard(log_dir=path_data + './Graph/model_2', histogram_freq=0, write_graph=True, write_images=False)
# Train the model
model.fit_generator(generator(path_data + 'preprocess_img_train', labels_train, batch_size=batch_size), 
        steps_per_epoch=steps, 
        epochs=epochs, 
        validation_data= generator(path_data + 'preprocess_img_val', labels_val, batch_size=batch_size),
        validation_steps=steps_va, callbacks=[tbCallBack])
# Evaluation over validation test
evaluation = model.evaluate_generator(generator(path_data + 'preprocess_img_test', labels_test, batch_size=batch_size), steps=steps)
print('Evaluation:', evaluation)
# Save model after training
model.save(path_data + 'weights/model_2.h5')
# Clear session to free memory
K.clear_session()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluation: [0.008391103329779522, 0.0912702066842515]


In [8]:
labels_train_test = np.load(path_data + 'y_train_test.npy')
labels_val = np.load(path_data + 'y_val.npy')

In [9]:
labels_val[0]

array(['100122', 0.7388319999999999, 0.238159, 0.023009, 0.0, 0.238159,
       0.0, 0.238159, 0.0, 0.238159, 0.0, 0.0, 0.238159, 0.0, 0.19793,
       0.80207, 0.066806667, 0.663691308, 0.008334764, 0.0, 0.0,
       0.0494825, 0.098965, 0.0494825, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=object)

In [10]:
batch_size = 32
steps = np.ceil(52341 // batch_size + 1)
steps_va = np.ceil(9237 // batch_size + 1)
epochs = 50

In [11]:
# Set architecture
learning_rate = 0.0001
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn((160, 160, 3), 37)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])
tbCallBack = callbacks.TensorBoard(log_dir=path_data + './Graph/model_1_train_test', histogram_freq=0, write_graph=True, write_images=False)
# Train the model
model.fit_generator(generator(path_data + 'train_test_img', labels_train_test, batch_size=batch_size), 
        steps_per_epoch=steps, 
        epochs=epochs, 
        validation_data= generator(path_data + 'preprocess_img_val', labels_val, batch_size=batch_size, val=True),
        validation_steps=steps_va, callbacks=[tbCallBack])

# Save model after training
model.save(path_data + 'weights/model_1_train_test.h5')
# Clear session to free memory
K.clear_session()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [12]:
# Set architecture
learning_rate = 0.0001
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn_2((160, 160, 3), 37)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])
tbCallBack = callbacks.TensorBoard(log_dir=path_data + './Graph/model_2_none', histogram_freq=0, write_graph=True, write_images=False)
# Train the model
model.fit_generator(generator(path_data + 'preprocess_img_train', labels_train, batch_size=batch_size), 
        steps_per_epoch=steps, 
        epochs=epochs, 
        validation_data= generator(path_data + 'preprocess_img_val', labels_val, batch_size=batch_size),
        validation_steps=steps_va, callbacks=[tbCallBack])
# Evaluation over validation test
evaluation = model.evaluate_generator(generator(path_data + 'preprocess_img_test', labels_test, batch_size=batch_size), steps=steps)
print('Evaluation:', evaluation)
# Save model after training
model.save(path_data + 'weights/model_2_none.h5')
# Clear session to free memory
K.clear_session()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Evaluation: [0.008414974405298497, 0.09142655692787097]


In [12]:
labels_all_data = np.load(path_data + 'y_all_data.npy')

In [14]:
labels_all_data.shape

(61578, 38)

In [15]:
batch_size = 32
steps = np.ceil(61578 // batch_size + 1)
epochs = 25

In [16]:
# Set architecture
learning_rate = 0.0001
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn((160, 160, 3), 37)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])
tbCallBack = callbacks.TensorBoard(log_dir=path_data + './Graph/model_1_all_data', histogram_freq=0, write_graph=True, write_images=False)
# Train the model
model.fit_generator(generator(path_data + 'train_test_img', labels_train_test, batch_size=batch_size), 
        steps_per_epoch=steps, 
        epochs=epochs, 
        callbacks=[tbCallBack])

# Save model after training
model.save(path_data + 'weights/model_1_all_data.h5')
# Clear session to free memory
K.clear_session()

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## Submission

In [8]:
test_sub = pd.read_csv(path_data + 'all_zeros_benchmark.csv')

In [9]:
name_test_sub = test_sub['GalaxyID'].apply(lambda x: str(x) + '.jpg').values

In [12]:
# Set architecture
learning_rate = 0.0001
epochs = 50
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn((160, 160, 3), 37)
model.load_weights(path_data + 'weights/' + 'model_1_train_test.h5')
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])

In [13]:
batch_size_sub = 32
steps_sub = np.ceil(79975 // batch_size_sub + 1)

In [14]:
predictions = model.predict_generator(generator_predictions(path_data + 'sub_test_img', name_test_sub, batch_size=batch_size_sub), steps=steps_sub, verbose=1)



In [16]:
results = test_sub.copy()
results.head()
columns = results.columns[1:]

In [17]:
for count in range(predictions.shape[1]):
    column = columns[count]
    results[column] = predictions[:, count]

In [18]:
results.head()

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,100018,0.346925,0.643762,0.011634,0.044182,0.603001,0.200188,0.407515,0.183076,0.44961,...,0.006868,0.106377,0.060244,0.019532,0.005925,0.057141,0.003561,0.001952,0.003695,0.109736
1,100037,0.414265,0.586715,0.006081,0.514483,0.078817,0.009039,0.064192,0.016148,0.059729,...,0.355658,0.00942,0.003744,0.002091,0.000113,0.002882,2.2e-05,3.1e-05,7.6e-05,0.010653
2,100042,0.647536,0.339018,0.012971,0.233329,0.106007,0.022397,0.079855,0.023471,0.084502,...,0.143508,0.010715,0.005235,0.004539,0.000299,0.004889,2.6e-05,3.1e-05,0.000169,0.014631
3,100052,0.546672,0.42769,0.032136,0.00486,0.417123,0.010465,0.40266,0.04602,0.381574,...,0.000189,0.024307,0.012805,0.004848,0.002994,0.005092,0.000856,0.000592,0.002663,0.030919
4,100056,0.214191,0.788704,0.003178,0.759802,0.018314,0.00285,0.015672,0.00601,0.01115,...,0.520035,0.002748,0.001443,0.000921,1.4e-05,0.001508,4e-06,8e-06,9e-06,0.002709


In [19]:
# Save results
results.to_csv(path_data + 'results_train_test.csv', index=False)

## Last submission

In [21]:
test_sub = pd.read_csv(path_data + 'all_zeros_benchmark.csv')

In [22]:
name_test_sub = test_sub['GalaxyID'].apply(lambda x: str(x) + '.jpg').values

In [23]:
# Set architecture
learning_rate = 0.0001
epochs = 25
decay_rate = learning_rate / epochs
opt = optimizers.Adam(lr=learning_rate, decay=decay_rate)
model = galaxy_cnn((160, 160, 3), 37)
model.load_weights(path_data + 'weights/' + 'model_1_all_data.h5')
model.compile(loss='mean_squared_error', optimizer=opt, metrics=[rmse])

In [24]:
batch_size_sub = 32
steps_sub = np.ceil(79975 // batch_size_sub + 1)

In [25]:
predictions = model.predict_generator(generator_predictions(path_data + 'sub_test_img', name_test_sub, batch_size=batch_size_sub), steps=steps_sub, verbose=1)



In [26]:
results = test_sub.copy()
results.head()
columns = results.columns[1:]

In [27]:
for count in range(predictions.shape[1]):
    column = columns[count]
    results[column] = predictions[:, count]

In [28]:
results.head()

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,100018,0.498929,0.484521,0.01552,0.029538,0.457886,0.121249,0.337303,0.152857,0.308163,...,0.004015,0.09383,0.046974,0.013948,0.005331,0.040953,0.002538,0.001344,0.002983,0.101939
1,100037,0.444095,0.544365,0.016311,0.399661,0.177579,0.03488,0.141534,0.063909,0.120789,...,0.309674,0.031725,0.016574,0.011426,0.001322,0.013688,0.000345,0.000444,0.000825,0.044672
2,100042,0.65365,0.330237,0.010269,0.250757,0.08255,0.01407,0.057086,0.015416,0.059824,...,0.134382,0.007916,0.003082,0.002554,0.000154,0.002643,1.4e-05,2.8e-05,0.000151,0.010567
3,100052,0.54637,0.432853,0.02633,0.007645,0.428913,0.026269,0.410655,0.097361,0.349402,...,0.000475,0.053577,0.032062,0.012592,0.008409,0.018628,0.003103,0.001431,0.003497,0.058519
4,100056,0.175571,0.821594,0.006239,0.776542,0.035632,0.008878,0.030479,0.017929,0.020094,...,0.448413,0.007156,0.004766,0.004248,0.000101,0.007112,7.1e-05,0.000133,0.000167,0.008859


In [29]:
# Save results
results.to_csv(path_data + 'results_all_dataset.csv', index=False)