In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

import imageio # for opening the images into np arrays

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten # Activation
from keras.callbacks import ModelCheckpoint

from tensorflow import set_random_seed

%matplotlib inline

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# for getting reproducible results
random_seed = 112
np.random.seed(random_seed)

### Read in Ids and Labels:

In [5]:
data_filepath = '~/data/protein_data/protein_data/'
df = pd.read_csv(data_filepath + 'clean_train.csv')

df.drop(columns = 'Target', inplace=True) # not needed since it's been one-hot encoded

### Take a Sample of all Training Image Ids

In [7]:
# Slice size:
sample_size = 1500

# Slice out the portion of the training 'Id' to work with
sample = df.sample(n=sample_size, random_state=random_seed)

In [8]:
# Make X from Ids
X = sample['Id']

# make y from labels
y = sample.iloc[:, 1:]

### Train-Test Split:

In [9]:
X_train_id, X_test_id, y_train, y_test = train_test_split(X, y, random_state=random_seed, test_size=.20)

### Load Images:

In [12]:
def load_img_slice(id_slice):
# This function builds and returns an array of images out of a series of Ids
    i = 0
    
    # Make empty X_array to fill with image data
    X_array = np.empty((len(id_slice), 512, 512, 3))

    # for each Id
    for img in id_slice:

        # Load image
        X_array[i,:,:,:] = imageio.imread(data_filepath+'rgb_images/'+img+'_rgb.png')
        i += 1

    return X_array

In [13]:
# Call the function to load the images
X_train = load_img_slice(X_train_id)
X_test = load_img_slice(X_test_id)

### Neural Network:

In [14]:
# Compute class_weights to pass into the model
# protein_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

In [19]:
# Instantiate model
model = Sequential()

# Input layer
model.add(Conv2D(filters = 10,                   # number of filters
                        kernel_size = 5,        # height/width of filter
                        activation='relu',      # activation function 
                        input_shape=(X_train.shape[1],X_train.shape[2],X_train.shape[3]))) # shape of input (image)
model.add(MaxPooling2D(pool_size=(2,2)))

# 2
model.add(Conv2D(10, kernel_size = 5, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

# 3
model.add(Conv2D(5, kernel_size = 3, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

# 4
model.add(Conv2D(5, kernel_size = 3, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

# # 5
# model.add(Conv2D(5, kernel_size = 5, activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))

# model.add(Dropout(0.25))

# Flatten convolutional layers for fully connected layers
model.add(Flatten())

# Fully connected layers
# 1
model.add(Dense(3000, activation='relu'))
model.add(Dropout(0.5))
# 2
model.add(Dense(3000, activation='relu'))
model.add(Dropout(0.5))
# 3
model.add(Dense(3000, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# saves the model weights after each epoch if the validation loss score decreased
checkpointer = ModelCheckpoint(filepath='../checkpoint_model.hdf5', verbose=1, save_best_only=True,) 
#                                monitor='val_acc', mode='max')

### Fit!

In [None]:
start = time.time()

set_random_seed(random_seed) # set tensorflow random state

result = model.fit(X_train, y_train,
                   validation_data = (X_test, y_test), 
                   batch_size=15,
                   epochs=15,
                   verbose=1,
#                    class_weight = protein_weights,
                   callbacks = [checkpointer]);

# Print the runtime:
print('Runtime: ', round((time.time() - start)/60, 3), ' minutes')

Train on 1200 samples, validate on 300 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 20.36253, saving model to ../checkpoint_model.hdf5
Epoch 2/15

Epoch 00002: val_loss did not improve from 20.36253
Epoch 3/15

Epoch 00003: val_loss did not improve from 20.36253
Epoch 4/15

Epoch 00004: val_loss did not improve from 20.36253
Epoch 5/15

Epoch 00005: val_loss did not improve from 20.36253
Epoch 6/15

Epoch 00006: val_loss did not improve from 20.36253
Epoch 7/15

Epoch 00007: val_loss did not improve from 20.36253
Epoch 8/15

# Plot Accuracy vs. Epochs

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols=2, figsize=(12,4))

i = 0
for metric in ['acc', 'loss']:
    
    # Change settings for each subplot
    if i == 0:
        metric_label = 'Accuracy'
        c = '#1f77b4'
    else:
        metric_label = 'Loss Function'
        c = 'green'

    # Plot metrics
    ax[i].plot(result.history[metric], color = c)
    ax[i].plot(result.history['val_'+metric], color = 'orange')
        
    # Set title
    ax[i].set_title(metric_label + ' vs. Epoch')

    # Set axis labels
    ax[i].set_ylabel(metric_label)
    ax[i].set_xlabel('Epoch')

    # Add a legend
    ax[i].legend(['Train', 'Test'], loc='best')

    # Plot girdlines:
    ax[i].grid()

    i += 1

In [14]:
# check macro f1 score, that's how the cometition is evaluated
# model.metrics_names


### Kaggle Submission Link:

https://www.kaggle.com/c/human-protein-atlas-image-classification/submit

<hr>

<br>