Load Data

In [None]:
%ls train

In [None]:
import os, cv2
import matplotlib.pyplot as plt
import keras
from keras.preprocessing import image                  
from tqdm import tqdm
from PIL import ImageFile                            
ImageFile.LOAD_TRUNCATED_IMAGES = True    
import numpy as np
import pandas as pd

from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint  

from keras.applications.resnet50 import ResNet50
from keras import Model
from keras.layers import Input

In [None]:
# Map every class (an entry of train/) to an integer label.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# class -> index mapping identical across runs and machines.
seed_types = sorted(os.listdir('train'))
seed_dict = {name: idx for idx, name in enumerate(seed_types)}
seed_dict

In [None]:
# Utility functions
def get_data(train_path):
    """Collect the file paths found under each class folder of ``train_path``.

    Args:
        train_path: Directory whose immediate sub-directories each hold the
            files of one class. A trailing slash is optional.

    Returns:
        dict mapping each sub-directory name to the list of paths of the
        entries inside it. A one-line count per class is printed as a side
        effect.
    """
    data = {}
    for class_name in os.listdir(train_path):
        class_dir = os.path.join(train_path, class_name)
        # Skip stray files (e.g. .DS_Store): listing them would raise.
        if not os.path.isdir(class_dir):
            continue
        data[class_name] = [os.path.join(class_dir, file_name)
                            for file_name in os.listdir(class_dir)]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(class_name + " contains " + str(len(data[class_name])) + " images")
    return data

# Improve the quality of the images
def transform_image(img):
    """Apply CLAHE to every plane of an image in LAB colour space.

    Args:
        img: 3-channel image array that is converted with
            ``cv2.COLOR_BGR2LAB`` (presumably uint8 -- confirm with callers).

    Returns:
        The equalised image converted back with ``cv2.COLOR_LAB2BGR``.
    """
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

    # Recent OpenCV releases return an immutable tuple from cv2.split, so the
    # equalised planes are collected in a new list instead of being assigned
    # back by index. All three planes are equalised, as before.
    equalised_planes = [clahe.apply(plane) for plane in cv2.split(lab)]

    lab = cv2.merge(equalised_planes)

    return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

def path_to_tensor(img_path):
    """Load one image file and return it as a batch holding a single image.

    The file is read with ``image.load_img`` resized to 224x224, passed
    through ``transform_image``, and given a leading axis of length 1.
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    enhanced = transform_image(np.array(pil_img))
    # Add the batch axis in front.
    return np.expand_dims(np.asarray(enhanced), axis=0)

def paths_to_tensor(img_paths):
    """Convert many image paths into one array stacked along the first axis.

    Every path is converted by ``path_to_tensor``; tqdm shows the progress.
    """
    batches = []
    for single_path in tqdm(img_paths):
        batches.append(path_to_tensor(single_path))
    return np.vstack(batches)

# Callback which reports the F1 score
class Metrics(Callback):
    """Record micro-averaged F1, recall and precision on the validation data.

    After each epoch the model's predictions for ``self.validation_data[0]``
    are rounded and scored against ``self.validation_data[1]``; the results
    are appended to ``val_f1s``, ``val_recalls`` and ``val_precisions`` and
    printed.

    NOTE(review): this class never assigns ``self.validation_data`` itself;
    presumably Keras populates it during training -- confirm for the
    installed Keras version.
    """

    def on_train_begin(self, logs=None):
        # Start fresh histories for every training run (one entry per epoch).
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        # Round the predictions so they can be compared with the targets.
        val_predict = np.asarray(self.model.predict(self.validation_data[0])).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('— val_f1: %f — val_precision: %f — val_recall %f' % (_val_f1, _val_precision, _val_recall))


metrics = Metrics()

# Plot the training curves recorded by keras
def plot_history(history):
    """Draw the loss and accuracy curves held in ``history.history``.

    Series are picked by key: names containing 'loss' are drawn on figure 1
    and names containing 'acc' on figure 2. Keys that also contain 'val' are
    drawn in green, the others in blue. Every legend entry carries the last
    value of its series. If no training-loss series exists, a message is
    printed and nothing is plotted.
    """
    records = history.history

    def select(token, validation):
        # Keys containing `token`, restricted to validation or training series.
        return [key for key in records.keys()
                if token in key and ('val' in key) == validation]

    loss_list = select('loss', False)
    val_loss_list = select('loss', True)
    acc_list = select('acc', False)
    val_acc_list = select('acc', True)

    if not loss_list:
        print('Loss is missing in history')
        return

    # Epochs are numbered from 1; the first loss series fixes their count.
    epochs = range(1, len(records[loss_list[0]]) + 1)

    def draw(keys, colour, prefix):
        # One line per series, labelled with the value of its final epoch.
        for key in keys:
            values = records[key]
            plt.plot(epochs, values, colour,
                     label=prefix + format(values[-1], '.5f') + ')')

    ## Loss
    plt.figure(1)
    draw(loss_list, 'b', 'Training loss (')
    draw(val_loss_list, 'g', 'Validation loss (')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    ## Accuracy
    plt.figure(2)
    draw(acc_list, 'b', 'Training accuracy (')
    draw(val_acc_list, 'g', 'Validation accuracy (')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [None]:
# Load the data: map every class folder under train/ to the list of its image paths
# (get_data also prints the number of images per class).
data = get_data('train/')

In [None]:
# Flatten the per-class path lists into parallel lists of paths and integer labels.
train_x = []
train_y = []
for key in data.keys():
    train_x.extend(data[key])
    train_y.extend([seed_dict[key]] * len(data[key]))

# One-hot encode the labels. The width is fixed to the number of classes so it
# does not depend on which label indices happen to be present.
train_y = keras.utils.to_categorical(train_y, num_classes=len(seed_dict))

# Show the sample counts (last expression, so the notebook displays it).
len(train_x), len(train_y)

In [None]:
# Show one sample image per class on a 3x4 grid.
# Assumes at most 12 classes and more than 100 images in every class (index 100 is read).
# Note: `seed` stays bound to the last class after the loop, and later cells read it.
for i,seed in enumerate(seed_types):
    plt.subplot(3,4,i+1)
    img = plt.imread(data[seed][100])
    plt.imshow(img)

In [None]:
# Display the class name left in `seed` by the preceding loop over seed_types
seed

In [None]:
# Show the first 12 images of the class currently held in `seed` on a 3x4 grid.
# range (instead of the Python 2-only xrange) keeps the cell runnable on Python 3.
for i in range(12):
    plt.subplot(3,4,i+1)
    img = plt.imread(data[seed][i])
    plt.imshow(img)

In [None]:
# Pick the third image of the 'Common wheat' class and load it resized to 224x224
seed='Common wheat'
img_path = data[seed][2]
img = image.load_img(img_path, target_size=(224, 224))
img

In [None]:
# Check transformed image
# NOTE(review): transform_image converts with COLOR_BGR2LAB / COLOR_LAB2BGR; confirm that the
# channel order of the array built from `img` is the one intended.
img2 = np.array(img)
plt.imshow(transform_image(img2))

In [None]:
# Load the data into tensors, converted to float32 and divided by 255
train_tensors = paths_to_tensor(train_x).astype('float32')/255
#test_tensors = paths_to_tensor(test_files).astype('float32')/255

In [None]:
# Disabled experiments for saving/loading the tensors (np.save/np.load and p.dump/p.load):
#np.save('test.n',train_tensors)
#train_tensors = np.load('test.n.npy')
#p.dump(train_tensors,open('train_tensors.p','wb'))
#train_tensors = p.load(open( "train_tensors.p", "rb" ))
# Show the type of train_tensors
type(train_tensors)

In [None]:
# Basic model for testing: three conv / max-pool stages, global average
# pooling and a 12-way softmax classifier.
model = Sequential([
    Conv2D(16, kernel_size=2, input_shape=(224, 224, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, (2, 2), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (2, 2), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    GlobalAveragePooling2D(),
    Dense(12, activation='softmax'),
])

# Print the layer overview before compiling.
model.summary()

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Split the data into training and validation sets (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(train_tensors, train_y, test_size=0.20, random_state=42)

In [None]:
# Augmentation generator: feature-wise centering / std-normalisation plus random
# rotations, shifts and horizontal/vertical flips.
# NOTE(review): validation_split is set here, but the flow() calls in this notebook pass no
# `subset`, so it presumably has no effect -- confirm.
datagen = image.ImageDataGenerator(
    featurewise_center = True,
    featurewise_std_normalization = True,
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    validation_split=0.2,
)
# Fit the generator on the training split (needed by the feature-wise options above).
datagen.fit(X_train)

In [None]:
### Train the basic model on augmented batches.

epochs = 20
batch_size = 32

# Make sure the checkpoint folder exists before the first save.
if not os.path.isdir('saved_models'):
    os.makedirs('saved_models')

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.from_scratch.hdf5',
                               verbose=1, save_best_only=True)

# A plain `/` gives a float on Python 3 and floors on Python 2; round up
# explicitly so the count is an int and the last, smaller batch is included.
steps_per_epoch = int(np.ceil(len(X_train) / float(batch_size)))

# NOTE(review): the generator is configured with feature-wise normalisation, while
# (X_test, y_test) is passed as-is -- confirm the validation data should not be
# normalised the same way.
history = model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                              steps_per_epoch=steps_per_epoch,
                              epochs=epochs, validation_data=(X_test, y_test),
                              callbacks=[checkpointer, metrics], verbose=1)

In [None]:
# Plot the loss and accuracy curves stored in `history`
plot_history(history)

In [None]:
# Load ResNet50 with ImageNet weights, without its top layers, for 224x224x3 inputs
input_tensor = Input(shape=(224, 224, 3))
r50_model = ResNet50(input_tensor = input_tensor, weights='imagenet', include_top=False)

In [None]:
# Add a new classification head on top of the ResNet50 output.
# NOTE: r50_model is rebound to the full model below, so re-running this cell without
# re-running the ResNet50 loading cell builds the head on top of the previous head.
x = r50_model.output
x = GlobalAveragePooling2D()(x)
# fully-connected layer with 1024 units
x = Dense(1024, activation='sigmoid')(x)
# softmax output layer with 12 units
predictions = Dense(12, activation='softmax')(x)
# this is the model we will train
r50_model = Model(inputs=r50_model.input, outputs=predictions)

# Freeze the first 172 layers; only the layers from index 172 on are trained.
for layer in r50_model.layers[:172]:
   layer.trainable = False
for layer in r50_model.layers[172:]:
   layer.trainable = True

# Compile the model.
r50_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Disabled helpers for inspecting the layers:
#for i, layer in enumerate(r50_model.layers):
 #  print(i, layer.name)

#r50_model.summary()

In [None]:
### Train the ResNet50-based model on augmented batches.
batch_size = 32

# Make sure the checkpoint folder exists before the first save.
if not os.path.isdir('saved_models'):
    os.makedirs('saved_models')

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.resnet50.hdf5',
                               verbose=1, save_best_only=True)

# A plain `/` gives a float on Python 3 and floors on Python 2; round up
# explicitly so the count is an int and the last, smaller batch is included.
steps_per_epoch = int(np.ceil(len(X_train) / float(batch_size)))

# NOTE(review): the generator is configured with feature-wise normalisation, while
# (X_test, y_test) is passed as-is -- confirm the validation data should not be
# normalised the same way.
history = r50_model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=(X_test, y_test),
                                  epochs=20, callbacks=[checkpointer, metrics], verbose=1)

In [None]:
### Train the ResNet50-based model directly on the arrays (no augmentation generator).
# NOTE(review): this uses the same checkpoint file path as the generator-based training cell
# and creates a fresh ModelCheckpoint -- confirm that overwriting that file is intended.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.resnet50.hdf5', 
                               verbose=1, save_best_only=True)

history = r50_model.fit(X_train, y_train,
          validation_data=(X_test, y_test), batch_size=32,
          epochs=20, callbacks=[checkpointer, metrics], verbose=1)

In [None]:
# Inverse of seed_dict: integer label -> class name.
reverse_seed_dict = {v: k for k, v in seed_dict.items()}

def predict_img(path):
    """Show the image at ``path`` and print the class that ``model`` predicts.

    Args:
        path: Path of the image file to display and classify.
    """
    img = plt.imread(path)
    plt.imshow(img)
    # Same scaling as train_tensors (float32 divided by 255).
    # NOTE(review): training batches additionally pass through `datagen`;
    # confirm whether its normalisation should be applied here as well.
    tensor = path_to_tensor(path).astype('float32') / 255
    # argmax selects the most probable class. Looking for a probability that
    # equals exactly 1 fails whenever the network is not fully certain.
    idx = int(np.argmax(model.predict(tensor)[0]))
    print(reverse_seed_dict[idx])

In [None]:
# Classify the image at index 100 of the class currently held in `seed`
predict_img(data[seed][100])

In [None]:
# Plot the loss and accuracy curves stored in `history`
plot_history(history)

In [None]:
## The code below creates the Kaggle submission; it is not documented for the capstone.

In [None]:
# Build the test-set tensors with the same scaling as train_tensors
# (float32 divided by 255), so the model sees inputs in the range it was trained on.
test_paths = ['test/'+i for i in os.listdir('test/')]
test_tensors = paths_to_tensor(test_paths).astype('float32')/255

In [None]:
# Predictions of `model` (the basic model) for every test tensor
l = model.predict(test_tensors)

In [None]:
# Build the submission table: one row per test file with its predicted species.
# File names are derived from test_paths (not from a second directory listing) so
# they are guaranteed to line up with the rows of the prediction array `l`.
df = pd.DataFrame(
    {
        'file': [os.path.basename(p) for p in test_paths],
        'species': np.argmax(l, axis=1),
    },
    columns=['file', 'species'],
)
# Translate the predicted label indices back to class names.
df['species'] = df['species'].map(reverse_seed_dict)

In [None]:
# Write the submission table to something.csv without the index column
df.to_csv('something.csv', index=False)

In [None]:
# Display the mapping from label index to class name
reverse_seed_dict