In [1]:
import pandas as pd
import numpy as np

import os, shutil

In [2]:
# Folder holding the labelled source images, relative to the working directory;
# the split cell below copies files out of it.
original_train_dir = 'train'
# NOTE(review): test_dir is rebound by later cells (to './data_gen/test' and
# then 'data/test'), so this value is short-lived.
test_dir = 'test'


In [3]:
# Load the label table as a plain 2-D array; column 1 is treated as the class
# name of each row.
train_labels = np.array(pd.read_csv('labels.csv'))
label_column = train_labels[:, 1]

# Distinct class names (sorted by np.unique) and how many rows each one has.
classes, counts = np.unique(label_column, return_counts=True)

print("There are %d" % len(classes))

There are 120


In [4]:
def mkdirIfNotExist(directory):
    """Ensure ``directory`` exists and return its path.

    Missing parent directories are created as well, and calling this on an
    already existing directory is a no-op, so the cell using it can be re-run.

    Args:
        directory: Path of the directory to create.

    Returns:
        The ``directory`` argument, unchanged, so calls can be used inline.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it handles
    # nested paths and has no window between the check and the creation.
    os.makedirs(directory, exist_ok=True)
    return directory

In [5]:
# Root of the generated dataset layout, with one sub-folder per split.
base_dir = mkdirIfNotExist('./data_gen')
train_dir, validation_dir, test_dir = [
    mkdirIfNotExist(os.path.join(base_dir, split_name))
    for split_name in ('train', 'validation', 'test')
]

In [6]:
# Give every class its own sub-folder under both the train and validation roots.
for c in classes:
    for split_root in (train_dir, validation_dir):
        mkdirIfNotExist(os.path.join(split_root, c))

In [7]:
def copyIfNotExist(fnames, src_dir, dst_dir):
    """Copy the named files from ``src_dir`` to ``dst_dir``, skipping existing ones.

    Args:
        fnames: Iterable of file names (relative to both directories).
        src_dir: Directory to copy from.
        dst_dir: Directory to copy into; a file already present there is
            left untouched.

    Prints a one-line summary when at least one file was copied.
    """
    copied = 0
    for fname in fnames:
        target = os.path.join(dst_dir, fname)
        if os.path.exists(target):
            # Already copied by an earlier run; do not overwrite.
            continue
        shutil.copyfile(os.path.join(src_dir, fname), target)
        copied += 1
    if copied:
        print("Copied %d to %s" % (copied, dst_dir))


In [8]:
# Fraction of each class kept for training; the remainder goes to validation.
train_ratio = 0.7
for c in classes:
    # File names of every image labelled with class c, in labels.csv order.
    in_class = train_labels[:, 1] == c
    fnames = ['{}.jpg'.format(name) for name in train_labels[in_class, 0]]

    # The first `idx` files of the class become validation, the rest training.
    idx = int(len(fnames) * (1 - train_ratio))
    for split_fnames, split_root in ((fnames[idx:], train_dir),
                                     (fnames[:idx], validation_dir)):
        copyIfNotExist(split_fnames, original_train_dir,
                       os.path.join(split_root, c))


In [9]:
from sklearn.datasets import load_files       

from keras import applications
from keras.utils import np_utils
from keras.layers import Dropout, Flatten, Dense, BatchNormalization
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import image                  

from tqdm import tqdm
from PIL import ImageFile                            
ImageFile.LOAD_TRUNCATED_IMAGES = True  

import pandas as pd
import numpy as np
import glob
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
# Locations of the per-class image folders read by generate_bottleneck_features().
train_dir = 'data_gen/train'
val_dir = 'data_gen/validation'
# NOTE(review): the test feature generator and the results cell read from
# 'test', not from this path — confirm which location is intended.
test_dir = 'data/test'

In [11]:
def path_to_tensor(img_path):
    """Load one image file as a single-item batch.

    Args:
        img_path: Path of the image to load.

    Returns:
        Array of shape (1, 224, 224, 3): the image resized to 224x224,
        converted to an array, with a leading batch axis added.
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))
    pixels = image.img_to_array(pil_img)
    # Prepend the batch axis so the results can be stacked into one 4D batch.
    return pixels[np.newaxis, ...]

In [12]:
def paths_to_tensor(img_paths):
    """Load every image in ``img_paths`` and stack them along the batch axis.

    Args:
        img_paths: Iterable of image file paths.

    Returns:
        A single array built by vertically stacking the per-image tensors
        returned by ``path_to_tensor``; progress is shown through tqdm.
    """
    batches = []
    for img_path in tqdm(img_paths):
        batches.append(path_to_tensor(img_path))
    return np.vstack(batches)

In [13]:
def _save_bottleneck_features(model, image_dir, out_path):
    """Run all images under ``image_dir`` through ``model`` and save the result.

    Args:
        model: Network whose ``predict`` output is stored.
        image_dir: Folder with one sub-folder per class, as read by
            ``sklearn.datasets.load_files``.
        out_path: Destination ``.npy`` file; its parent directory is created
            when missing.
    """
    # Only the file paths are needed, so skip reading every image's bytes into
    # memory. load_files shuffles the listing with a fixed default seed, so the
    # order is reproducible between calls on the same folder.
    listing = load_files(image_dir, load_content=False)
    tensors = paths_to_tensor(listing['filenames'])
    data = applications.resnet50.preprocess_input(tensors)
    # Drop references that are no longer needed before the prediction pass.
    del listing, tensors

    features = model.predict(data, batch_size=16)

    out_dir = os.path.dirname(out_path)
    if out_dir:
        # np.save does not create missing parent directories.
        os.makedirs(out_dir, exist_ok=True)
    np.save(out_path, features)


def generate_bottleneck_features():
    """Compute ResNet50 bottleneck features for the validation images.

    The features are written to 'bottleneck_features/validation.npy'. The
    training pass is kept disabled, as in the original cell; uncomment it to
    regenerate 'bottleneck_features/train.npy'.
    """
    model = applications.ResNet50(include_top=False, weights='imagenet')

    #_save_bottleneck_features(model, train_dir, 'bottleneck_features/train.npy')

    _save_bottleneck_features(model, val_dir, 'bottleneck_features/validation.npy')

In [14]:
def generate_bottleneck_features_test():
    """Compute ResNet50 bottleneck features for the images matching 'test/*.jpg'.

    The result is saved to 'bottleneck_features/test.npy'. Row i of the saved
    array belongs to the i-th path returned by ``glob.glob('test/*.jpg')``, so
    whatever pairs predictions with file names must use the same listing.

    Raises:
        ValueError: If no image matches 'test/*.jpg'.
    """
    # build the network
    model = applications.ResNet50(include_top=False, weights='imagenet')

    files = glob.glob('test/*.jpg')
    if not files:
        # Fail with a clear message instead of np.vstack's error on an empty list.
        raise ValueError("No images matched 'test/*.jpg'")
    tensors = paths_to_tensor(files)
    data = applications.resnet50.preprocess_input(tensors)

    bottleneck_features = model.predict(data, batch_size=16)

    out_path = 'bottleneck_features/test.npy'
    # np.save does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    np.save(out_path, bottleneck_features)

In [15]:
def load_labels(path, num_classes=120):
    """Return one-hot labels for the class-per-folder dataset under ``path``.

    Args:
        path: Folder with one sub-folder per class, as read by
            ``sklearn.datasets.load_files``.
        num_classes: Width of the one-hot encoding; defaults to the 120
            classes this notebook was written for.

    Returns:
        The array produced by ``np_utils.to_categorical`` for the integer
        targets that ``load_files`` derives from the sub-folders.
    """
    # Only the targets are needed; load_content=False avoids reading every
    # image file into memory. The default shuffle seed is unchanged, so the
    # ordering is the same as with the default load_content=True.
    listing = load_files(path, load_content=False)
    return np_utils.to_categorical(np.array(listing['target']), num_classes)

In [16]:
def extract_Resnet50(tensor):
    """Return ResNet50 (ImageNet weights, no top) predictions for ``tensor``.

    Args:
        tensor: Image batch to preprocess with the ResNet50 ``preprocess_input``
            and feed to the network.

    Returns:
        The output of ``predict`` on the preprocessed batch.
    """
    # The notebook imports only `keras.applications`; bare `ResNet50` and
    # `preprocess_input` are never brought into scope, so go through it.
    network = applications.ResNet50(weights='imagenet', include_top=False)
    return network.predict(applications.resnet50.preprocess_input(tensor))

In [17]:
## Create bottleneck features
#generate_bottleneck_features()

In [18]:
#np.save("train.npy", bottleneck_features_train)

In [19]:
# Bottleneck features come from the .npy files on disk; labels are rebuilt from
# the class sub-folders via load_labels().
# NOTE: train_labels is rebound here; before this cell it held the labels.csv array.
# Assumes the rows of each .npy file are in the order load_files() yields for
# the matching folder — TODO confirm the files were produced that way.
print('Loading training bottleneck features')
train_data = np.load('bottleneck_features/train.npy')
train_labels = load_labels('data_gen/train')

print('Loading validation bottleneck features')
validation_data = np.load('bottleneck_features/validation.npy')
validation_labels = load_labels('data_gen/validation')

Loading training bottleneck features
Loading validation bottleneck features


In [20]:
print('Defining model')
# Classifier head on top of the bottleneck features: flatten, then two
# Dense/BatchNorm/Dropout stages, then a 120-way softmax.
model = Sequential([
    Flatten(input_shape=train_data.shape[1:]),
    Dropout(0.5),
    BatchNormalization(),
    Dense(256, activation='relu', kernel_initializer='he_uniform'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_initializer='he_uniform'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(120, activation='softmax'),
])
model.summary()
model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Defining model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 2048)              8192      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               524544    
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896 

In [21]:
# The checkpoint file is written into saved_models/; create the folder up front
# so the first save does not fail on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)
# save_best_only=True keeps only the weights of the best epoch so far.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.hdf5',
                               verbose=2, save_best_only=True)

In [22]:
# Training schedule for the classifier head.
epochs = 75
batch_size = 64

# The checkpointer callback saves weights during the run; verbose=2 prints one
# line per epoch.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    callbacks=[checkpointer],
    validation_data=(validation_data, validation_labels),
)

Train on 7213 samples, validate on 3009 samples
Epoch 1/75

Epoch 00001: val_loss improved from inf to 3.26701, saving model to saved_models/weights.best.hdf5
 - 20s - loss: 4.9192 - acc: 0.0417 - val_loss: 3.2670 - val_acc: 0.3539
Epoch 2/75

Epoch 00002: val_loss improved from 3.26701 to 2.15533, saving model to saved_models/weights.best.hdf5
 - 3s - loss: 3.5646 - acc: 0.1833 - val_loss: 2.1553 - val_acc: 0.5287
Epoch 3/75

Epoch 00003: val_loss improved from 2.15533 to 1.60582, saving model to saved_models/weights.best.hdf5
 - 3s - loss: 2.8008 - acc: 0.3144 - val_loss: 1.6058 - val_acc: 0.6215
Epoch 4/75

Epoch 00004: val_loss improved from 1.60582 to 1.32107, saving model to saved_models/weights.best.hdf5
 - 3s - loss: 2.3154 - acc: 0.4001 - val_loss: 1.3211 - val_acc: 0.6677
Epoch 5/75

Epoch 00005: val_loss improved from 1.32107 to 1.15627, saving model to saved_models/weights.best.hdf5
 - 3s - loss: 2.0002 - acc: 0.4608 - val_loss: 1.1563 - val_acc: 0.6903
Epoch 6/75

Epoch 00

Epoch 00055: val_loss did not improve
 - 2s - loss: 0.8040 - acc: 0.7435 - val_loss: 0.7878 - val_acc: 0.7640
Epoch 56/75

Epoch 00056: val_loss did not improve
 - 3s - loss: 0.8067 - acc: 0.7428 - val_loss: 0.7918 - val_acc: 0.7650
Epoch 57/75

Epoch 00057: val_loss did not improve
 - 3s - loss: 0.8071 - acc: 0.7414 - val_loss: 0.8081 - val_acc: 0.7634
Epoch 58/75

Epoch 00058: val_loss did not improve
 - 3s - loss: 0.8191 - acc: 0.7381 - val_loss: 0.7936 - val_acc: 0.7654
Epoch 59/75

Epoch 00059: val_loss did not improve
 - 3s - loss: 0.8052 - acc: 0.7471 - val_loss: 0.7980 - val_acc: 0.7617
Epoch 60/75

Epoch 00060: val_loss did not improve
 - 3s - loss: 0.7814 - acc: 0.7531 - val_loss: 0.7986 - val_acc: 0.7601
Epoch 61/75

Epoch 00061: val_loss did not improve
 - 3s - loss: 0.7810 - acc: 0.7517 - val_loss: 0.7963 - val_acc: 0.7574
Epoch 62/75

Epoch 00062: val_loss did not improve
 - 3s - loss: 0.7872 - acc: 0.7498 - val_loss: 0.7898 - val_acc: 0.7607
Epoch 63/75

Epoch 00063: val

In [23]:
## Testing

# Restore the weights saved by the checkpoint callback.
model.load_weights('saved_models/weights.best.hdf5')

# Rebuild the class list from labels.csv (train_labels was rebound to one-hot
# labels in an earlier cell).
train_labels = np.array(pd.read_csv('labels.csv'))
classes, counts = np.unique(train_labels[:, 1], return_counts=True)

# Header row: 'id' followed by one column per class. The handle `f` stays open
# on purpose; the final cell appends the prediction rows and closes it.
f = open('results.csv', 'w')
f.write('id' + ''.join(',' + c for c in classes))
f.write('\n')

1

In [24]:
#generate_bottleneck_features_test()

In [25]:
# Bottleneck features for the test images, read from the path that
# generate_bottleneck_features_test() saves to.
test = np.load('bottleneck_features/test.npy')


In [26]:
# Predictions for the test features; the model's last layer is a 120-unit softmax.
output = model.predict(test)

In [27]:
# List the test images with the same glob pattern the feature generator uses.
# os.listdir('test') would also return non-.jpg entries and is a different
# enumeration, which could pair predictions with the wrong file names.
filenames = glob.glob('test/*.jpg')
try:
    # zip() would silently drop rows on a mismatch, so check the lengths first.
    if len(filenames) != len(output):
        raise ValueError(
            "Found %d test images but %d prediction rows"
            % (len(filenames), len(output)))
    for o, name in zip(output, filenames):
        # The id is the file name without folder or extension.
        image_id = os.path.splitext(os.path.basename(name))[0]
        f.write(image_id + ',')
        o.tofile(f, sep=',', format='%.17f')
        f.write('\n')
finally:
    # Close results.csv even if writing fails part-way through.
    f.close()