In [27]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input, decode_predictions

In [2]:
train_dir = '../data/train'
valid_dir = '../data/valid'

In [3]:
def read_img(imgpath, size):
    img = image.load_img(imgpath, target_size=size)
    img = image.img_to_array(img)
    return img

In [8]:
import os
import glob
import random

train_paths = glob.glob(os.path.join(train_dir, '*', '*.jpg'))
valid_paths = glob.glob(os.path.join(valid_dir, '*', '*.jpg'))
random.shuffle(train_paths)
random.shuffle(valid_paths)

In [9]:
train_paths[:5]

['../data/train/newfoundland/cf3697f8f3ee67b50cedaa63904ab5e8.jpg',
 '../data/train/cocker_spaniel/37e42e634970f00ab6f6ca7db8239606.jpg',
 '../data/train/bluetick/b66b6ff2ff16c6e746af3ef624e471f1.jpg',
 '../data/train/curly-coated_retriever/3ad193d212c34fb9e5c77a1dfc99efe1.jpg',
 '../data/train/beagle/86f0e4abee677119258764eadc368b9d.jpg']

In [10]:
valid_paths[:5]

['../data/valid/clumber/54ece8d1cb7a77b9968d714fba342c36.jpg',
 '../data/valid/miniature_schnauzer/692965e541833d6cf6089b0d416f0c0f.jpg',
 '../data/valid/affenpinscher/c32fb0c78bfc35f176ae7090155ef2c9.jpg',
 '../data/valid/gordon_setter/18de05937a44cb467b229889f8a95bcb.jpg',
 '../data/valid/lhasa/fc7317da160bff89cd13aacc980adf26.jpg']

In [11]:
len(train_paths), len(valid_paths)

(9199, 1023)

## Extract VGG16 features

In [22]:
import numpy as np
train_images = np.zeros((len(train_paths), 224, 224, 3), dtype='float32')
for i, imgpath in enumerate(train_paths):
    img = read_img(imgpath, (224, 224))
    x = preprocess_input(np.expand_dims(img.copy(), axis=0))
    train_images[i] = x

In [23]:
train_images.shape

(9199, 224, 224, 3)

In [24]:
valid_images = np.zeros((len(valid_paths), 224, 224, 3), dtype='float32')
for i, imgpath in enumerate(valid_paths):
    img = read_img(imgpath, (224, 224))
    x = preprocess_input(np.expand_dims(img.copy(), axis=0))
    valid_images[i] = x

In [25]:
valid_images.shape

(1023, 224, 224, 3)

In [30]:
vgg_bottleneck = VGG16(weights='imagenet', include_top=False, pooling='avg')
vgg_bottleneck.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [None]:
train_feats = vgg_bottleneck.predict(train_images, batch_size=32, verbose=1)

 512/9199 [>.............................] - ETA: 5:34

In [None]:
valid_feats = vgg_bottleneck.predict(valid_images, batch_size=32, verbose=1)

In [33]:
train_feats.shape

(9199, 512)

In [34]:
valid_feats.shape

(1023, 512)

In [36]:
np.save('train_feats_vgg16.npy', train_feats)
np.save('valid_feats_vgg16.npy', valid_feats)

## Target Label

In [176]:
dogs = list(set([x.split('/')[3] for x in train_paths]))
dogs = sorted(dogs)
len(dogs)

120

In [177]:
train_labels = [dogs.index(x) for x in [x.split('/')[3] for x in train_paths]]
print(train_labels[:10])

valid_labels = [dogs.index(x) for x in [x.split('/')[3] for x in valid_paths]]
print(valid_labels[:10])

from keras.utils import np_utils
valid_onehot= np_utils.to_categorical(valid_labels)
print(valid_onehot.shape)

[78, 32, 15, 34, 9, 48, 63, 110, 56, 6]
[31, 77, 0, 50, 70, 84, 106, 61, 54, 20]
(1023, 120)


## LogReg on VGG bottleneck features

In [178]:
from sklearn.linear_model import LogisticRegression

In [179]:
logreg = LogisticRegression(multi_class='multinomial', random_state=1234)
logreg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1234, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [180]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_feats)
train_feats_norm = scaler.transform(train_feats)
valid_feats_norm = scaler.transform(valid_feats)

In [181]:
logreg.fit(train_feats_norm, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=1234, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [182]:
valid_probs = logreg.predict_proba(valid_feats_norm)
valid_probs.shape

(1023, 120)

In [183]:
valid_preds = logreg.predict(valid_feats_norm)
valid_preds.shape

(1023,)

In [184]:
valid_preds[:10]  # 予測したラベル

array([ 31,   1, 119,  50, 103,  84,  77,  61,  54,  64])

In [185]:
valid_labels[:10]  # 正解ラベル

[31, 77, 0, 50, 70, 84, 106, 61, 54, 20]

## Metrics

In [186]:
from sklearn.metrics import log_loss, accuracy_score

In [187]:
print('Validation VGG LogLoss {}'.format(log_loss(valid_onehot, valid_probs)))

Validation VGG LogLoss 1.4694004211113254


In [188]:
print('Validation VGG Accuracy {}'.format(accuracy_score(valid_labels, valid_preds)))

Validation VGG Accuracy 0.6490713587487781


## Evaluate on Test Data

In [189]:
test_dir = '../data/test'
test_paths = glob.glob(os.path.join(test_dir, 'unknown', '*.jpg'))
test_paths[:5]

['../data/test/unknown/92e1474b1351dc69071cea6b792fd499.jpg',
 '../data/test/unknown/c8e6cb302052b1a985c2d9dc934e757d.jpg',
 '../data/test/unknown/a08b6c26e51a3ee261222985d1641072.jpg',
 '../data/test/unknown/d20a3f640bc733ec21e4670d1a7c29f7.jpg',
 '../data/test/unknown/02ce818b70734ce460d0ffc47e6b9682.jpg']

In [169]:
import numpy as np
test_images = np.zeros((len(test_paths), 224, 224, 3), dtype='float32')
for i, imgpath in enumerate(test_paths):
    img = read_img(imgpath, (224, 224))
    x = preprocess_input(np.expand_dims(img.copy(), axis=0))
    test_images[i] = x

In [171]:
test_feats = vgg_bottleneck.predict(test_images, batch_size=32, verbose=1)



In [172]:
test_feats.shape

(10357, 512)

In [173]:
np.save('test_feats_vgg16.npy', train_feats)

In [191]:
test_feats_norm = scaler.transform(test_feats)

In [199]:
test_probs = logreg.predict_proba(test_feats_norm)
test_probs.shape

(10357, 120)

In [200]:
test_probs = test_probs.T  # (120, 10357)

In [203]:
# create submission csv
import pandas as pd

data = {}
data['id'] = [x.split('/')[4].replace('.jpg', '') for x in test_paths]
for i, dog in enumerate(dogs):
    data[dog] = test_probs[i]

submissions = pd.DataFrame(
    data=data,
    columns=['id'] + dogs
)

In [205]:
submissions

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,92e1474b1351dc69071cea6b792fd499,6.606660e-08,6.193216e-03,9.409237e-07,1.531382e-06,8.139623e-07,1.273572e-07,1.675252e-07,9.892289e-08,1.419992e-04,...,3.138502e-10,6.004503e-08,2.579528e-07,1.923375e-04,4.861497e-05,3.182413e-04,4.346007e-07,1.294497e-03,1.725496e-04,2.032557e-07
1,c8e6cb302052b1a985c2d9dc934e757d,4.953636e-07,3.066073e-06,1.636236e-06,1.528126e-07,5.940144e-09,2.430919e-09,2.674669e-05,4.960597e-05,8.401744e-07,...,6.215988e-04,2.947099e-07,7.660395e-07,7.687343e-07,6.747192e-09,8.631270e-07,1.197412e-09,1.398904e-06,1.421352e-08,2.657780e-07
2,a08b6c26e51a3ee261222985d1641072,4.299817e-06,3.842691e-05,1.336552e-05,8.165357e-07,6.299205e-07,1.639637e-06,4.407677e-07,1.572657e-06,2.909368e-06,...,2.361840e-06,2.312744e-05,1.688559e-05,2.496015e-04,4.251381e-05,1.396039e-06,1.938594e-06,9.430178e-06,5.628175e-04,7.462807e-08
3,d20a3f640bc733ec21e4670d1a7c29f7,5.472645e-10,2.567184e-08,4.765130e-07,1.348157e-13,3.258201e-09,2.402619e-08,2.989557e-13,1.165050e-11,1.389454e-09,...,2.989962e-11,7.986040e-10,2.387630e-12,1.388281e-09,8.699295e-07,2.909222e-09,3.056509e-09,5.180248e-08,2.898896e-09,2.594655e-11
4,02ce818b70734ce460d0ffc47e6b9682,1.470398e-06,4.573357e-04,1.391983e-04,1.021771e-06,1.003380e-05,8.066151e-09,3.517993e-08,1.578568e-06,1.310436e-02,...,1.253715e-06,5.214195e-06,6.812498e-06,1.762883e-05,7.953946e-06,1.695986e-07,7.607798e-08,3.163661e-04,1.206462e-07,2.385954e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10352,d08c477d0d6b6060163c61f030699bb3,4.584595e-08,1.496215e-04,2.501201e-07,3.168227e-06,8.652122e-06,2.776078e-05,2.052213e-02,7.277240e-03,2.091364e-01,...,2.550815e-02,3.208853e-03,5.249872e-06,1.272376e-01,2.071573e-05,6.198020e-03,1.251202e-06,1.224297e-04,6.092362e-07,2.855371e-06
10353,a8b462c178d69221ee633d425785133c,1.436987e-05,1.010737e-09,4.701095e-07,1.638541e-08,2.745391e-07,2.241749e-05,8.305233e-07,1.227100e-04,1.174185e-08,...,3.495316e-05,4.230937e-05,5.603171e-08,7.009358e-09,7.861452e-09,2.014514e-09,4.227806e-06,1.083687e-07,4.291380e-08,2.143064e-06
10354,8913db55e9966d331e7759f748e2abef,1.538961e-03,4.681865e-07,4.958053e-05,2.931596e-04,1.616257e-04,5.336579e-07,4.022614e-07,6.014989e-06,3.937980e-03,...,6.113286e-07,3.334933e-08,6.462503e-06,2.931564e-06,3.206040e-05,4.989489e-06,4.336434e-07,1.291783e-07,2.659614e-06,1.695278e-05
10355,2bf6d0177046ba8936973513f3eafce0,3.897708e-06,1.107374e-10,7.295774e-10,3.876619e-10,1.444610e-07,1.769690e-11,1.202277e-07,3.099302e-08,1.361867e-10,...,1.513208e-09,6.974926e-09,3.917347e-10,6.394393e-11,7.732171e-11,8.278305e-12,5.187356e-08,1.816232e-09,1.028645e-07,4.594130e-10


In [212]:
os.makedirs('../submits', exist_ok=True)
submissions.to_csv('../submits/logreg_vgg16_bottleneck.csv', index=False, header=True)