### 开始

In [1]:
# 检查Python版本
from sys import version_info
if version_info.major != 3:
    raise Exception('请使用Python3来完成此项目')

### 数据预处理

#### 获取数据
[Dogs vs. Cats Redux: Kernels Edition
](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition)

In [2]:
# download data and unzip it
# from urllib.request import urlretrieve
import subprocess
import os
from tqdm import tqdm
from zipfile import ZipFile

train_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
            '-f','train.zip','-p','./']
test_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
            '-f','test.zip','-p','./']
sample_csv_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
                  '-f','sample_submission.csv','-p','./']


def download_unzip_dataset(url, zip_file_path, folder_path, unzip=True):
    if os.path.exists(zip_file_path):
        print("file is exist, no need download")
    else:
        print("download now")
#         urlretrieve(url, zip_file_path)
        subp = subprocess.run(url)
#         subp.wait()
    
    if unzip:
        if os.path.exists(folder_path):
            print("files found")
        else:
            print("unzip now")
            zipf = ZipFile(zip_file_path)
            zipf.extractall()
            print("unzip end")

download_unzip_dataset(train_url, 'train.zip', 'train/')
download_unzip_dataset(test_url, 'test.zip', 'test/')

file is exist, no need download
files found
file is exist, no need download
files found


#### 分离数据集，dog和cat图片分别放入train2/dogs, train2/cats

In [3]:
import os
import shutil

def split_train_set(old_dir, new_dir):
    file_list = os.listdir(old_dir)
    file_cats = filter(lambda x:x[:3] == 'cat', file_list)
    file_dogs = filter(lambda x:x[:3] == 'dog', file_list)

    if os.path.exists(new_dir):
        shutil.rmtree(new_dir)
    os.mkdir(new_dir)
    
    dogs_path = os.path.join(new_dir, 'dogs')
    cats_path = os.path.join(new_dir, 'cats')
    os.mkdir(dogs_path)
    os.mkdir(cats_path)
    
    # 此处要注意： os.symlink(src, dst)
    # dst是从它所在的目录去选择src,所以src必须是相对于dst的relative path
    for filename in file_cats:
        os.symlink('../../'+old_dir+filename, cats_path+'/'+filename)
    
    for filename in file_dogs:
        os.symlink(old_dir+filename, dogs_path+'/'+filename)
        
    print("split over")

split_train_set('train/', 'train2/')


split over


### Test Model in single picture

为了复用代码，同时也保证测试的一致性，将model的预测试封装为一个函数

下面的几个pre-trained model都是在ImageNet图片库上训练过的，因为ImageNet很大，已经包含有足够多种类的‘dog'和'cat'，所以这些模型能够学习并提取"Dogs_vs_Cats”数据集里的特征(可以把本数据集理解为ImageNet的子集)

In [6]:
from keras.preprocessing import image
# from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.applications import *
from keras.layers import Input
import numpy as np


def trained_model_test(MODEL, image_size, preprocess_input, decode_predictions):
#     input_tensor = Input((image_size[0], image_size[1], 3)) # shape: width, height, channel
    
#     if preprocess_input:
#         x = preprocess(input_tensor)
#     else:
#         x = input_tensor
        
    # using the pre-training weights in ImageNet dataset
    base_model = MODEL(weights='imagenet')
    
    img = image.load_img('test/1.jpg', target_size=image_size)
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
        
    preds = base_model.predict(x)
    print('preds shape:', preds.shape)
#     print(preds[:5])
    decode_preds = decode_predictions(preds, top=3)
    print('decode_pred shape:',decode_preds)
    print('predicted:', decode_preds[0])
    
    return
    

- ResNet50

In [7]:
from keras.applications import resnet50

trained_model_test(resnet50.ResNet50, (224,224), resnet50.preprocess_input, resnet50.decode_predictions)

preds shape: (1, 1000)
decode_pred shape: [[('n02105412', 'kelpie', 0.24543868), ('n02113186', 'Cardigan', 0.110969715), ('n02106166', 'Border_collie', 0.09240467)]]
predicted: [('n02105412', 'kelpie', 0.24543868), ('n02113186', 'Cardigan', 0.110969715), ('n02106166', 'Border_collie', 0.09240467)]


array([[1.31234847e-04, 8.57069972e-06, 2.03391191e-05, 1.04978262e-05,
        3.51435847e-05, 3.04519108e-05, 1.98745583e-05, 2.87555285e-05,
        1.01201025e-04, 9.98234555e-06, 1.64194523e-06, 1.32972320e-06,
        4.40749687e-07, 1.44929209e-06, 3.55652560e-06, 3.53966448e-06,
        2.73672276e-06, 9.98054165e-05, 5.52410260e-04, 4.18776872e-05,
        9.48150955e-06, 6.28416387e-07, 1.69006091e-06, 2.89736136e-06,
        1.74253898e-06, 2.04213256e-06, 2.00341333e-06, 2.19998356e-06,
        3.27403068e-05, 1.87724399e-05, 4.65155244e-05, 3.08207859e-06,
        1.15437670e-05, 6.37757284e-06, 5.29125173e-05, 2.16581284e-05,
        2.49987752e-05, 8.06592925e-06, 1.24051564e-06, 4.57808937e-06,
        1.80651364e-06, 3.97660278e-06, 5.11140843e-06, 1.14342211e-05,
        2.63906372e-06, 8.06809931e-06, 2.54208567e-06, 1.57599072e-06,
        8.67023118e-06, 1.44446255e-06, 2.08846486e-05, 1.87419209e-05,
        2.00932300e-05, 1.17357795e-05, 4.94386768e-06, 2.077375

- Xception

In [17]:
from keras.applications import xception

trained_model_test(xception.Xception, (299,299), xception.preprocess_input, xception.decode_predictions)

predicted: [('n02106550', 'Rottweiler', 0.9479482), ('n02105412', 'kelpie', 0.0116197495), ('n02107142', 'Doberman', 0.006018338)]


- Inception V3

In [5]:
from keras.applications import inception_v3

trained_model_test(inception_v3.InceptionV3, (299,299), inception_v3.preprocess_input, inception_v3.decode_predictions)

predicted: [('n02106550', 'Rottweiler', 0.90469307), ('n02089078', 'black-and-tan_coonhound', 0.02317639), ('n02105412', 'kelpie', 0.010496103)]


### 特征迁移

#### 准备训练集和测试集
对于样本数非常多的数据集，可以利用generator函数来减少计算的次数

In [16]:
from keras.preprocessing.image import *

image_size = (224,224)

image_gen = ImageDataGenerator()
train_generator = image_gen.flow_from_directory('train2', 
                                                target_size=image_size, 
                                                shuffle=False, # our data will be in order
                                                batch_size=16)
test_generator = image_gen.flow_from_directory('test', 
                                               target_size=image_size, 
                                               shuffle=False, # our data will be in order
                                               batch_size=16, 
                                               class_mode=None, # this means our generator will only yield batches of data, no labels
                                              )


Found 25000 images belonging to 2 classes.
Found 0 images belonging to 0 classes.


#### 提取特征
- 利用 pre-trained 模型从train/test dataset中提取出特征，然后使用自定义的fully-connected层在这些提取的特征集上训练

In [None]:
from keras.applications import resnet50
import h5py

x = Input((image_size[0], image_size[1], 3)) # shape: width, height, channel
base_model = resnet50.ResNet50(input_tensor=resnet50.preprocess_input(x) weights='imagenet')
model = Model(base_mode.input, GlobalAveragePooling2D()(base_model.output))

# the predict_generator method returns the output of a model, given
# a generator that yields batches of numpy data
bottleneck_features_train = model.predict_generator(train_generator, train_generator.nb_sample)
# save the output as a Numpy array
# np.save(open('bottleneck_features_train.npy', 'w'), bottleneck_features_train)

bottleneck_features_test = model.predict_generator(test_generator, test_generator.nb_sample)
# np.save(open('bottleneck_features_test.npy', 'w'), bottleneck_features_test)

# save train features, train labels, test features to one file
with h5py.File("pre_out") as h:
        h.create_dataset("train", data=bottleneck_features_train)
        h.create_dataset("label", data=train_generator.classes)
        h.create_dataset("test", data=bottleneck_features_test)

- 为了增加代码复用，方便调试其它的pre-trained model，将上面的2个步骤封装为一个函数

In [16]:
from keras.preprocessing.image import *
from keras.applications import resnet50
from keras.applications import xception
from keras.applications import inception_v3
import h5py

def get_pre_features_from_images(MODEL, image_size, preprocess_input, model_name):
    image_gen = ImageDataGenerator()
    train_generator = image_gen.flow_from_directory('train2', 
                                                target_size=image_size, 
                                                shuffle=False, # our data will be in order
                                                batch_size=16)
    
    test_generator = image_gen.flow_from_directory('test', 
                                               target_size=image_size, 
                                               shuffle=False, # our data will be in order
                                               batch_size=16, 
                                               class_mode=None, # this means our generator will only yield batches of data, no labels
                                              )
    
    ## use pre-trained model to get features from image generator
    x = Input((image_size[0], image_size[1], 3)) # shape: width, height, channel
    x = preprocess_input(x)
    base_model = MODLE(input_tensor=x, weights='imagenet')
    
    # the predict_generator method returns the output of a model, given
    # a generator that yields batches of numpy data
    pre_features_train = base_model.predict_generator(train_generator, train_generator.nb_sample)
    pre_features_test = model.predict_generator(test_generator, test_generator.nb_sample)
    
    # save the output to h5 file
    out_filename = model_name + "_pre_out.h5"
    with h5py.File(out_filename) as h:
        h.create_dataset("train", data=pre_features_train)
        h.create_dataset("label", data=train_generator.classes)
        h.create_dataset("test", data=pre_features_test)
        
get_pre_features_from_images(resnet50.ResNet50, (224,224), resnet50.preprocess_input, "resnet50")

#### 载入数据

In [12]:
import h5py
import numpy as np
from sklearn.utils import shuffle
np.random.seed(66)

train_data = []
test_data = []

#------- single pre-trained mode
with h5py.File('ResNet50_pre_out.h5', 'r') as h:
    train_data.append(np.array(h['train']))
    train_labels = np.array(h['label'])
    test_data.appen(np.array(h['test']))

# 
train_data = np.concatenate(train_data, axis=1)
test_data = np.concatenate(test_data, axis=1)

# 预存的X_train, y_train是按顺序存放的，前12500是猫，后12500是狗, 这里打乱顺序，使之随机存放
# Note: 打乱的是存放存放顺序，并不改变 X_train , y_train的对应关系
train_data, train_labels = shuffle(train_data, train_labels)

OSError: Unable to open file (unable to open file: name = 'pre_features_train', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

#### 构建模型
- pre-trained模型已经训练过，所以不再需要fit， 除非又加入了新的layer构成了新模型

In [31]:
## build model: xception + Dropout + Dense
from keras.models import Sequential
from keras.models import Model
from keras.layers import GlobalAveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

# from keras.applications import xception

model = Sequential()
# model.add(Flatten(input_shape=train_data.shape[1:]))
model.add(Flatten(input_shape=Input([224,224,3])))
model.add(Dense(1, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

TypeError: Tensor objects are not iterable when eager execution is not enabled. To iterate over this tensor use tf.map_fn.

#### 在pre-train features上训练模型

In [15]:
from keras.call_back import TensorBoard

history = model.fit(train_data, train_labels,
          nb_epoch=8, batch_size=128,
          validation_split = 0.2,
          )

model.save_weights('model_w.h5')

NameError: name 'train_labels' is not defined

#### 模型可视化

In [29]:
from IPython.display import SVG
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

# plot_model(model, to_file='model.png', show_shapes='True')
SVG(model_to_dot(model).create(prog='dot',format='svg'))

OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

#### predict测试集

先来旧图试试

预测整个测试集

In [None]:
y_pred = model.predict(X_test, verbose=1)

#### 写入kaggle smaple_submission.csv

In [None]:
y_pred = y_pred.clip(min=0.005, max=0.995)

import pandas as pd
from keras.preprocessing.image import *

df = pd.read_csv("sample_submission.csv")

gen = ImageDataGenerator()
test_generator = gen.flow_from_directory("test2", (224, 224), shuffle=False, 
                                         batch_size=16, class_mode=None)

for i, fname in enumerate(test_generator.filenames):
    index = int(fname[fname.rfind('/')+1:fname.rfind('.')])
    df.set_value(index-1, 'label', y_pred[i])

df.to_csv('pred.csv', index=None)
df.head(10)