### 开始

In [1]:
# 检查Python版本
from sys import version_info
if version_info.major != 3:
    
    raise Exception('请使用Python3来完成此项目')

### 数据预处理

#### 获取数据
[Dogs vs. Cats Redux: Kernels Edition
](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition)

In [2]:
# download data and unzip it
# from urllib.request import urlretrieve
import subprocess
import os
from tqdm import tqdm
from zipfile import ZipFile

train_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
            '-f','train.zip','-p','./']
test_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
            '-f','test.zip','-p','./']
sample_csv_url = ['kaggle','competitions','download','-c','dogs-vs-cats-redux-kernels-edition',
                  '-f','sample_submission.csv','-p','./']


def download_unzip_dataset(url, zip_file_path, folder_path, unzip=True):
    if os.path.exists(zip_file_path):
        print("file is exist, no need download")
    else:
        print("download now")
#         urlretrieve(url, zip_file_path)
        subp = subprocess.run(url)
#         subp.wait()
    
    if unzip:
        if os.path.exists(folder_path):
            print("files found")
        else:
            print("unzip now")
            zipf = ZipFile(zip_file_path)
            zipf.extractall()
            print("unzip end")

download_unzip_dataset(train_url, 'train.zip', 'train/')
download_unzip_dataset(test_url, 'test.zip', 'test/')

file is exist, no need download
files found
file is exist, no need download
unzip now
unzip end


#### 分离数据集，dog和cat图片分别放入train2/dogs, train2/cats

In [4]:
import os
import shutil

def split_train_set(old_dir, new_dir):
    file_list = os.listdir(old_dir)
    file_cats = filter(lambda x:x[:3] == 'cat', file_list)
    file_dogs = filter(lambda x:x[:3] == 'dog', file_list)

    if os.path.exists(new_dir):
        shutil.rmtree(new_dir)
    os.mkdir(new_dir)
    
    dogs_path = os.path.join(new_dir, 'dogs')
    cats_path = os.path.join(new_dir, 'cats')
    os.mkdir(dogs_path)
    os.mkdir(cats_path)
    
    # 此处要注意： os.symlink(src, dst)
    # dst是从它所在的目录去选择src,所以src必须是相对于dst的relative path
    for filename in file_cats:
        os.symlink('../../'+old_dir+filename, cats_path+'/'+filename)
    
    for filename in file_dogs:
        os.symlink(old_dir+filename, dogs_path+'/'+filename)
        
    print("split over")

split_train_set('train/', 'pre-train/')

# preprocess test image folder
if os.path.exists('pre-test'):
    shutil.rmtree('pre-test')
os.mkdir('pre-test')
os.symlink('../test', 'pre-test/test')


split over


### 提取特征

利用pre-trained model提取特征

#### 准备训练集和测试集
对于样本数非常多的数据集，可以利用generator函数来减少计算的次数

In [16]:
from keras.preprocessing.image import *

image_size = (224,224)

image_gen = ImageDataGenerator()
train_generator = image_gen.flow_from_directory('pre-train', 
                                                target_size=image_size, 
                                                shuffle=False, # our data will be in order
                                                batch_size=16)
test_generator = image_gen.flow_from_directory('pre-test', 
                                               target_size=image_size, 
                                               shuffle=False, # our data will be in order
                                               batch_size=16, 
                                               class_mode=None, # this means our generator will only yield batches of data, no labels
                                              )


Found 25000 images belonging to 2 classes.
Found 0 images belonging to 0 classes.


#### 提取特征
- 利用 pre-trained 模型从train/test dataset中提取出特征，然后使用自定义的fully-connected层在这些提取的特征集上训练

In [None]:
from keras.applications import resnet50
import h5py

x = Input((image_size[0], image_size[1], 3)) # shape: width, height, channel
base_model = resnet50.ResNet50(input_tensor=resnet50.preprocess_input(x) weights='imagenet')

# the predict_generator method returns the output of a model, given
# a generator that yields batches of numpy data
bottleneck_features_train = base_model.predict_generator(train_generator, train_generator.nb_sample)
# save the output as a Numpy array
# np.save(open('bottleneck_features_train.npy', 'w'), bottleneck_features_train)

bottleneck_features_test = model.predict_generator(test_generator, test_generator.nb_sample)
# np.save(open('bottleneck_features_test.npy', 'w'), bottleneck_features_test)

with h5py.File("pre_out") as h:
        h.create_dataset("train", data=bottleneck_features_train)
        h.create_dataset("label", data=train_generator.classes)
        h.create_dataset("test", data=bottleneck_features_test)

#### 为了增加代码复用，方便调试其它的pre-trained model，将上面的2个步骤封装为一个函数

In [3]:
from keras.preprocessing.image import *
from keras.applications import resnet50
from keras.applications import xception
from keras.applications import inception_v3
from keras.layers import Input, GlobalAveragePooling2D
from keras.models import Model
import h5py

def get_pre_features_from_images(MODEL, image_size, preprocess_input, model_name):
    image_gen = ImageDataGenerator()
    train_generator = image_gen.flow_from_directory('pre-train', 
                                                target_size=image_size, 
                                                shuffle=False, # our data will be in order
                                                batch_size=16)
    
    test_generator = image_gen.flow_from_directory('pre-test', 
                                               target_size=image_size, 
                                               shuffle=False, # our data will be in order
                                               batch_size=16, 
                                               class_mode=None, # this means our generator will only yield batches of data, no labels
                                              )
    
    ## use pre-trained model to get features from image generator
    x = Input((image_size[0], image_size[1], 3)) # shape: width, height, channel
    x = preprocess_input(x)
    base_model = MODEL(input_tensor=x, weights='imagenet')
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))
    
    # the predict_generator method returns the output of a model, given
    # a generator that yields batches of numpy data
    pre_features_train = model.predict_generator(train_generator, train_generator.nb_sample)
    pre_features_test = model.predict_generator(test_generator, test_generator.nb_sample)
    
    # save the output to h5 file
    out_filename = model_name + "_pre_out.h5"
    with h5py.File(out_filename) as h:
        h.create_dataset("train", data=pre_features_train)
        h.create_dataset("label", data=train_generator.classes)
        h.create_dataset("test", data=pre_features_test)

Using TensorFlow backend.


In [4]:
get_pre_features_from_images(resnet50.ResNet50, (224,224), resnet50.preprocess_input, "ResNet50")

Found 25000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


ValueError: Input 0 is incompatible with layer global_average_pooling2d_1: expected ndim=4, found ndim=2

In [None]:
get_pre_features_from_images(xception.Xception, (224,224), resnet50.preprocess_input, "Xception")

In [None]:
get_pre_features_from_images(inception_v3.InceptionV3, (224,224), resnet50.preprocess_input, "InceptionV3")