## 检测python版本
这里我们使用的python的版本为3.6.5

In [90]:
import sys
# Show the interpreter version string as the cell output.
sys.version

'3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

## 数据预处理
为了更好地训练模型，这里对数据进行一定程度的预处理：
- 删除训练数据集中过大（`500*500`以上）和过小（`100*100`以下）的图片。
- 将图片按照一定的目录结构归类。

In [91]:
import os, shutil
from PIL import Image

# 删除不合尺寸
def pick_bad_pics(path, bad_path):
    """Move images with an out-of-range size from ``path`` into ``bad_path``.

    Every regular file in ``path`` is opened with PIL. When its width or height
    is greater than 500 or smaller than 100 pixels the file is moved into
    ``bad_path``. The number of moved files is printed.

    Args:
        path: Directory holding the images to inspect.
        bad_path: Directory that receives the rejected images; it is created
            when it does not exist yet.

    Returns:
        None.
    """
    # Without the target directory shutil.move fails on the first rejected image.
    os.makedirs(bad_path, exist_ok=True)

    bad_list = []
    for img_name in os.listdir(path):
        im_path = os.path.join(path, img_name)
        if not os.path.isfile(im_path):
            # Sub-directories cannot be opened as images.
            continue
        # Read the size inside a context manager so the file handle is closed
        # before the file is moved; a still-open handle leaks and can prevent
        # the move on some platforms.
        with Image.open(im_path) as im:
            w, h = im.size
        if w > 500 or h > 500 or w < 100 or h < 100:
            bad_list.append(img_name)
            shutil.move(im_path, os.path.join(bad_path, img_name))
    print(len(bad_list))

# Move out-of-range images from data/train into data/train_bad; prints how many were moved.
pick_bad_pics("data/train", "data/train_bad")

0


## 读取加载数据集，归一化处理
将用于训练的数据集加载到内存，等待处理。主要是转化为ndarray类型的数据，方便后续的计算和处理。因为选择的预训练模型对于图片的要求都是`299*299`大小，这里我们读取数据时，图片统一调整到这个尺寸。
- 加载训练集数据。
- 加载测试集数据。
- 输出一个经过正规化的、Numpy array 格式的图像数据。

In [92]:
import glob, cv2
import numpy as np
from tqdm import tqdm

# 加载训练集
def _fill_images(dest, paths, size=(299, 299)):
    """Decode every file in ``paths`` into the matching row of ``dest``.

    Args:
        dest: Pre-allocated uint8 array (or a view of one) with one row per
            path; it is written in place.
        paths: Image file paths, in the order the rows should be filled.
        size: ``(width, height)`` every image is resized to.

    Raises:
        IOError: If OpenCV cannot read one of the files.
    """
    for row, img_name in enumerate(tqdm(paths)):
        img = cv2.imread(img_name)
        if img is None:
            # cv2.imread reports an unreadable file by returning None; fail
            # here with the file name instead of obscurely inside cv2.resize.
            raise IOError("cannot read image file: %s" % img_name)
        # cv2.imread yields BGR channel order; convert to RGB, which is what
        # the ImageNet-pretrained Keras models used by this notebook expect.
        dest[row] = cv2.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), size)


def _test_file_order(path):
    """Sort key that orders numerically named files (``12.jpg``) by number."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdecimal():
        return (0, int(stem), stem)
    # Non-numeric names go last, in alphabetical order.
    return (1, 0, stem)


def load_train_data():
    """Load the cat and dog training images and their labels.

    Returns:
        A ``(train_data, train_targ)`` tuple. ``train_data`` is a uint8 array
        of shape ``(n, 299, 299, 3)`` in RGB order holding all cat images
        followed by all dog images; ``train_targ`` holds ``0`` for every cat
        and ``1`` for every dog, in the same order.
    """
    # Sorted so that the sample order does not depend on the file system.
    cat = sorted(glob.glob("data/train/cat.*.jpg"))
    dog = sorted(glob.glob("data/train/dog.*.jpg"))
    train_data = np.zeros(((len(cat)+len(dog)), 299, 299, 3), dtype=np.uint8)
    train_targ = np.array([0]*len(cat) + [1]*len(dog))

    # Fill the two halves through views so no second copy of the data is made.
    _fill_images(train_data[:len(cat)], cat)
    _fill_images(train_data[len(cat):], dog)

    return train_data, train_targ


# Load the test set
def load_test_data():
    """Load the test images, ordered by their numeric file id.

    glob returns files in arbitrary order, while the submission cells of this
    notebook number the predictions 1..N by position, so the files are sorted
    by id first.
    NOTE(review): assumes the test files are named ``<id>.jpg`` - confirm.

    Returns:
        A uint8 array of shape ``(n, 299, 299, 3)`` in RGB order.
    """
    test = sorted(glob.glob("data/test/*.jpg"), key=_test_file_order)
    test_data = np.zeros((len(test), 299, 299, 3), dtype=np.uint8)
    _fill_images(test_data, test)
    return test_data

In [93]:
# Load the training images and their labels into memory.
train_data, train_targ = load_train_data()

100%|██████████| 12400/12400 [00:45<00:00, 274.56it/s]
100%|██████████| 12395/12395 [00:48<00:00, 254.27it/s]


In [94]:
# Load the test images into memory.
test_data = load_test_data()

100%|██████████| 12500/12500 [00:49<00:00, 251.38it/s]


## 拆分验证集
对标记数据进行处理，拆分验证集

In [95]:
from sklearn.model_selection import train_test_split

# Split the labelled data into a training part (80%) and a validation part (20%).
# A fixed random_state makes the split reproducible across kernel restarts, and
# stratify keeps the cat/dog ratio the same in both parts.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_data, train_targ, test_size=0.2, random_state=42, stratify=train_targ)

## 模型InceptionV3
预训练模型InceptionV3

In [103]:
import keras
import pandas as pd

# 构建InceptionV3
def buid_inceptionv3():
    """Build and compile a binary classifier on a frozen InceptionV3 base.

    The network takes 299x299x3 images, applies the InceptionV3
    ``preprocess_input`` inside the graph, runs the InceptionV3 base without
    its top layers, and ends in global average pooling, dropout (0.25) and a
    single sigmoid unit. All base-model layers are marked non-trainable.

    Returns:
        The compiled Keras model (adadelta, binary cross-entropy, accuracy).
    """
    inception = keras.applications.inception_v3
    layers = keras.layers

    # Base network without its fully connected top; preprocessing is part of the graph.
    raw_images = keras.Input(shape=(299, 299, 3))
    scaled_images = layers.Lambda(inception.preprocess_input)(raw_images)
    backbone = inception.InceptionV3(input_tensor=scaled_images, include_top=False)

    # Freeze the base so that only the new head is trained.
    for frozen in backbone.layers:
        frozen.trainable = False

    # New head: global average pooling, dropout against over-fitting, one sigmoid unit.
    head = layers.GlobalAveragePooling2D()(backbone.output)
    head = layers.Dropout(0.25)(head)
    head = layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal')(head)

    model = keras.models.Model(inputs=backbone.input, outputs=head)
    model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

    print('InceptionV3 has %d layers.' % len(model.layers))
    return model


# Train
inceptionv3_obj = buid_inceptionv3()
inceptionv3_obj.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_valid, y_valid))

# Predict on the test images. The test array loaded by this notebook is
# `test_data`; no `x_test` is ever defined.
inceptionv3_predict = inceptionv3_obj.predict(test_data)
# Keep probabilities away from 0 and 1 (presumably to bound the log-loss
# penalty of a confident wrong answer), then flatten (N, 1) to (N,).
inceptionv3_predict = inceptionv3_predict.clip(min=0.005, max=0.995)
inceptionv3_predict = inceptionv3_predict.flatten(order = 'F')

# Save the result
# NOTE(review): ids are assigned by position, so `test_data` must be ordered by
# the numeric id of the test files - confirm the loader's ordering.
submission = pd.DataFrame(data = {'id':(np.arange(len(test_data))+1), 'label': inceptionv3_predict})
submission.to_csv('inceptionv3_submission.csv',index=False)

# Visualise the model
keras.utils.plot_model(inceptionv3_obj, to_file='model_inceptionv3.png')

Train on 19836 samples, validate on 4959 samples
Epoch 1/10


KeyboardInterrupt: 

## 模型Xception
预训练模型Xception

In [104]:
# 构建xception
def buid_xception():
    """Build and compile a binary classifier on a frozen Xception base.

    The network takes 299x299x3 images, applies the Xception
    ``preprocess_input`` inside the graph, runs the Xception base without its
    top layers, and ends in global average pooling, dropout (0.25) and a
    single sigmoid unit. All base-model layers are marked non-trainable.

    Returns:
        The compiled Keras model (adadelta, binary cross-entropy, accuracy).
    """
    xception = keras.applications.xception
    layers = keras.layers

    # Base network without its fully connected top; preprocessing is part of the graph.
    raw_images = keras.Input(shape=(299, 299, 3))
    scaled_images = layers.Lambda(xception.preprocess_input)(raw_images)
    backbone = xception.Xception(input_tensor=scaled_images, include_top=False)

    # Freeze the base so that only the new head is trained.
    for frozen in backbone.layers:
        frozen.trainable = False

    # New head: global average pooling, dropout against over-fitting, one sigmoid unit.
    head = layers.GlobalAveragePooling2D()(backbone.output)
    head = layers.Dropout(0.25)(head)
    head = layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal')(head)

    model = keras.models.Model(inputs=backbone.input, outputs=head)
    model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

    print('xception has %d layers.' % len(model.layers))
    return model

# Train
xception_obj = buid_xception()
xception_obj.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_valid, y_valid))

# Predict on the test images. The test array loaded by this notebook is
# `test_data`; no `x_test` is ever defined.
xception_predict = xception_obj.predict(test_data)
# clip/flatten are methods of the prediction array, not of the model object.
# Keep probabilities away from 0 and 1 (presumably to bound the log-loss
# penalty of a confident wrong answer), then flatten (N, 1) to (N,).
xception_predict = xception_predict.clip(min=0.005, max=0.995)
xception_predict = xception_predict.flatten(order = 'F')

# Save the result
# NOTE(review): ids are assigned by position, so `test_data` must be ordered by
# the numeric id of the test files - confirm the loader's ordering.
submission = pd.DataFrame(data = {'id':(np.arange(len(test_data))+1), 'label': xception_predict})
submission.to_csv('xception_submission.csv',index=False)

# Visualise the model
keras.utils.plot_model(xception_obj, to_file='model_xception.png')

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
 6488064/83683744 [=>............................] - ETA: 3:05

KeyboardInterrupt: 

## 模型Inception ResnetV2
预训练模型Inception ResnetV2

In [None]:
# 构建Inception ResnetV2
def buid_inception_resnet_v2():
    """Build and compile a binary classifier on a frozen InceptionResNetV2 base.

    The network takes 299x299x3 images, applies the InceptionResNetV2
    ``preprocess_input`` inside the graph, runs the InceptionResNetV2 base
    without its top layers, and ends in global average pooling, dropout (0.25)
    and a single sigmoid unit. All base-model layers are marked non-trainable.

    Returns:
        The compiled Keras model (adadelta, binary cross-entropy, accuracy).
    """
    inception_resnet = keras.applications.inception_resnet_v2
    layers = keras.layers

    # Base network without its fully connected top; preprocessing is part of the graph.
    raw_images = keras.Input(shape=(299, 299, 3))
    scaled_images = layers.Lambda(inception_resnet.preprocess_input)(raw_images)
    backbone = inception_resnet.InceptionResNetV2(input_tensor=scaled_images, include_top=False)

    # Freeze the base so that only the new head is trained.
    for frozen in backbone.layers:
        frozen.trainable = False

    # New head: global average pooling, dropout against over-fitting, one sigmoid unit.
    head = layers.GlobalAveragePooling2D()(backbone.output)
    head = layers.Dropout(0.25)(head)
    head = layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal')(head)

    model = keras.models.Model(inputs=backbone.input, outputs=head)
    model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

    print('Inception ResnetV2 has %d layers.' % len(model.layers))
    return model

# Train. Build the Inception-ResNet-V2 model here; the InceptionV3 builder
# would silently train a second InceptionV3 under this name.
inception_resnet_v2_obj = buid_inception_resnet_v2()
inception_resnet_v2_obj.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_valid, y_valid))

# Predict on the test images. The test array loaded by this notebook is
# `test_data`; no `x_test` is ever defined.
inception_resnet_v2_predict = inception_resnet_v2_obj.predict(test_data)
# Keep probabilities away from 0 and 1 (presumably to bound the log-loss
# penalty of a confident wrong answer), then flatten (N, 1) to (N,).
inception_resnet_v2_predict = inception_resnet_v2_predict.clip(min=0.005, max=0.995)
inception_resnet_v2_predict = inception_resnet_v2_predict.flatten(order = 'F')

# Save the result
# NOTE(review): ids are assigned by position, so `test_data` must be ordered by
# the numeric id of the test files - confirm the loader's ordering.
submission = pd.DataFrame(data = {'id':(np.arange(len(test_data))+1), 'label': inception_resnet_v2_predict})
submission.to_csv('inception_resnet_v2_submission.csv',index=False)

# Visualise the model
keras.utils.plot_model(inception_resnet_v2_obj, to_file='model_inception_resnet_v2.png')

## 提取特征，融合模型
将多个模型的特征向量融合训练