## 检测python版本
这里我们使用的python的版本为3.6

In [None]:
import sys
# Display the version string of the interpreter running this kernel.
sys.version

## 数据预处理
为了更好地训练模型，这里对数据进行一定程度的预处理
- 将训练数据集中过大（宽或高大于 `500` 像素）和过小（宽或高小于 `10` 像素）的图片移出训练集（移动到单独的目录）。

In [2]:
import os, shutil
from PIL import Image

# Move images with out-of-range dimensions out of the data set
def pick_bad_pics(path, bad_path, max_side=500, min_side=10):
    """Move images whose width or height is out of range from `path` to `bad_path`.

    An image is rejected when either side is larger than `max_side` or smaller
    than `min_side` pixels. The number of moved images is printed.

    Args:
        path: Directory holding the images to inspect.
        bad_path: Directory that receives the rejected images; created
            (including missing parents) when it does not exist.
        max_side: Largest accepted width/height in pixels.
        min_side: Smallest accepted width/height in pixels.
    """
    # Create the target directory on demand.
    os.makedirs(bad_path, exist_ok=True)

    bad_list = []
    for img_name in os.listdir(path):
        im_path = os.path.join(path, img_name)
        # Sub-directories cannot be opened as images; leave them untouched.
        if not os.path.isfile(im_path):
            continue
        # Read the size inside a context manager so the file handle is closed
        # before the file is moved (an open handle leaks descriptors and can
        # block the move on Windows).
        with Image.open(im_path) as im:
            w, h = im.size
        if w > max_side or h > max_side or w < min_side or h < min_side:
            bad_list.append(img_name)
            shutil.move(im_path, os.path.join(bad_path, img_name))
    print(len(bad_list))

In [3]:
# Move out-of-range training images from data/train into data/train_bad.
pick_bad_pics("data/train", "data/train_bad")

0


## 读取加载数据集，归一化处理
将用于训练的数据集加载到内存，等待处理。主要是转化为 ndarray 类型的数据，方便后续的计算和处理。
- 加载训练集数据。
- 加载测试集数据。
- 输出一个经过正规化的、Numpy array 格式的图像数据。

In [4]:
import glob, cv2
import numpy as np
from tqdm import tqdm

# Load the training set
def load_train_data(image_size):
    """Load and resize all cat/dog training images from data/train.

    Args:
        image_size: Two-element sequence; element 0 is the output height and
            element 1 the output width, matching the layout of the returned
            array.

    Returns:
        A tuple (train_data, train_targ): a uint8 array of shape
        (N, image_size[0], image_size[1], 3) and a label array holding 0 for
        every cat image followed by 1 for every dog image.

    Raises:
        IOError: If an image file cannot be decoded.
    """
    cat = glob.glob("data/train/cat.*.jpg")
    dog = glob.glob("data/train/dog.*.jpg")
    height, width = image_size[0], image_size[1]
    train_data = np.zeros((len(cat) + len(dog), height, width, 3), dtype=np.uint8)
    train_targ = np.array([0]*len(cat) + [1]*len(dog))

    # NOTE(review): cv2.imread yields BGR channel order by default — confirm
    # this is what the downstream preprocessing expects.
    i = 0
    for group in (cat, dog):
        for img_name in tqdm(group):
            img = cv2.imread(img_name)
            if img is None:
                # imread signals failure by returning None instead of raising.
                raise IOError("Cannot read image: %s" % img_name)
            # cv2.resize expects dsize as (width, height), the reverse of the
            # (rows, cols) layout used for the destination array.
            train_data[i] = cv2.resize(img, (width, height))
            i += 1

    return train_data, train_targ

# Load the test set
def load_test_data(image_size):
    """Load and resize all test images from data/test, ordered by numeric file name.

    The file name (without extension) is parsed as a 1-based index, and the
    image is stored at position index-1 of the returned array.

    Args:
        image_size: Two-element sequence; element 0 is the output height and
            element 1 the output width, matching the layout of the returned
            array.

    Returns:
        A uint8 array of shape (N, image_size[0], image_size[1], 3).

    Raises:
        ValueError: If a file name is not an integer.
        IndexError: If a parsed index is outside 1..N.
        IOError: If an image file cannot be decoded.
    """
    test = glob.glob("data/test/*.jpg")
    height, width = image_size[0], image_size[1]
    test_data = np.zeros((len(test), height, width, 3), dtype=np.uint8)

    for img_name in tqdm(test):
        # Use os.path so the parsing also works with Windows path separators.
        stem = os.path.splitext(os.path.basename(img_name))[0]
        index = int(stem)
        # Reject 0 or negative ids, which would silently wrap to the end.
        if not 1 <= index <= len(test):
            raise IndexError("Test image id %d out of range 1..%d: %s"
                             % (index, len(test), img_name))
        img = cv2.imread(img_name)
        if img is None:
            # imread signals failure by returning None instead of raising.
            raise IOError("Cannot read image: %s" % img_name)
        # cv2.resize expects dsize as (width, height), the reverse of the
        # (rows, cols) layout used for the destination array.
        test_data[index-1] = cv2.resize(img, (width, height))

    return test_data

## 244*244数据预处理
 - 用作ResNet50模型的筛选训练

In [5]:
# Load and resize the training images and their labels
train_data, train_targ = load_train_data((244,244))

# Load and resize the test images
test_data = load_test_data((244,244))

# Show the resulting array shapes
print(train_data.shape, train_targ.shape, test_data.shape)

100%|██████████| 12499/12499 [00:37<00:00, 336.65it/s]
100%|██████████| 12499/12499 [00:38<00:00, 326.85it/s]
100%|██████████| 12500/12500 [00:38<00:00, 328.03it/s]

(24998, 244, 244, 3) (24998,) (12500, 244, 244, 3)





## 拆分验证集
对标记数据进行处理，拆分验证集

In [6]:
from sklearn.model_selection import train_test_split

# Split the labelled data into training (80%) and validation (20%) subsets.
# A fixed random_state keeps the split reproducible across kernel restarts,
# and stratifying on the labels keeps the cat/dog ratio equal in both subsets.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_data, train_targ, test_size=0.2, random_state=42, stratify=train_targ)

## 模型ResNet50
预训练模型ResNet50

In [9]:
import keras
import pandas as pd

# Build the ResNet50-based classifier
def buid_resnet50(input_shape=(244, 244, 3), dropout_rate=0.4):
    """Build and compile a binary classifier on top of a frozen ResNet50 base.

    The model applies the ResNet50 `preprocess_input` function inside the
    graph, runs the ResNet50 base without its top layers, and adds global
    average pooling, dropout and a single sigmoid output unit.

    Args:
        input_shape: Shape (height, width, channels) of one input image.
        dropout_rate: Dropout rate applied before the output layer.

    Returns:
        The compiled Keras model (adadelta optimizer, binary cross-entropy
        loss, accuracy metric).
    """
    # Base model without the fully connected top; preprocessing runs as a
    # Lambda layer so callers can feed raw pixel arrays.
    model_input  = keras.Input(shape=input_shape)
    preprocessed = keras.layers.Lambda(keras.applications.resnet50.preprocess_input)(model_input)
    base_model   = keras.applications.resnet50.ResNet50(input_tensor=preprocessed, include_top=False)

    # Freeze every layer of the base model so only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    # New head: global average pooling, dropout against overfitting, and a
    # single sigmoid unit for the binary decision.
    x = keras.layers.GlobalAveragePooling2D()(base_model.output)
    x = keras.layers.Dropout(dropout_rate)(x)
    x = keras.layers.Dense(1, activation='sigmoid')(x)

    # Assemble and compile the full model.
    result = keras.models.Model(inputs=base_model.input, outputs=x)
    result.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

    # Report the layer count and hand the model back.
    print('ResNet50 has %d layers.' % len(result.layers))
    return result

# Instantiate the model
resnet50_obj = buid_resnet50()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  (fname, cnt))
  (fname, cnt))


ResNet50 has 179 layers.


In [None]:
# Train the model on the training split and evaluate on the validation split after each epoch
resnet50_obj.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_valid, y_valid))

Train on 19998 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
# Predict on the test set, clamp the scores to [0.005, 0.995] and flatten
# the result to a 1-D array.
resnet50_predict = (
    resnet50_obj.predict(test_data)
    .clip(min=0.005, max=0.995)
    .flatten(order = 'F')
)

In [None]:
# Write the submission file: 1-based image ids paired with the predicted labels
submission = pd.DataFrame({
    'id': np.arange(1, len(test_data) + 1),
    'label': resnet50_predict,
})
submission.to_csv('submission_resnet50.csv', index=False)
submission.head(20)