# In [10]:
# Load the required libraries (Keras deep learning platform with TensorFlow as the backend)
import numpy as np 
np.random.seed(2018)
import os 
import glob 
import cv2 
import datetime
import pandas as pd 
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.cross_validation import KFold 
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D,MaxPooling2D,ZeroPadding2D
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping 
from keras.utils import np_utils
from sklearn.metrics import log_loss
from keras import  __version__  as keras_version
# Parameters
# ----------
# img_path : str
#     Path of the image file to read and resize to 32x32 pixels.
def rezize_image(img_path):
    """Read an image from disk and resize it to 32x32 pixels.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The image loaded by ``cv2.imread`` resized to 32x32 using bilinear
        interpolation.

    Raises:
        IOError: If ``cv2.imread`` could not read the file.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising,
        # so fail here with a message that names the offending file.
        raise IOError('Could not read image: {}'.format(img_path))
    # The third positional parameter of cv2.resize is ``dst``; the
    # interpolation flag therefore has to be passed by keyword.
    return cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)
#从相应的文件夹名称加载培训示例，其中为每种类型都有一个文件夹
def load_training_samples():
    """Load and resize every training image, one sub-folder per fish type.

    Images are read from ``../input/train/<TYPE>/*.jpg``. The label of an
    image is the index of its folder in the ``folders`` list below.

    Returns:
        A tuple ``(train_input_variables, train_input_variables_id,
        train_label)`` of three parallel lists: the resized images, the
        image file names and the integer folder indices.
    """
    train_input_variables = []
    train_input_variables_id = []
    train_label = []

    print('Start Reading Train Images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for folder_index, fld in enumerate(folders):
        print('Load folder {} (Index: {})'.format(fld, folder_index))
        imgs_path = os.path.join('..', 'input', 'train', fld, '*.jpg')
        for img_file in glob.glob(imgs_path):
            # Keep the three lists aligned: image, file name, label.
            train_input_variables.append(rezize_image(img_file))
            train_input_variables_id.append(os.path.basename(img_file))
            train_label.append(folder_index)
    # Return only once every folder has been scanned.
    return train_input_variables, train_input_variables_id, train_label

#加载测试样本，用于测试模型的训练效果。
def load_testing_samples():
    """Load and resize every test image.

    Images are read from ``../input/test_stg1/*.jpg`` in sorted path order.

    Returns:
        A tuple ``(testing_samples, testing_samples_id)`` of two parallel
        lists: the resized images and their file names.
    """
    # NOTE(review): the folder was spelled 'test_stgl' (letter l); this file
    # shows the same 1 -> l substitution in several other places, so it is
    # corrected to 'test_stg1' here. Confirm against the data layout on disk.
    imgs_path = os.path.join('..', 'input', 'test_stg1', '*.jpg')

    testing_samples = []
    testing_samples_id = []
    for img_file in sorted(glob.glob(imgs_path)):
        testing_samples.append(rezize_image(img_file))
        testing_samples_id.append(os.path.basename(img_file))
    return testing_samples, testing_samples_id
# Format the model predictions as a table and save them to a CSV file
def format_results_for_types(predictions, test_id, info):
    """Write the per-class predictions to ``testOutput_<info>.csv``.

    Args:
        predictions: Rows of 8 values, one per fish type column.
        test_id: Image identifiers, one per prediction row.
        info: Text inserted into the output file name.
    """
    fish_types = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    table = pd.DataFrame(predictions, columns=fish_types)
    # The identifier column is added last, after the class columns.
    table['image'] = pd.Series(test_id, index=table.index)
    output_path = 'testOutput_' + info + '.csv'
    table.to_csv(output_path, index=False)
def load_normalize_training_samples():
    """Load the training set and convert it into arrays for the model.

    Returns:
        A tuple ``(training_samples, training_label, training_samples_id)``:
        the images as a float32 array divided by 255 with axes reordered by
        ``transpose((0, 3, 1, 2))``, the labels one-hot encoded over 8
        classes, and the list of image file names.
    """
    # load_training_samples returns (images, file names, labels) in that
    # order; unpack in the same order so ids and labels are not swapped.
    training_samples, training_samples_id, training_label = load_training_samples()

    # Convert the loaded and resized data to NumPy arrays.
    training_samples = np.array(training_samples, dtype=np.uint8)
    training_label = np.array(training_label, dtype=np.uint8)

    # Move the last axis to position 1 (presumably channels-first, matching
    # the model's (3, 32, 32) input).
    training_samples = training_samples.transpose((0, 3, 1, 2))

    # Scale pixel values into [0, 1] as float32 and one-hot encode labels.
    training_samples = training_samples.astype('float32')
    training_samples = training_samples / 255
    training_label = np_utils.to_categorical(training_label, 8)
    return training_samples, training_label, training_samples_id
#加载和规范化测试样本以适应我们的模型
def load_normalize_testing_samples():
    """Load the test set and convert it into an array for the model.

    Returns:
        A tuple ``(testing_samples, testing_samples_id)``: the images as a
        float32 array divided by 255 with axes reordered by
        ``transpose((0, 3, 1, 2))``, and the list of image file names.
    """
    raw_images, image_ids = load_testing_samples()
    # uint8 array -> reorder axes -> float32, then scale by 1/255.
    images = np.array(raw_images, dtype=np.uint8)
    images = images.transpose((0, 3, 1, 2)).astype('float32')
    return images / 255, image_ids

def merge_several_folds_mean(data, num_folds):
    """Average the first ``num_folds`` prediction arrays element-wise.

    Args:
        data: Sequence of equally shaped array-likes, one per fold.
        num_folds: Number of leading entries of ``data`` to average.

    Returns:
        The element-wise mean as a (nested) Python list.

    Raises:
        ValueError: If ``num_folds`` is smaller than 1.
        IndexError: If ``data`` has fewer than ``num_folds`` entries.
    """
    if num_folds < 1:
        raise ValueError('num_folds must be at least 1')
    # np.array copies, so the caller's first fold is never modified.
    total = np.array(data[0])
    if not np.issubdtype(total.dtype, np.floating):
        # In-place true division is not allowed on integer arrays.
        total = total.astype(np.float64)
    for fold in range(1, num_folds):
        total += np.asarray(data[fold])
    total /= num_folds
    return total.tolist()
# 创建CNN模型体系结构
def create_cnn_model_arch():
    """Build and compile the CNN used to classify the 8 fish types.

    The network is two blocks of ZeroPadding -> Conv -> ZeroPadding -> Conv
    -> MaxPool (32 filters, then 64), followed by Flatten, two
    Dense(32) + Dropout(0.5) stages and an 8-way softmax output.

    Returns:
        The ``Sequential`` model compiled with a Nesterov-momentum SGD
        optimizer and the categorical cross-entropy loss.
    """
    pool_size = 2       # 2x2 pooling windows throughout
    conv_depth_1 = 32   # filters per conv layer in the first block
    conv_depth_2 = 64   # filters per conv layer in the second block
    # NOTE(review): kernel_size was never defined in this file. 3 is the
    # value that pairs with the 1-pixel zero padding below - confirm.
    kernel_size = 3
    drop_prob = 0.5
    hidden_size = 32
    num_classes = 8

    cnn_model = Sequential()

    # Conv [32] -> Conv [32] -> Pool
    cnn_model.add(ZeroPadding2D((1, 1), input_shape=(3, 32, 32),
                                dim_ordering='th'))
    cnn_model.add(Convolution2D(conv_depth_1, kernel_size, kernel_size,
                                activation='relu', dim_ordering='th'))
    cnn_model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
    cnn_model.add(Convolution2D(conv_depth_1, kernel_size, kernel_size,
                                activation='relu', dim_ordering='th'))
    cnn_model.add(MaxPooling2D(pool_size=(pool_size, pool_size),
                               strides=(2, 2), dim_ordering='th'))

    # Conv [64] -> Conv [64] -> Pool
    cnn_model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
    cnn_model.add(Convolution2D(conv_depth_2, kernel_size, kernel_size,
                                activation='relu', dim_ordering='th'))
    cnn_model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
    cnn_model.add(Convolution2D(conv_depth_2, kernel_size, kernel_size,
                                activation='relu', dim_ordering='th'))
    cnn_model.add(MaxPooling2D(pool_size=(pool_size, pool_size),
                               strides=(2, 2), dim_ordering='th'))

    # Flatten to 1D, then fully connected ReLU layers with dropout and a
    # final softmax output layer.
    cnn_model.add(Flatten())
    cnn_model.add(Dense(hidden_size, activation='relu'))
    cnn_model.add(Dropout(drop_prob))
    cnn_model.add(Dense(hidden_size, activation='relu'))
    cnn_model.add(Dropout(drop_prob))
    cnn_model.add(Dense(num_classes, activation='softmax'))

    # Stochastic gradient descent with Nesterov momentum.
    stochastic_gradient_descent = SGD(lr=1e-2, decay=1e-6, momentum=0.9,
                                      nesterov=True)
    cnn_model.compile(optimizer=stochastic_gradient_descent,
                      loss='categorical_crossentropy')
    return cnn_model
#以折叠交叉验证为验证方法的模型 
def create_model_with_kfold_cross_validation(nfolds=10):
    """Train one CNN per fold of a shuffled K-fold split of the training set.

    Args:
        nfolds: Number of cross-validation folds.

    Returns:
        A tuple ``(overall_settings_output_string, trained_models)``: a
        string of the form ``loss_<score>_folds_<nfolds>_ep_<epochs>`` where
        the score is the validation log loss averaged over all samples, and
        the list of trained models, one per fold.
    """
    batch_size = 16    # training examples per gradient update
    num_epochs = 30    # upper bound on epochs per fold
    random_state = 51  # fixes the fold shuffling

    # Load and normalize the training samples before feeding the CNN.
    training_samples, training_samples_target, training_samples_id = \
        load_normalize_training_samples()

    # Train/validation index pairs for `nfolds` shuffled folds.
    kf = KFold(len(training_samples), n_folds=nfolds, shuffle=True,
               random_state=random_state)

    sum_score = 0        # log loss summed over all validation samples
    trained_models = []  # one trained model per fold

    for fold_number, (train_index, test_index) in enumerate(kf, start=1):
        cnn_model = create_cnn_model_arch()
        training_samples_X = training_samples[train_index]
        training_samples_Y = training_samples_target[train_index]
        validation_samples_X = training_samples[test_index]
        validation_samples_Y = training_samples_target[test_index]

        print('Fold number {} from {}'.format(fold_number, nfolds))
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=0),
        ]
        # Fit the CNN with the settings defined above.
        cnn_model.fit(training_samples_X, training_samples_Y,
                      batch_size=batch_size, nb_epoch=num_epochs,
                      shuffle=True, verbose=2,
                      validation_data=(validation_samples_X,
                                       validation_samples_Y),
                      callbacks=callbacks)

        # Score the trained model on the held-out fold.
        predictions_of_validation_samples = cnn_model.predict(
            validation_samples_X.astype('float32'),
            batch_size=batch_size, verbose=2)
        current_model_score = log_loss(validation_samples_Y,
                                       predictions_of_validation_samples)
        print('Current model score log_loss: ', current_model_score)
        # Weight by fold size so the final value is a per-sample average.
        sum_score += current_model_score * len(test_index)

        # Exactly one model is stored per fold.
        trained_models.append(cnn_model)

    # Summarize only after every fold has been trained.
    overall_score = sum_score / len(training_samples)
    print("Log_loss train independent avg: ", overall_score)
    overall_settings_output_string = ('loss_' + str(overall_score) +
                                      '_folds_' + str(nfolds) +
                                      '_ep_' + str(num_epochs))
    return overall_settings_output_string, trained_models
#测试模型的训练效果
def test_generality_crossValidation_over_test_set( overall_settings_output_string, cnn_models):
    """Predict the test set with every model and save the averaged result.

    Args:
        overall_settings_output_string: Text describing the training run;
            it is embedded in the name of the output CSV file.
        cnn_models: Trained models, one per cross-validation fold.
    """
    batch_size = 16
    number_of_folds = len(cnn_models)

    # The test set is the same for every model, so load it once instead of
    # re-reading all images on each iteration.
    testing_samples, testing_samples_id = load_normalize_testing_samples()

    yfull_test = []  # per-model predictions over the whole test set
    for fold_number, model in enumerate(cnn_models, start=1):
        print('Fold number {} out of {}'.format(fold_number, number_of_folds))
        test_prediction = model.predict(testing_samples,
                                        batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    # Average the predictions of all models element-wise.
    test_result = merge_several_folds_mean(yfull_test, number_of_folds)
    overall_settings_output_string = 'loss_' + overall_settings_output_string \
        + '_folds_' + str(number_of_folds)
    format_results_for_types(test_result, testing_samples_id,
                             overall_settings_output_string)
#开始模型培训和测试
# Script entry point: train with K-fold cross-validation, then predict the
# test set. The guard must compare against '__main__' (double underscores),
# otherwise the body never runs.
if __name__ == '__main__':
    info_string, models = create_model_with_kfold_cross_validation()
    test_generality_crossValidation_over_test_set(info_string, models)
