In [1]:
import ipynb_importer
import easy_util
from keras.applications.vgg19 import VGG19
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

importing Jupyter notebook from easy_util.ipynb


Using TensorFlow backend.


#### 一些可配置参数

In [9]:
image_size = 64                # 用于训练的图片大小
train_size = 200               # 训练集大小，从原训练集中取前train_size个样本
test_size = 200                # 测试集大小
epochs = 5                     # 训练轮数
learning_rate = 0.001          # 学习率
batch_size = 64                # 训练时每个batch包含的样本数
n_folds = 5                    # 进行几折交叉校验(cross validation)
threshold = 0.2                # 预测值大于threshold被视为包含该标签
loss = 'binary_crossentropy'   # 损失函数，同keras中model.fit的同名参数
metrics = ['accuracy', easy_util.F2_score(threshold=threshold)] # 评估函数，同keras中model.fit的同名参数

datapath = '../dataset/'       # 数据集所在的文件夹路径
weights_path = '../models/'    # 保存模型权重的路径
output_path = '../output/'     # 结果文件的写出路径
output_file_name = 'test.csv'  # 结果文件名字

#### 读入数据集并调整图片大小

In [4]:
x_train, y_train, x_test, labels, df_test = easy_util.get_model_input(datapath=datapath,image_size=image_size, \
                                                                      train_size=train_size, test_size=test_size)

100%|█████████████████████████████████████████████| 200/200 [00:07<00:00, 26.61it/s]
100%|█████████████████████████████████████████████| 200/200 [00:07<00:00, 27.93it/s]


#### 可适当调整参数后运行下面三个cell中的一个，也可构造自己的模型
* model from keras_starter

In [5]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(image_size, image_size, 3)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

* model from https://www.kaggle.com/kelexu/keras-lb-0-913 （注意：该网络结构要求 image_size ≥ 46，否则会出现下方的 "Negative dimension size" 错误）

In [7]:
model = Sequential()
model.add(BatchNormalization(input_shape=(image_size, image_size,3)))
model.add(Conv2D(32, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(128, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(256, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

ValueError: Negative dimension size caused by subtracting 3 from 2 for 'conv2d_10/convolution' (op: 'Conv2D') with input shapes: [?,2,2,256], [3,3,256,256].

* model from https://www.kaggle.com/petrosgk/keras-vgg19-0-93028-private-lb

In [None]:
from keras.applications.vgg19 import VGG19
base_model = VGG19(include_top=False,
                       weights='imagenet',
                       input_shape=(image_size, image_size, 3))
model = Sequential()
# Batchnorm input
model.add(BatchNormalization(input_shape=(image_size, image_size, 3)))
# Base model
model.add(base_model)
# Classifier
model.add(Flatten())
model.add(Dense(17, activation='sigmoid'))

#### 训练模型，得到结果

In [10]:
result = easy_util.run_model(metrics=metrics, model=model, x_train=x_train, nfolds=n_folds, weights_path=weights_path, \
                         y_train=y_train, x_test=x_test, batch_size=batch_size, loss=loss, \
                         labels=labels, learning_rate_list=[learning_rate], epochs_list=[epochs])

Start KFold number 1 from 5
Split train:  160 160
Split valid:  40 40
Train on 160 samples, validate on 40 samples
Epoch 1/5
4s - loss: 0.4005 - acc: 0.8651 - FScore2: 0.5780 - val_loss: 0.2428 - val_acc: 0.9221 - val_FScore2: 0.6911
Epoch 2/5
3s - loss: 0.3683 - acc: 0.8603 - FScore2: 0.6064 - val_loss: 0.3558 - val_acc: 0.9221 - val_FScore2: 0.5970
Epoch 3/5
2s - loss: 0.4226 - acc: 0.8327 - FScore2: 0.5619 - val_loss: 0.2651 - val_acc: 0.9191 - val_FScore2: 0.6422
Epoch 4/5
2s - loss: 0.3608 - acc: 0.8691 - FScore2: 0.5814 - val_loss: 0.2439 - val_acc: 0.9191 - val_FScore2: 0.6504
Start KFold number 2 from 5
Split train:  160 160
Split valid:  40 40
Train on 160 samples, validate on 40 samples
Epoch 1/5
3s - loss: 0.3496 - acc: 0.8669 - FScore2: 0.6066 - val_loss: 0.3005 - val_acc: 0.9147 - val_FScore2: 0.6969
Epoch 2/5
3s - loss: 0.3465 - acc: 0.8787 - FScore2: 0.6069 - val_loss: 0.2701 - val_acc: 0.9132 - val_FScore2: 0.6938
Epoch 3/5
3s - loss: 0.3412 - acc: 0.8625 - FScore2: 0.6

#### 生成可提交文件

In [11]:
easy_util.write_output(result=result, df_test=df_test, output_path=output_path, file_name=output_file_name, threshold=threshold)

100%|████████████████████████████████████████████| 200/200 [00:00<00:00, 311.98it/s]
