In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
from keras.applications.inception_v3 import InceptionV3  # 20190814_2_Add_InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.applications.xception import Xception  # 20190814_1_Add_Xception
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator, load_img
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
%matplotlib inline

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): only NumPy is seeded here; the backend's own RNG (weight
# initialisation, dropout) is presumably left unseeded -- confirm if exact
# run-to-run reproducibility is required.
np.random.seed(101)

Using TensorFlow backend.


In [2]:
# Load the cats & dogs dataset.
#    --> Train path: .\ml-marathon-final\data\train_data
#    --> Test path: .\ml-marathon-final\data\test_data

TRAIN_DATA_PATH = "./ml-marathon-final/data/train_data"
TEST_DATA_PATH = "./ml-marathon-final/data/test_data"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting fixes the row order of `df`, so the seeded train/validation split
# derived from it is reproducible across machines.
train_filenames = sorted(os.listdir(TRAIN_DATA_PATH))
test_filenames = sorted(os.listdir(TEST_DATA_PATH))

# The class is the filename prefix before the first '.', e.g. "cat.10001.jpg".
# dog -> 1; every other prefix is treated as cat -> 0.
labels = [1 if filename.split('.')[0] == 'dog' else 0 for filename in train_filenames]

df = pd.DataFrame({
    'filename': train_filenames,
    'label': labels
})

df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
filename    4000 non-null object
label       4000 non-null int64
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


Unnamed: 0,filename,label
0,cat.10001.jpg,0
1,cat.10002.jpg,0
2,cat.10005.jpg,0
3,cat.10008.jpg,0
4,cat.10024.jpg,0


In [3]:
# Hold out 20% of the labelled images for validation (80% train / 20% validation).
# The numeric labels are first mapped to class-name strings.
df["label"] = df["label"].replace({0: 'cat', 1: 'dog'})

# Split with a fixed seed, then renumber each part from 0.
train_df, validate_df = (
    subset.reset_index(drop=True)
    for subset in train_test_split(df, test_size=0.20, random_state=42)
)

total_train = len(train_df)
total_validate = len(validate_df)

# Report the size and class balance of both parts.
print("train_df.shape", train_df.shape)
print("validate_df.shape", validate_df.shape)
print()
print("train_df['label'].value_counts()\n", train_df['label'].value_counts(), end="\n\n")
print("validate_df['label'].value_counts()\n", validate_df['label'].value_counts())

train_df.shape (3200, 2)
validate_df.shape (800, 2)

train_df['label'].value_counts()
 dog    1622
cat    1578
Name: label, dtype: int64

validate_df['label'].value_counts()
 cat    422
dog    378
Name: label, dtype: int64


In [4]:
# 20190814_2_Add_InceptionV3: input size raised from 224x224 to 299x299
# (299x299 is the default input size of the Keras InceptionV3 application).
IMAGE_WIDTH = IMAGE_HEIGHT = 299
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)
IMAGE_CHANNELS = 3

batch_size = 8
num_classes = 2  # number of classes: cat and dog
epochs = 30      # number of training epochs

In [5]:
# Training-time augmentation. `rescale` multiplies every pixel value by 1/255;
# the remaining arguments configure random rotation, shift, shear, zoom and
# horizontal flip.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
)

# 20190814_2_restore_final_exam_2_config ==>
"""
    rotation_range=40,
    rescale=1./255,
    shear_range=0.2,               # 20190813_3_Modify_from_0.1_to_0.2
    zoom_range=0.2,
    horizontal_flip=True,
    width_shift_range=0.2,         # 20190813_3_Modify_from_0.1_to_0.2
    height_shift_range=0.2,        # 20190813_3_Modify_from_0.1_to_0.2
    channel_shift_range=10,        # 20190813_3_Add
    brightness_range=(1.1, 1.2)    # 20190813_3_Add
"""
# 20190814_2_restore_final_exam_2_config <==

# Stream augmented training batches from disk: filenames come from the
# 'filename' column (relative to TRAIN_DATA_PATH), classes from 'label'.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DATA_PATH,
    x_col='filename',
    y_col='label',
    target_size=IMAGE_SIZE,
    class_mode='categorical',
    batch_size=batch_size,
)

Found 3200 validated image filenames belonging to 2 classes.


In [6]:
# Validation images are only rescaled (no augmentation) and are read from the
# same directory as the training images, selected by validate_df.
validation_datagen = ImageDataGenerator(rescale=1./255)
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validate_df,
    directory=TRAIN_DATA_PATH,
    x_col='filename',
    y_col='label',
    target_size=IMAGE_SIZE,
    class_mode='categorical',
    batch_size=batch_size,
)

Found 800 validated image filenames belonging to 2 classes.


In [7]:
# 20190813_3_Add_callbacks ==>
# Both callbacks watch the 'val_acc' log entry.
# NOTE(review): newer Keras releases log this metric as 'val_accuracy' --
# confirm the key name when upgrading.

# Stop after 10 epochs without improvement and restore the best weights
# (restore_best_weights: 20190813_From_False_to_True).
earlystop = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement.
reduced_lr = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.2,
    patience=5,
    verbose=1,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=1e-12,
)

callbacks = [earlystop, reduced_lr]
# 20190813_3_Add_callbacks <==

In [8]:
# 20190814_2_Add_InceptionV3  ==>
"""
# 以訓練好的 ResNet50 為基礎來建立模型，
# 捨棄 ResNet50 頂層的 fully connected layers
resnet50 = ResNet50(include_top=False, 
                    weights='imagenet', 
                    input_tensor=None,
                    input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS))
                    
x = resnet50.output

"""
# Pre-trained InceptionV3 convolutional base (ImageNet weights) without its
# fully connected top. `classes` is intentionally not passed: Keras only
# honours it when include_top=True and no pre-trained weights are loaded, so
# the former `classes=2` had no effect on this model.
inception_v3 = InceptionV3(include_top=False,
                           weights='imagenet',
                           input_tensor=None,
                           input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS),
                           pooling=None)

x = inception_v3.output
# 20190814_2_Add_InceptionV3 <==

# No pooling is applied to the base output (pooling=None), so flatten it
# before the classifier head.
x = Flatten()(x)

# Add a Dropout layer.
x = Dropout(0.5)(x)

# Add a Dense layer whose softmax produces one probability per class.
output_layer = Dense(num_classes, activation='softmax', name='softmax')(x)

# 20190814_1_Add_Xception model = Model(inputs=resnet50.input, outputs=output_layer)
model = Model(inputs=inception_v3.input, outputs=output_layer)   # 20190814_2_Add_InceptionV3
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=1e-5, amsgrad=True),  # 20190815_1_add_amsgrad
              metrics=['accuracy'])

# Train on the augmented generator and validate every epoch. Integer division
# gives the per-epoch batch counts (a trailing partial batch is not counted).
history = model.fit_generator(train_generator,
                              epochs=epochs,
                              validation_data=validation_generator,
                              validation_steps=total_validate//batch_size,
                              steps_per_epoch=total_train//batch_size,
                              verbose=1,
                              callbacks=callbacks)  # 20190813_3_Add_callbacks

W0815 21:14:29.243908 11484 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0815 21:14:29.252576 11484 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0815 21:14:29.257252 11484 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0815 21:14:29.267081 11484 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0815 21:14:29.267081 11484 deprecation_wrapper.py:1

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 149, 149, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 149, 149, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

Epoch 00012: ReduceLROnPlateau reducing learning rate to 1.9999999494757505e-06.
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Restoring model weights from the end of the best epoch

Epoch 00017: ReduceLROnPlateau reducing learning rate to 3.999999989900971e-07.
Epoch 00017: early stopping


### Test generator

In [9]:
# Sort the listing: os.listdir() order is filesystem-dependent, and each file's
# row position in test_df is later written out as the submission ID.
test_filenames = sorted(os.listdir(TEST_DATA_PATH))
test_df = pd.DataFrame({
    'filename': test_filenames
})
test_samples = test_df.shape[0]

# Reference labels used to score the predictions locally.
# NOTE(review): rows are assumed to be in the same order as the sorted test
# filenames -- confirm against how manual_results.csv was produced.
y_test_df = pd.read_csv("./ml-marathon-final/manual_results.csv")

# Fail early with a clear message if the two sources cannot be paired row-wise.
assert len(y_test_df) == test_samples, (
    "manual_results.csv has %d rows but %d test images were found"
    % (len(y_test_df), test_samples)
)

In [10]:
# Test images are only rescaled. With class_mode=None and y_col=None the
# generator is given no labels, and shuffle=False is set so the batches are
# not shuffled.
test_gen = ImageDataGenerator(rescale=1./255)
test_generator = test_gen.flow_from_dataframe(
    dataframe=test_df,
    directory=TEST_DATA_PATH,
    x_col='filename',
    y_col=None,
    target_size=IMAGE_SIZE,
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

Found 400 validated image filenames.


In [11]:
# `steps` is a number of batches and must be an integer; np.ceil() returns a
# float, so cast it. Rounding up makes the final partial batch part of the run.
predict = model.predict_generator(test_generator, steps=int(np.ceil(test_samples / batch_size)))

In [12]:
# Hard class label per image: the index of the largest softmax output.
# presumably 0 = cat and 1 = dog -- confirm via train_generator.class_indices
test_df['label'] = predict.argmax(axis=1)

In [13]:
# ROC AUC of the hard 0/1 predictions against the 'Predicted' column loaded
# from manual_results.csv. roc_auc_score is imported with the other
# dependencies in the first cell rather than mid-notebook.
auc = roc_auc_score(y_test_df['Predicted'], test_df['label'])

auc

0.9899989998999901

In [14]:
"""

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
PLEASE MODIFY BELOW FILENAME EVERYTIME BUILD
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

"""


# Submission file: every column except 'filename' is kept, written under the
# header "Predicted", with the DataFrame index as the "ID" column.
output = test_df.drop(columns=['filename'])
output.to_csv('Final_Exam_11.csv', header=["Predicted"], index_label='ID')

In [15]:
"""

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
PLEASE MODIFY BELOW FILENAME EVERYTIME BUILD
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

"""

# Save the raw model outputs (one column per output index) for later reuse.
predproba_df = pd.DataFrame(data=predict)
predproba_df.to_csv(path_or_buf='Predicted_proba_Inception_2.csv')

predproba_df

Unnamed: 0,0,1
0,0.999427,5.731941e-04
1,0.975272,2.472769e-02
2,0.999754,2.462864e-04
3,0.000095,9.999053e-01
4,0.999870,1.299892e-04
5,0.998467,1.532534e-03
6,0.000099,9.999012e-01
7,0.999228,7.717590e-04
8,0.999977,2.330353e-05
9,0.999599,4.007075e-04


### Wrongly predicted image numbers

Image 205 is neither a dog nor a cat.

Final_Exam_2: 48, 140, 200, 241, 255

# 結果紀錄

### [Final_Exam_1]
   1. ResNet50 model
   2. AUC score: 
        (1) Test: 0.9850485048504851
        (2) Kaggle public: 0.98000
        
### [Final_Exam_2]
   1. ResNet50 model
   2. Modified items:
        (1) "rotation_range" from 15 to 40.
        (2) "epochs" from 10 to 30.
   3. AUC score: 
        (1) Test: 0.9874737473747375
        (2) Kaggle public: 0.98250
        
### [Final_Exam_3]
   1. ResNet50 model
   2. Training data use 4000 instead of 3200(80%).
   3. Add EarlyStopping, ReduceLROnPlateau callbacks.
   4. AUC score: 
        (1) Test: 0.9875737573757375
        (2) Kaggle public: 0.98250
        
### [Final_Exam_4]  Predicted_proba_Xception_1.csv
   1. Xception model
   2. Based on Final_Exam_2 to add callbacks.
   3. Modified items:
        (1) "rotation_range" 40.
        (2) "shear_range" 0.2.
        (3) "width_shift_range" 0.2
        (4) "height_shift_range" 0.2.
        (5) add "channel_shift_range=10"
        (6) add "brightness_range=(1.1, 1.2)"
   4. AUC score: 
        (1) Test: 0.9925242524252424
        (2) Kaggle public: 0.98750
        
### [Final_Exam_5]  Predicted_proba_Inception_1.csv
   1. Inception V3 model
   2. Based on Final_Exam_2.
   3. AUC score: 
        (1) Test: 0.9899989998999901
        (2) Kaggle public: N/A
        
### [Final_Exam_6]  Predicted_proba_Xception_2.csv
   1. Xception model
   2. Based on Final_Exam_4.
   3. Restore Final_Exam_4 items:
        (1) "shear_range" 0.1.
        (2) "width_shift_range" 0.1.
        (3) "height_shift_range" 0.1.
        (4) remove "channel_shift_range=10"
        (5) remove "brightness_range=(1.1, 1.2)"
   4. AUC score: 
        (1) Test: 0.994949494949495
        (2) Kaggle public: 0.99000
        
### [Final_Exam_7]   Predicted_proba_Xception_3.csv
   1. Xception model
   2. Based on Final_Exam_6.
   3. Modified items:
        (1) set "amsgrad=True".
   4. AUC score: 
        (1) Test: 0.9949994999499949
        (2) Kaggle public: 
        
### [Final_Exam_8]  Predicted_proba_Xception_4.csv  /  Final_Exam_8_proba.csv
   1. Xception model
   2. Based on Final_Exam_7.
   3. Modified items:
        (1) reduced_lr(factor=0.3)
   4. AUC score: 
        (1) Test: 0.994949494949495
        (2) Test probability: 0.9997999799979997
        (3) Kaggle public: 0.99692
 
### [Final_Exam_9]  Predicted_proba_Xception_5.csv  /  Final_Exam_9_proba.csv
   1. Xception model
   2. Based on Final_Exam_8.
   3. Modified items:
        (1) reduced_lr(factor=0.2)
   4. AUC score: 
        (1) Test: 0.9925242524252424
        (2) Test probability: 0.9998249824982498
        (3) Kaggle public: 0.99740
        
### [Final_Exam_10]  Predicted_proba_NASNetLarge_1.csv  /  Final_Exam_10_proba.csv
   1. NASNetLarge model
   2. Based on Final_Exam_9.
   3. AUC score: 
        (1) Test: 0.9875737573757375
        (2) Test probability: 0.9991499149914991
        (3) Kaggle public: 0.99667
        
### [Final_Exam_11]  Predicted_proba_Inception_2.csv  /  Final_Exam_11_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_5.
   3. Modified items:
        (1) set "amsgrad=True".
   4. AUC score: 
        (1) Test: 0.9899989998999901
        (2) Test probability: 0.9997999799979997
        (3) Kaggle public: 0.99820

In [16]:
# ROC AUC computed from column 1 of the model output (a continuous score)
# rather than from the hard argmax labels.
proba_auc = roc_auc_score(y_true=y_test_df['Predicted'], y_score=predict[:, 1])
proba_auc

0.9997999799979997

In [17]:
"""

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
PLEASE MODIFY BELOW FILENAME EVERYTIME BUILD
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

"""

# Probability submission: same layout as the hard-label file, but the 'label'
# column holds column 1 of the model output instead of the argmax class.
proba_output = (
    test_df
    .drop(columns=['filename'])
    .assign(label=predict[:, 1])
)
proba_output.to_csv('Final_Exam_11_proba.csv', header=["Predicted"], index_label='ID')