In [1]:
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from sklearn.model_selection import train_test_split
from keras.applications.resnet50 import ResNet50
from keras.applications.xception import Xception  # 20190814_1_Add_Xception
from keras.applications.inception_v3 import InceptionV3  # 20190814_2_Add_InceptionV3

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL
%matplotlib inline

# Fix NumPy's global random state so that NumPy-driven randomness in the
# cells below starts from the same state on every fresh run.
SEED = 101
np.random.seed(SEED)

Using TensorFlow backend.


In [2]:
# Load the cats & dogs data set.
#    --> Train path: ./ml-marathon-final/data/train_data
#    --> Test path:  ./ml-marathon-final/data/test_data

TRAIN_DATA_PATH = "./ml-marathon-final/data/train_data"
TEST_DATA_PATH = "./ml-marathon-final/data/test_data"

# os.listdir() returns entries in arbitrary order. Sorting pins the row order
# of the DataFrame, which the seeded train/validation split depends on.
train_filenames = sorted(os.listdir(TRAIN_DATA_PATH))
test_filenames = sorted(os.listdir(TEST_DATA_PATH))

# The class is the filename prefix before the first dot (e.g. "cat.10001.jpg").
# dog -> 1; every other prefix is treated as cat -> 0.
labels = [
    1 if filename.split('.')[0] == 'dog' else 0
    for filename in train_filenames
]

df = pd.DataFrame({
    'filename': train_filenames,
    'label': labels
})

df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
filename    4000 non-null object
label       4000 non-null int64
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


Unnamed: 0,filename,label
0,cat.10001.jpg,0
1,cat.10002.jpg,0
2,cat.10005.jpg,0
3,cat.10008.jpg,0
4,cat.10024.jpg,0


In [3]:
# Hold out 20% of the labelled images for validation (80% train / 20% validation).
# The integer labels are turned into class-name strings before splitting.
df["label"] = df["label"].replace({0: 'cat', 1: 'dog'})

train_df, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.20, random_state=42)
)

total_train = train_df.shape[0]
total_validate = validate_df.shape[0]

# Report the size of each split first, then the class balance of each split.
split_frames = {"train_df": train_df, "validate_df": validate_df}
for split_name, split_frame in split_frames.items():
    print(split_name + ".shape", split_frame.shape)
for split_name, split_frame in split_frames.items():
    print()
    print(split_name + "['label'].value_counts()\n", split_frame['label'].value_counts())

train_df.shape (3200, 2)
validate_df.shape (800, 2)

train_df['label'].value_counts()
 dog    1622
cat    1578
Name: label, dtype: int64

validate_df['label'].value_counts()
 cat    422
dog    378
Name: label, dtype: int64


In [4]:
# Input geometry fed to the network. 299x299 is used for the InceptionV3 runs;
# the value before change 20190814_2_Add_InceptionV3 was 224x224.
IMAGE_WIDTH = IMAGE_HEIGHT = 299
IMAGE_CHANNELS = 3
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)

# Training hyper-parameters:
#   batch_size  - images per batch (64 runs out of memory)
#   num_classes - number of categories; cat & dog give 2 classes
#   epochs      - number of training epochs
batch_size, num_classes, epochs = 8, 2, 150

In [5]:
# On-the-fly augmentation for the training images. Pixel values are scaled by 1/255.
# rotation_range is 40 in this run (same values as the Final_Exam_2 configuration);
# the "40 -> 70" change tag from 20190816 is not active here.
augmentation_options = dict(
    rescale=1./255,
    rotation_range=40,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Stream batches from disk: filenames come from train_df, labels from its 'label' column.
train_flow_options = dict(
    x_col='filename',
    y_col='label',
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
)
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    TRAIN_DATA_PATH,
    **train_flow_options
)

Found 3200 validated image filenames belonging to 2 classes.


In [6]:
# Validation images are only rescaled; no augmentation is applied to them.
# They live in the same directory as the training images (TRAIN_DATA_PATH).
validation_datagen = ImageDataGenerator(rescale=1./255)

validation_flow_options = dict(
    x_col='filename',
    y_col='label',
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
)
validation_generator = validation_datagen.flow_from_dataframe(
    validate_df,
    TRAIN_DATA_PATH,
    **validation_flow_options
)

Found 800 validated image filenames belonging to 2 classes.


In [7]:
# Training callbacks (introduced with change 20190813_3_Add_callbacks).
#
# Only ReduceLROnPlateau is active in this run; EarlyStopping is not used.
# The monitored quantity is 'val_acc' (the "From_val_acc_to_val_loss" change tag
# does not describe the value in use here). The learning rate is multiplied by
# `factor` after `patience` epochs without improvement, down to `min_lr`.
plateau_options = dict(
    monitor='val_acc',
    mode='auto',
    factor=0.5,
    patience=5,
    min_delta=0.0001,
    cooldown=0,
    min_lr=1e-12,
    verbose=1,
)
reduced_lr = ReduceLROnPlateau(**plateau_options)

callbacks = [reduced_lr]
# 20190813_3_Add_callbacks <==

In [8]:
# Build the classifier on top of a pre-trained InceptionV3 (ImageNet weights),
# discarding its fully connected top layers. Earlier runs used ResNet50 the same way.
# NOTE(review): `classes=2` looks like it has no effect when include_top=False — confirm.
inception_v3 = InceptionV3(
    weights='imagenet',
    include_top=False,
    input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS),
    input_tensor=None,
    pooling=None,
    classes=2,
)

# Classification head: flatten the backbone output, apply dropout, then a
# softmax Dense layer that yields one probability per class.
x = inception_v3.output
for head_layer in (Flatten(), Dropout(0.5)):
    x = head_layer(x)
output_layer = Dense(num_classes, activation='softmax', name='softmax')(x)

model = Model(inputs=inception_v3.input, outputs=output_layer)
model.summary()

# Adam with a small learning rate and AMSGrad enabled; the remaining
# arguments are spelled out explicitly (change 20190816_4_Adam_variable).
optimizer = Adam(
    lr=1e-5,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0,
    amsgrad=True,
)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# One epoch = one pass over the full batches of each split (integer division
# drops any trailing partial batch from the step count).
history = model.fit_generator(
    train_generator,
    steps_per_epoch=total_train//batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=total_validate//batch_size,
    callbacks=callbacks,
    verbose=1,
)

W0817 18:37:33.448040 11612 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0817 18:37:33.459041 11612 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0817 18:37:33.460045 11612 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0817 18:37:33.471006 11612 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0817 18:37:33.471006 11612 deprecation_wrapper.py:1

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 149, 149, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 149, 149, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150

Epoch 00011: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-06.
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150

Epoch 00016: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-06.
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150

Epoch 00021: ReduceLROnPlateau reducing learning rate to 1.249999968422344e-06.
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150

Epoch 00026: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-07.
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150

Epoch 00031: ReduceLROnPlateau reducing learning rate to 3.12499992105586e-07.
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150

Epoch 00036: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-07.
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 

Epoch 56/150

Epoch 00056: ReduceLROnPlateau reducing learning rate to 9.765624753299562e-09.
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150

Epoch 00061: ReduceLROnPlateau reducing learning rate to 4.882812376649781e-09.
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150

Epoch 00066: ReduceLROnPlateau reducing learning rate to 2.4414061883248905e-09.
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150

Epoch 00071: ReduceLROnPlateau reducing learning rate to 1.2207030941624453e-09.
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150

Epoch 00076: ReduceLROnPlateau reducing learning rate to 6.103515470812226e-10.
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150

Epoch 00085: ReduceLROnPlateau reducing learning rate to 3.051757735406113e-10.
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150

Epoch 00090: ReduceLROnPlateau reducing learning rate to

Epoch 109/150
Epoch 110/150

Epoch 00110: ReduceLROnPlateau reducing learning rate to 9.536742923144104e-12.
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150

Epoch 00115: ReduceLROnPlateau reducing learning rate to 4.768371461572052e-12.
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150

Epoch 00120: ReduceLROnPlateau reducing learning rate to 2.384185730786026e-12.
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150

Epoch 00125: ReduceLROnPlateau reducing learning rate to 1.192092865393013e-12.
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150

Epoch 00130: ReduceLROnPlateau reducing learning rate to 1e-12.
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


### Test generator

In [9]:
# List the test images in sorted order. os.listdir() returns entries in
# arbitrary order, and the positional row index of test_df is what ends up as
# the submission ID and what is compared row-by-row against y_test_df, so the
# order must be deterministic.
test_filenames = sorted(os.listdir(TEST_DATA_PATH))
test_df = pd.DataFrame({
    'filename': test_filenames
})
test_samples = test_df.shape[0]

# Manually labelled ground truth for the test images.
# NOTE(review): assumes its rows follow the same sorted filename order — confirm.
y_test_df = pd.read_csv("./ml-marathon-final/manual_results.csv")

In [10]:
# Test images are only rescaled. No labels are supplied (class_mode=None) and
# shuffling is disabled so predictions stay aligned with the rows of test_df.
test_gen = ImageDataGenerator(rescale=1./255)

test_flow_options = dict(
    x_col='filename',
    y_col=None,
    class_mode=None,
    shuffle=False,
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
)
test_generator = test_gen.flow_from_dataframe(
    test_df,
    TEST_DATA_PATH,
    **test_flow_options
)

Found 400 validated image filenames.


In [11]:
# Predict class probabilities for every test image. The step count is the
# number of batches needed to cover all samples (rounded up so a trailing
# partial batch is included); np.ceil returns a float, so cast it to int
# because `steps` is a batch count.
predict = model.predict_generator(
    test_generator,
    steps=int(np.ceil(test_samples / batch_size)),
)

In [12]:
# Hard class decision per image: index of the larger of the two softmax outputs.
test_df['label'] = predict.argmax(axis=1)

In [13]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the hard 0/1 predictions against the manually labelled test set
# (reported as "Test" in the results log).
auc = roc_auc_score(y_true=y_test_df['Predicted'], y_score=test_df['label'])

auc

0.9925742574257426

In [14]:
# !!! CHANGE THE OUTPUT FILENAME BELOW FOR EVERY NEW RUN !!!
# Writing to an existing name replaces the previous submission file.

# Submission file: the row index becomes the "ID" column and the hard class
# label becomes the "Predicted" column.
output = test_df.drop('filename', axis=1)
output.to_csv('Final_Exam_18.csv', header=["Predicted"], index_label='ID')

In [15]:
# !!! CHANGE THE OUTPUT FILENAME BELOW FOR EVERY NEW RUN !!!
# Writing to an existing name replaces the previous probability file.

# Keep the raw per-class probabilities of this run (one column per softmax
# output; presumably column 0 is "cat" and column 1 is "dog" — confirm against
# the generator's class indices).
predproba_df = pd.DataFrame(data=predict)
predproba_df.to_csv(path_or_buf='Predicted_proba_Inception_7.csv')

predproba_df.head(n=5)

Unnamed: 0,0,1
0,0.999611,0.000389
1,0.899892,0.100108
2,0.999539,0.000461
3,9e-06,0.999991
4,0.999989,1.1e-05


### Misclassified test image numbers

Image 205 is neither a dog nor a cat.

Final_Exam_2: 48, 140, 200, 241, 255

# 結果紀錄

### [Final_Exam_1]
   1. ResNet50 model
   2. AUC score: 
        (1) Test: 0.9850485048504851
        (2) Kaggle public: 0.98000
        
### [Final_Exam_2]
   1. ResNet50 model
   2. Modified items:
        (1) "rotation_range" from 15 to 40.
        (2) "epochs" from 10 to 30.
   3. AUC score: 
        (1) Test: 0.9874737473747375
        (2) Kaggle public: 0.98250
        
### [Final_Exam_3]  Predicted_proba_ResNet50_1.csv
   1. ResNet50 model
   2. Training data use 4000 instead of 3200(80%).
   3. Add EarlyStopping, ReduceLROnPlateau callbacks.
   4. AUC score: 
        (1) Test: 0.9875737573757375
        (2) Kaggle public: 0.98250
        
### [Final_Exam_4]  Predicted_proba_Xception_1.csv
   1. Xception model
   2. Based on Final_Exam_2 to add callbacks.
   3. Modified items:
        (1) "rotation_range" 40.
        (2) "shear_range" 0.2.
        (3) "width_shift_range" 0.2
        (4) "height_shift_range" 0.2.
        (5) add "channel_shift_range=10"
        (6) add "brightness_range=(1.1, 1.2)"
   4. AUC score: 
        (1) Test: 0.9925242524252424
        (2) Kaggle public: 0.98750
        
### [Final_Exam_5]  Predicted_proba_Inception_1.csv
   1. Inception V3 model
   2. Based on Final_Exam_2.
   3. AUC score: 
        (1) Test: 0.9899989998999901
        (2) Kaggle public: N/A
        
### [Final_Exam_6]  Predicted_proba_Xception_2.csv
   1. Xception model
   2. Based on Final_Exam_4.
   3. Reverted the following Final_Exam_4 changes:
        (1) "shear_range" 0.1.
        (2) "width_shift_range" 0.1.
        (3) "height_shift_range" 0.1.
        (4) remove "channel_shift_range=10"
        (5) remove "brightness_range=(1.1, 1.2)"
   4. AUC score:
        (1) Test: 0.994949494949495
        (2) Kaggle public: 0.99000
        
### [Final_Exam_7]   Predicted_proba_Xception_3.csv
   1. Xception model
   2. Based on Final_Exam_6.
   3. Modified items:
        (1) set "amsgrad=True".
   4. AUC score:
        (1) Test: 0.9949994999499949
        (2) Kaggle public: (not recorded)
        
### [Final_Exam_8]  Predicted_proba_Xception_4.csv  /  Final_Exam_8_proba.csv
   1. Xception model
   2. Based on Final_Exam_7.
   3. Modified items:
        (1) reduced_lr(factor=0.3)
   4. AUC score:
        (1) Test: 0.994949494949495
        (2) Test probability: 0.9997999799979997
        (3) Kaggle public: 0.99692
 
### [Final_Exam_9]  Predicted_proba_Xception_5.csv  /  Final_Exam_9_proba.csv
   1. Xception model
   2. Based on Final_Exam_8.
   3. Modified items:
        (1) reduced_lr(factor=0.2)
   4. AUC score:
        (1) Test: 0.9925242524252424
        (2) Test probability: 0.9998249824982498
        (3) Kaggle public: 0.99740
        
### [Final_Exam_10]  Predicted_proba_NASNetLarge_1.csv  /  Final_Exam_10_proba.csv
   1. NASNetLarge model
   2. Based on Final_Exam_9.
   3. AUC score: 
        (1) Test: 0.9875737573757375
        (2) Test probability: 0.9991499149914991
        (3) Kaggle public: 0.99667
        
### [Final_Exam_11]  Predicted_proba_Inception_2.csv  /  Final_Exam_11_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_5.
   3. Modified items:
        (1) set "amsgrad=True".
   4. AUC score:
        (1) Test: 0.9899989998999901
        (2) Test probability: 0.9997999799979997
        (3) Kaggle public: 0.99820
        
### [Final_Exam_12]  Predicted_proba_InceptionResNetV2_1.csv  /  Final_Exam_12_proba.csv
   1. InceptionResNetV2 model
   2. Based on Final_Exam_11.
   3. AUC score:
        (1) Test: 0.9577207720772076
        (2) Test probability: 0.9893489348934894
        (3) Kaggle public: NA
        
### [Final_Exam_13]  Predicted_proba_Inception_3.csv  /  Final_Exam_13_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_11.
   3. Modified items:
        (1) "rotation_range" from 40 to 70.
        (2) Add channel_shift_range=10.
   4. AUC score: 
        (1) Test: 0.9925742574257426
        (2) Test probability: 0.9996749674967497
        (3) Kaggle public: NA
        
### [Final_Exam_14]  Predicted_proba_VGG16_1.csv  /  Final_Exam_14_proba.csv
   1. VGG16 model
   2. Configurations based on Final_Exam_11.
   3. AUC score:
        (1) Test: 0.9825732573257325
        (2) Test probability: 0.9990749074907491
        (3) Kaggle public: NA
        
### [Final_Exam_15]  Predicted_proba_Inception_4.csv  /  Final_Exam_15_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_11.
   3. Modified items:
        (1) batch_size from 8 to 16.
   4. AUC score: 
        (1) Test: 0.9875737573757375
        (2) Test probability: 0.9995749574957495
        (3) Kaggle public: NA
        
### [Final_Exam_16]  Predicted_proba_Inception_5.csv  /  Final_Exam_16_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_11.
   3. Modified items:
        (1) batch_size from 8 to 32.
   4. AUC score: 
        (1) Test: 0.99004900490049
        (2) Test probability: 0.9996249624962497
        (3) Kaggle public: NA
        
### [Final_Exam_17]  Predicted_proba_Inception_6.csv  /  Final_Exam_17_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_11.
   3. Modified items:
        (1) Adam: epsilon=1e-9.
        (2) patience from 5 to 3.
   4. AUC score: 
        (1) Test: 0.99004900490049
        (2) Test probability: 0.9998249824982498
        (3) Kaggle public: 0.99792
        
### [Final_Exam_18]  Predicted_proba_Inception_7.csv  /  Final_Exam_18_proba.csv
   1. Inception V3 model
   2. Based on Final_Exam_11.
   3. Modified items:
        (1) epochs = 150
        (2) Remove EarlyStopping
        (3) ReduceLROnPlateau patience=5, factor=0.5
   4. AUC score: 
        (1) Test: 0.9925742574257426
        (2) Test probability: 0.9998249824982498
        (3) Kaggle public: 0.99750

In [16]:
# ROC AUC computed from the predicted probability in column 1 instead of the
# hard 0/1 labels (reported as "Test probability" in the results log).
proba_auc = roc_auc_score(y_true=y_test_df['Predicted'], y_score=predict[:, 1])
proba_auc

0.9998249824982498

In [17]:
# !!! CHANGE THE OUTPUT FILENAME BELOW FOR EVERY NEW RUN !!!
# Writing to an existing name replaces the previous submission file.

# Probability submission: same layout as the hard-label submission, but the
# "Predicted" column holds the probability from column 1 of `predict`.
proba_output = (
    test_df
    .drop('filename', axis=1)
    .assign(label=predict[:, 1])
)
proba_output.to_csv('Final_Exam_18_proba.csv', header=["Predicted"], index_label='ID')