In [1]:
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from sklearn.model_selection import train_test_split
from keras.applications.resnet50 import ResNet50

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL
%matplotlib inline

np.random.seed(101)

Using TensorFlow backend.


In [2]:
# 讀取 cats & dogs 資料集
#    --> Train path: .\ml-marathon-final\data\train_data
#    --> Test path: .\ml-marathon-final\data\test_data

TRAIN_DATA_PATH = "./ml-marathon-final/data/train_data"
TEST_DATA_PATH = "./ml-marathon-final/data/test_data"

train_filenames = os.listdir(TRAIN_DATA_PATH)
test_filenames = os.listdir(TEST_DATA_PATH)

labels = []
for filename in train_filenames:
    category = filename.split('.')[0]
    if category == 'dog':
        labels.append(1)   # dog: 1
    else:
        labels.append(0)   # cat: 0
        
df = pd.DataFrame({
    'filename': train_filenames,
    'label': labels
})

df.info()
df.head(5)
#df.tail(5)
#print(dogs_filenames[0])
#image = load_img(DOG_DATA_PATH + "/" + dogs_filenames[0])
#plt.imshow(image)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
filename    4000 non-null object
label       4000 non-null int64
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


Unnamed: 0,filename,label
0,cat.10001.jpg,0
1,cat.10002.jpg,0
2,cat.10005.jpg,0
3,cat.10008.jpg,0
4,cat.10024.jpg,0


In [3]:
# train data 佔 0.8, valid data 佔 0.2
df["label"] = df["label"].replace({0: 'cat', 1: 'dog'}) 

train_df, validate_df = train_test_split(df, test_size=0.20, random_state=42)
train_df = train_df.reset_index(drop=True)
validate_df = validate_df.reset_index(drop=True)

total_train = train_df.shape[0]
total_validate = validate_df.shape[0]

print("train_df.shape", train_df.shape)
print("validate_df.shape", validate_df.shape)
print()
print("train_df['label'].value_counts()\n", train_df['label'].value_counts())
print()
print("validate_df['label'].value_counts()\n", validate_df['label'].value_counts())

train_df.shape (3200, 2)
validate_df.shape (800, 2)

train_df['label'].value_counts()
 dog    1622
cat    1578
Name: label, dtype: int64

validate_df['label'].value_counts()
 cat    422
dog    378
Name: label, dtype: int64


In [4]:
IMAGE_WIDTH=224
IMAGE_HEIGHT=224
IMAGE_SIZE=(IMAGE_WIDTH, IMAGE_HEIGHT)
IMAGE_CHANNELS=3

batch_size = 8
num_classes = 2 # 類別的數量，cat & dog 共有 2 個類別
epochs = 10 # 訓練的 epochs 數量

In [5]:
train_datagen = ImageDataGenerator(
    rotation_range=15,
    rescale=1./255,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1
)

train_generator = train_datagen.flow_from_dataframe(
    train_df, 
    TRAIN_DATA_PATH, 
    x_col='filename',
    y_col='label',
    target_size=IMAGE_SIZE,
    class_mode='categorical',
    batch_size=batch_size
)

Found 3200 validated image filenames belonging to 2 classes.


In [6]:
validation_datagen = ImageDataGenerator(rescale=1./255)
validation_generator = validation_datagen.flow_from_dataframe(
    validate_df, 
    TRAIN_DATA_PATH, 
    x_col='filename',
    y_col='label',
    target_size=IMAGE_SIZE,
    class_mode='categorical',
    batch_size=batch_size
)

Found 800 validated image filenames belonging to 2 classes.


In [7]:
# 以訓練好的 ResNet50 為基礎來建立模型，
# 捨棄 ResNet50 頂層的 fully connected layers
resnet50 = ResNet50(include_top=False, 
                    weights='imagenet', 
                    input_tensor=None,
                    input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS))
x = resnet50.output
x = Flatten()(x)

# 增加 DropOut layer
x = Dropout(0.5)(x)

# 增加 Dense layer，以 softmax 產生個類別的機率值
output_layer = Dense(num_classes, activation='softmax', name='softmax')(x)

model = Model(inputs=resnet50.input, outputs=output_layer)
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=1e-5),
              metrics=['accuracy'])

history = model.fit_generator(train_generator, 
                              epochs=epochs,
                              validation_data=validation_generator,
                              validation_steps=total_validate//batch_size,
                              steps_per_epoch=total_train//batch_size, 
                              verbose=1)

W0812 20:44:38.163216 16480 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0812 20:44:38.171993 16480 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0812 20:44:38.175499 16480 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:4185: The name tf.truncated_normal is deprecated. Please use tf.random.truncated_normal instead.

W0812 20:44:38.188147 16480 deprecation_wrapper.py:119] From D:\Anaconda3\envs\keras37\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0812 20:44:38.188147 16480 deprecation_w

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Test generator

In [45]:
test_filenames = os.listdir(TEST_DATA_PATH)
test_df = pd.DataFrame({
    'filename': test_filenames
})
test_samples = test_df.shape[0]

y_test_df = pd.read_csv("./ml-marathon-final/manual_results.csv")

In [46]:
y_test_df['Predicted']

0      0
1      0
2      0
3      1
4      0
      ..
395    1
396    0
397    0
398    0
399    0
Name: Predicted, Length: 400, dtype: int64

In [47]:
test_gen = ImageDataGenerator(rescale=1./255)
test_generator = test_gen.flow_from_dataframe(
    test_df, 
    TEST_DATA_PATH, 
    x_col='filename',
    y_col=None,
    class_mode=None,
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
    shuffle=False
)

Found 400 validated image filenames.


In [48]:
predict = model.predict_generator(test_generator, steps=np.ceil(test_samples/batch_size))

In [49]:
test_df['label'] = np.argmax(predict, axis=1)

In [50]:
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_test_df['Predicted'], test_df['label'])

auc

0.9850485048504851

In [59]:
output = test_df
output = output.drop('filename', axis=1)
output
output.to_csv('Final_Exam_1.csv', header=["Predicted"], index_label='ID')