### 开始

In [1]:
# Abort early unless the notebook kernel is running Python 3.
from sys import version_info
if version_info[0] != 3:
    raise Exception('请使用Python3来完成此项目')

### 寻找异常值

#### 构建分类模型

利用在ImageNet上训练过的pre-trained model对训练集进行预测，对比预测结果与真实图片是否一致，如否则属异常值

pre-trained model输出的是ImageNet全部1000个类别的概率，其中狗有118个品种，猫有7个品种。如何判断输出结果是猫还是狗呢？

- 如某张图的预测结果的Top-N中，既没有属于'dogs'的品种，也没有属于'cats'的品种，则认为此图为异常图

In [2]:
# ImageNet class IDs treated as "dog" (118 entries). They are compared against
# the first element of each tuple returned by decode_predictions().
dog_breeds = [
 'n02085620','n02085782','n02085936','n02086079'
,'n02086240','n02086646','n02086910','n02087046'
,'n02087394','n02088094','n02088238','n02088364'
,'n02088466','n02088632','n02089078','n02089867'
,'n02089973','n02090379','n02090622','n02090721'
,'n02091032','n02091134','n02091244','n02091467'
,'n02091635','n02091831','n02092002','n02092339'
,'n02093256','n02093428','n02093647','n02093754'
,'n02093859','n02093991','n02094114','n02094258'
,'n02094433','n02095314','n02095570','n02095889'
,'n02096051','n02096177','n02096294','n02096437'
,'n02096585','n02097047','n02097130','n02097209'
,'n02097298','n02097474','n02097658','n02098105'
,'n02098286','n02098413','n02099267','n02099429'
,'n02099601','n02099712','n02099849','n02100236'
,'n02100583','n02100735','n02100877','n02101006'
,'n02101388','n02101556','n02102040','n02102177'
,'n02102318','n02102480','n02102973','n02104029'
,'n02104365','n02105056','n02105162','n02105251'
,'n02105412','n02105505','n02105641','n02105855'
,'n02106030','n02106166','n02106382','n02106550'
,'n02106662','n02107142','n02107312','n02107574'
,'n02107683','n02107908','n02108000','n02108089'
,'n02108422','n02108551','n02108915','n02109047'
,'n02109525','n02109961','n02110063','n02110185'
,'n02110341','n02110627','n02110806','n02110958'
,'n02111129','n02111277','n02111500','n02111889'
,'n02112018','n02112137','n02112350','n02112706'
,'n02113023','n02113186','n02113624','n02113712'
,'n02113799','n02113978']

# ImageNet class IDs treated as "cat" (7 entries), used the same way as dog_breeds.
cat_breeds = [
'n02123045','n02123159','n02123394','n02123597'
,'n02124075','n02125311','n02127052']

### 构建预测模型

#### 基于生成器的模型

对指定文件夹内的图片分类，如预测结果与标签不符，则列入outlier_list并保存

In [3]:
def find_outlier_in_decode_preds(model_name, decode_preds_set, filenames,
                                 threshold=0.7, dog_ids=None, cat_ids=None):
    """Flag images whose decoded predictions disagree with their file name.

    An image is an outlier when either
      1. none of its Top-N predicted classes is a dog or cat class, or
      2. its top-1 prediction has a probability above ``threshold`` and names
         the opposite species from the one the file name starts with.

    The outlier list is written as ``str(list)`` to
    ``outlier/<model_name>_outliers.txt`` and also returned.

    Args:
        model_name: Prefix used for the output file name.
        decode_preds_set: One Top-N list per image; each entry is indexable
            with the class ID at index 0 and the probability at index 2.
        filenames: File names aligned with ``decode_preds_set``; the first
            three characters ('cat' / 'dog') are taken as the true label.
        threshold: Minimum top-1 probability for a species mismatch to count.
        dog_ids: Class IDs counted as dogs; defaults to the global ``dog_breeds``.
        cat_ids: Class IDs counted as cats; defaults to the global ``cat_breeds``.

    Returns:
        list: File names flagged as outliers, in input order.
    """
    import os  # local import: this cell may run before any cell that imports os

    # Sets make the repeated membership tests O(1) instead of scanning a list.
    dog_ids = set(dog_breeds if dog_ids is None else dog_ids)
    cat_ids = set(cat_breeds if cat_ids is None else cat_ids)

    outlier_files = []
    for index, decode_pred in enumerate(decode_preds_set):
        fname = filenames[index]

        # 1st: if no dog or cat appears in the Top-N of this prediction,
        # the image is an outlier.
        has_pet = any(entry[0] in dog_ids or entry[0] in cat_ids
                      for entry in decode_pred)

        if not has_pet:
            print("%s is not Dog or Cat" % fname)
            outlier_flag = True
        else:
            # 2nd: check whether the most probable class contradicts the label.
            outlier_flag = False
            top = decode_pred[0]
            if top[2] > threshold:
                if top[0] in dog_ids and fname[:3] == 'cat':  # cat image predicted as dog
                    outlier_flag = True
                    print("%s is not cat" % fname)

                if top[0] in cat_ids and fname[:3] == 'dog':  # dog image predicted as cat
                    outlier_flag = True
                    print("%s is not dog" % fname)

        if outlier_flag:
            outlier_files.append(fname)

    # Save the outliers to file; create the folder first so the write cannot
    # fail on a fresh checkout.
    os.makedirs('outlier', exist_ok=True)
    out_filename = 'outlier/' + model_name + "_outliers.txt"
    with open(out_filename, 'w') as f:
        f.write(str(outlier_files))

    return outlier_files

In [4]:
from keras.preprocessing.image import *
from keras.applications import resnet50
from keras.applications import xception
from keras.applications import inception_v3
from keras.applications import inception_resnet_v2
from keras.layers import Input, GlobalAveragePooling2D
from keras.models import Model
import h5py

def find_outliers(MODEL, image_size, model_class, model_name,
                  data_dir='out_train', top_n=5, batch_size=16):
    """Classify every image under ``data_dir`` with an ImageNet model and record outliers.

    The decoded Top-N predictions are written to
    ``outlier/<model_name>_decodepreds.txt`` and then handed to
    ``find_outlier_in_decode_preds`` together with the generator's file names.

    Args:
        MODEL: Model constructor accepting ``input_tensor`` and ``weights``
            (e.g. ``resnet50.ResNet50``).
        image_size: Two-element target size passed to the generator and used
            for the input tensor.
        model_class: Module providing ``preprocess_input`` and
            ``decode_predictions`` for ``MODEL``.
        model_name: Prefix for the output file names.
        data_dir: Directory read by ``flow_from_directory``.
        top_n: Number of top classes kept per image.
        batch_size: Batch size used by the generator.
    """
    import os  # local import: this cell does not otherwise have os in scope

    # Create the output folder before the expensive prediction pass so that a
    # missing directory cannot discard the results at write time.
    os.makedirs('outlier', exist_ok=True)

    # The generator applies the model-specific preprocessing to every batch.
    image_gen = ImageDataGenerator(preprocessing_function=model_class.preprocess_input)
    train_generator = image_gen.flow_from_directory(data_dir,
                                                    target_size=image_size,
                                                    shuffle=False,  # keep predictions aligned with .filenames
                                                    batch_size=batch_size,
                                                    class_mode=None)

    # Input tensor with 3 channels; image_size is reused as the spatial shape.
    x = Input((image_size[0], image_size[1], 3))
    # Full pre-trained classifier; its output is already the class-probability
    # vector, so no extra Model wrapper is needed.
    base_model = MODEL(input_tensor=x, weights='imagenet')

    # predict_generator returns the model output for every batch the
    # generator yields.
    preds = base_model.predict_generator(train_generator, verbose=1)

    print("predicts shape:", preds.shape)

    decode_preds = model_class.decode_predictions(preds, top=top_n)

    out_filename = 'outlier/' + model_name + "_decodepreds.txt"
    with open(out_filename, 'w') as f:
        f.write(str(decode_preds))

    find_outlier_in_decode_preds(model_name, decode_preds, train_generator.filenames)


Using TensorFlow backend.


In [None]:
# Run outlier detection with ResNet50 on 224x224 inputs.
find_outliers(resnet50.ResNet50, (224,224), resnet50, "ResNet50")

Found 5 images belonging to 1 classes.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5
   106496/102853048 [..............................] - ETA: 37:50

In [5]:
# Run outlier detection with Xception on 299x299 inputs.
find_outliers(xception.Xception, (299,299), xception, "Xception")

Found 5 images belonging to 1 classes.
predicts shape: (5, 1000)
Downloading data from https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json


RemoteDisconnected: Remote end closed connection without response

In [6]:
# Run outlier detection with InceptionResNetV2 on 299x299 inputs.
find_outliers(inception_resnet_v2.InceptionResNetV2, (299,299), inception_resnet_v2,"InceptionResNetV2")

Found 5 images belonging to 1 classes.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.7/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels.h5
  1081344/225209952 [..............................] - ETA: 36:48

KeyboardInterrupt: 

### 剔除异常值

在'clean-train/'中为'out_train/'里的图片建立符号链接，再把outlier list中的文件从'clean-train/'中剔除

In [40]:
# Name of the model whose saved outlier list is loaded in this cell.
model_name = 'ResNet50'
def read_outliers_from_file(filename):
    """Load a Python literal (the saved outlier list) from a text file.

    The file is expected to contain the ``str()`` of a list of file names.
    It is parsed with ``ast.literal_eval``, which accepts only literals, so
    the file contents can never execute code (unlike ``eval``).

    Args:
        filename: Path of the text file to read.

    Returns:
        The parsed object, normally a list of file-name strings.

    Raises:
        ValueError, SyntaxError: If the contents are not a valid Python literal.
        OSError: If the file cannot be opened.
    """
    import ast  # local import: keeps this cell runnable on its own

    # The context manager closes the file even if reading or parsing fails.
    with open(filename, 'r') as f:
        return ast.literal_eval(f.read())

# Path of the outlier list saved for the selected model.
filename = 'outlier/' + model_name + "_outliers.txt"

# Start from this model's outliers; lists saved for other models can be
# merged in by extending outlier_files with further read_outliers_from_file() calls.
outlier_files = list(read_outliers_from_file(filename))
print(outlier_files)

['cat/cat.19.jpg']


In [48]:
import shutil
import os

def clean_data(old_dir, clean_dir, outlier_list):
    """Build a symlinked mirror of ``old_dir`` that leaves out the outlier files.

    ``clean_dir`` is removed if it already exists and then recreated. The
    directory tree below ``old_dir`` is reproduced with real directories and
    every file is symlinked into the mirror, except those named in
    ``outlier_list``. Nothing inside ``old_dir`` is modified or deleted.

    Args:
        old_dir: Source image directory.
        clean_dir: Destination directory (recreated on every call).
        outlier_list: Paths relative to ``old_dir`` of the files to leave out,
            e.g. ``'cat/cat.19.jpg'``.
    """
    to_skip = {os.path.normpath(name) for name in outlier_list}

    if os.path.exists(clean_dir):
        shutil.rmtree(clean_dir)
    os.mkdir(clean_dir)

    skipped = set()
    for root, _subdirs, files in os.walk(old_dir):
        rel_root = os.path.relpath(root, old_dir)
        # Sub-directories are created for real (not symlinked): removing an
        # entry through a symlinked directory would delete the original image.
        dest_root = os.path.normpath(os.path.join(clean_dir, rel_root))
        os.makedirs(dest_root, exist_ok=True)
        for name in files:
            rel_path = os.path.normpath(os.path.join(rel_root, name))
            if rel_path in to_skip:
                skipped.add(rel_path)
                continue
            source = os.path.join(root, name)
            # A relative link target is resolved from the link's own folder,
            # so compute it from dest_root instead of assuming a fixed '../'.
            os.symlink(os.path.relpath(source, dest_root),
                       os.path.join(dest_root, name))

    # Report outliers that did not match any file so stale lists are noticed.
    for missing in sorted(to_skip - skipped):
        print('outlier not found in %s: %s' % (old_dir, missing))
    print('clean over')

In [49]:
# Populate 'clean-train/' with links to 'out_train/' and exclude the files listed in outlier_files.
clean_data('out_train/', 'clean-train/', outlier_files)

clean over
