In [3]:
# import libraries
from IPython.display import Image, display
import numpy as np
import os
from os.path import join
from PIL import ImageFile
import pandas as pd
from matplotlib import cm
import seaborn as sns
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D
#from tensorflow.python.keras.applications.resnet50 import preprocess_input
from keras.applications.imagenet_utils import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn import svm
from sklearn.mixture import GaussianMixture
from sklearn.isotonic import IsotonicRegression
import re
from tqdm import tqdm
import pdb
from sklearn.model_selection import KFold

# Ask PIL to load image files that are truncated rather than rejecting them.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Apply matplotlib's 'fivethirtyeight' style sheet to figures created from here on.
plt.style.use('fivethirtyeight')
%matplotlib inline

## Preparing Train, Test, and Validation Data

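The training data consists ONLY of car images from the Natural Images and Stanford Cars datasets. The validation and test data contain car images from the same datasets, as well as other image types (listed below) from the Natural Images dataset.

**Note (review):** the code cells in this section read files from a `raw_image/training` directory and assign them ten animal labels (`cat`, `lynx`, `wolf`, ...), and no list of image types follows this paragraph. The description above may be left over from an earlier version of the notebook and should be confirmed.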

In [8]:
# prepare images for resnet50
# Side length (in pixels) of the square images: it is the default resize target
# of read_and_prep_images and the spatial part of the ResNet50 input_shape below.
image_size = 224

def read_and_prep_images(img_paths, img_height=image_size, img_width=image_size):
    """Load images from disk and turn them into one preprocessed batch.

    Parameters
    ----------
    img_paths : iterable of path-like
        Locations of the image files to read.
    img_height, img_width : int
        Size every image is resized to while loading (passed to ``load_img``
        as ``target_size=(img_height, img_width)``).

    Returns
    -------
    The result of ``preprocess_input`` applied to the array obtained by
    stacking the per-image arrays along a new leading axis (one entry per
    path, in input order).
    """
    target_shape = (img_height, img_width)

    # Pass 1: read and resize every file (shows the first progress bar).
    loaded = []
    for path in tqdm(img_paths):
        loaded.append(load_img(path, target_size=target_shape))

    # Pass 2: convert each loaded image to an array (second progress bar).
    arrays = []
    for picture in tqdm(loaded):
        arrays.append(img_to_array(picture))

    batch = np.array(arrays)
    return preprocess_input(batch)

# ImageNet-pretrained ResNet50 without its top layer (the fully connected layer
# used for predictions), built with pooling='avg' for image_size x image_size
# 3-channel inputs.
resnet_model = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(image_size, image_size, 3),
)

def save_dataset(X, Y, fold, out_dir="/media/msesia/Samsung/data"):
    """Shuffle a labelled feature matrix and write it to a CSV file.

    The label column is placed first, followed by the feature columns; rows
    are then put in a random order (drawn from NumPy's global random state)
    and written to ``<out_dir>/images_animals_<fold>.csv``.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix with one row per sample.
    Y : array-like, 1-D
        One label per row of ``X``.
    fold : int
        Index used in the output file name.
    out_dir : str, optional
        Directory the CSV is written to. It must already exist. The default
        keeps the location this notebook originally wrote to.

    Returns
    -------
    numpy.ndarray
        The shuffled ``[label, features...]`` array that was written.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows (raised by
        ``np.concatenate``).
    """
    labels = np.asarray(Y)
    features = np.asarray(X)
    n_rows = len(labels)

    # Label column first, then the features, joined column-wise.
    data = np.concatenate([labels.reshape(n_rows, 1), features], 1)

    # Random row order; same draw as choice(n, n, replace=False).
    idx_sample = np.random.permutation(n_rows)
    data = data[idx_sample]

    out_path = os.path.join(out_dir, "images_animals_{:d}.csv".format(fold))
    # fmt='%s' so that string labels and numeric features share one format.
    np.savetxt(out_path, data, delimiter=",", fmt='%s')
    return data

In [9]:
# Directory containing the raw training images.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_source = "/media/msesia/Samsung/data/raw_image_ver/raw_image/training"

# Class id -> class name. The class id is read from the first character of
# each file name in data_source.
# NOTE(review): the spelling 'jaguer' is kept as-is because it is written into
# the saved CSV labels; confirm with downstream consumers before changing it.
label_mappings = {0: 'cat', 1: 'lynx', 2: 'wolf', 3: 'coyote', 4: 'cheetah', 5: 'jaguer',
                  6: 'chimpanzee', 7: 'orangutan', 8: 'hamster', 9: 'guinea pig'}

# One seed for every stochastic step in this cell (KFold shuffle, PCA solver,
# and the row shuffle inside save_dataset, which uses NumPy's global state),
# so that Restart & Run All reproduces the same files.
random_seed = 0
np.random.seed(random_seed)

# Collect the path and label of every image file in data_source
img_paths_full = []
img_labels_full = []
skipped_entries = []
class_prefixes = {str(class_id) for class_id in label_mappings}
# os.listdir returns entries in arbitrary order; sort for a reproducible split.
for f in sorted(os.listdir(data_source)):
    new_img_path = data_source + "/" + f
    # Ignore anything that is not a regular file whose name starts with a known
    # class digit (e.g. hidden files or checkpoint directories) instead of
    # crashing on int(f[0]).
    if f[:1] not in class_prefixes or not os.path.isfile(new_img_path):
        skipped_entries.append(f)
        continue
    new_lab = label_mappings[int(f[0])]
    img_paths_full.append(new_img_path)
    img_labels_full.append(new_lab)

if skipped_entries:
    print('Skipped %d entries without a class-digit prefix, e.g. %s'
          % (len(skipped_entries), skipped_entries[:5]))

# Converted once here so each fold only has to index into the arrays.
img_paths_all = np.array(img_paths_full)
img_labels_all = np.array(img_labels_full)

# Downsample and process: the held-out indices of each of the 10 shuffled
# folds form one disjoint subsample, which is processed and saved on its own.
kf = KFold(n_splits=10, random_state=random_seed, shuffle=True)
fold = 0
for _, idx in kf.split(np.arange(len(img_labels_full))):
    img_paths = img_paths_all[idx]
    img_labels = img_labels_all[idx]

    X_data = read_and_prep_images(img_paths)

    # Feature vectors produced by resnet_model for this subsample
    X_data = resnet_model.predict(X_data)

    # Apply standard scaler to output from resnet50 (fit on this subsample only)
    ss = StandardScaler()
    X_data = ss.fit_transform(X_data)

    # Take PCA to reduce feature space dimensionality (also fit per subsample).
    # random_state pins the result when scikit-learn picks its randomized solver.
    pca = PCA(n_components=512, whiten=True, random_state=random_seed)
    pca = pca.fit(X_data)
    # The printed value is the summed explained-variance ratio, a fraction in [0, 1].
    print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))
    X_data = pca.transform(X_data)

    Y_data = np.array(img_labels)
    save_dataset(X_data, Y_data, fold)

    fold = fold + 1

100%|██████████| 5000/5000 [00:02<00:00, 1922.34it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4307.04it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 1960.21it/s]
100%|██████████| 5000/5000 [00:01<00:00, 3788.86it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 1947.56it/s]
100%|██████████| 5000/5000 [00:01<00:00, 3780.06it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:03<00:00, 1290.93it/s]
100%|██████████| 5000/5000 [00:01<00:00, 4193.18it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 1855.09it/s]
100%|██████████| 5000/5000 [00:00<00:00, 8790.00it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 2063.49it/s]
100%|██████████| 5000/5000 [00:00<00:00, 8783.83it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 1959.73it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6078.50it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 2269.43it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5121.82it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 2206.86it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5505.07it/s]


Explained variance percentage = 0.88


100%|██████████| 5000/5000 [00:02<00:00, 1712.93it/s]
100%|██████████| 5000/5000 [00:00<00:00, 6355.09it/s]


Explained variance percentage = 0.88
