In [1]:
# import libraries
from IPython.display import Image, display
import numpy as np
import os
from os.path import join
from PIL import ImageFile
import pandas as pd
from matplotlib import cm
import seaborn as sns
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D
#from tensorflow.python.keras.applications.resnet50 import preprocess_input
from keras.applications.imagenet_utils import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn import svm
from sklearn.mixture import GaussianMixture
from sklearn.isotonic import IsotonicRegression
import re
from tqdm import tqdm
import pdb

# Tell PIL to load image files that are truncated instead of raising an error,
# so a single damaged file does not abort the bulk image loading below.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Use the 'fivethirtyeight' matplotlib style sheet for every figure in the notebook.
plt.style.use('fivethirtyeight')
%matplotlib inline

## Preparing Train, Test, and Validation Data

The training data consists ONLY of car images, drawn from the Natural Images and Stanford Cars datasets. The validation and test data contain car images from the same datasets as well as other image types (listed below) from the Natural Images dataset.

In [2]:
# Root folder that holds the "natural-images" and "stanford-cars-dataset" folders.
# Set the OCC_IMAGES_DIR environment variable to run the notebook on another
# machine; when it is unset, the original absolute path is used unchanged.
data_source = os.environ.get("OCC_IMAGES_DIR", "/media/msesia/Samsung/data/occ_images")

In [3]:
# Import all images from natural images data set.
# Every sub-directory of natural-images/ is treated as one class, and the
# directory name is used as the label of each file found inside it.
img_natural_paths = []
img_natural_labels = []
natural_root = data_source + "/natural-images/"
# Sorted so the path/label order is the same on every run and every machine
# (os.listdir returns entries in arbitrary order).
for d in sorted(os.listdir(natural_root)):
    img_dir_na = natural_root + d
    # Skip stray files (e.g. hidden metadata files); listing them as a
    # directory would raise NotADirectoryError.
    if not os.path.isdir(img_dir_na):
        continue
    new_img_paths = [join(img_dir_na, filename) for filename in sorted(os.listdir(img_dir_na))]
    # extend() keeps both lists flat and aligned index-by-index.
    img_natural_paths.extend(new_img_paths)
    img_natural_labels.extend([d] * len(new_img_paths))

In [4]:
# import car images from stanford cars
def _list_full_paths(directory):
    """Return `directory` joined with the name of each entry it contains."""
    return [join(directory, entry) for entry in os.listdir(directory)]


# Images shipped in the Stanford "cars_train" folder.
train_img_dir_s = data_source + "/stanford-cars-dataset/cars_train/cars_train"
all_train_img_paths_s = _list_full_paths(train_img_dir_s)

# Images shipped in the Stanford "cars_test" folder.
train_img_dir_s_test = data_source + "/stanford-cars-dataset/cars_test/cars_test"
all_train_img_paths_s_test = _list_full_paths(train_img_dir_s_test)

# Both folders are pooled, and every pooled image gets the label 'car'.
img_stanford_paths = [*all_train_img_paths_s, *all_train_img_paths_s_test]
img_stanford_labels = ['car' for _path in img_stanford_paths]

In [5]:
# Combine lists of images: natural-images entries first, Stanford cars after,
# with paths and labels concatenated in the same order so they stay aligned.
img_paths = [*img_natural_paths, *img_stanford_paths]
img_labels = [*img_natural_labels, *img_stanford_labels]

## Feature Extraction With ResNet50

Removing the prediction layer of the pretrained ResNet50 model allows features to be extracted quickly from the selected images.

In [6]:
# prepare images for resnet50
image_size = 224


def read_and_prep_images(img_paths, img_height=image_size, img_width=image_size):
    """Load images from disk, resize them, and preprocess them as one batch.

    Parameters
    ----------
    img_paths : iterable of str
        Paths of the image files to load.
    img_height, img_width : int
        Size every image is resized to while loading.

    Returns
    -------
    numpy.ndarray
        Result of `preprocess_input` (called with its default arguments) on a
        float32 array of shape (n_images, img_height, img_width, 3).
    """
    img_paths = list(img_paths)  # allow any iterable; the length is needed up front
    # Fill one preallocated array instead of keeping a list of PIL images, a
    # list of per-image arrays and a stacked copy alive at the same time.
    # load_img's default colour mode yields 3 channels; float32 assumes the
    # default Keras floatx -- NOTE(review): confirm if floatx was changed.
    img_array = np.empty((len(img_paths), img_height, img_width, 3), dtype=np.float32)
    for i, img_path in enumerate(tqdm(img_paths)):
        img = load_img(img_path, target_size=(img_height, img_width))
        img_array[i] = img_to_array(img)
    return preprocess_input(img_array)


# NOTE: the whole image batch is held in memory at once.
X_data = read_and_prep_images(img_paths)

100%|██████████| 23084/23084 [01:53<00:00, 203.50it/s]


In [7]:
# ImageNet-pretrained ResNet50 used as a fixed feature extractor.
# include_top=False drops the top fully connected layer, since that layer is
# only used for predictions; pooling='avg' is requested for the output.
resnet_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(image_size, image_size, 3),
    pooling='avg',
)

In [8]:
# Run every preprocessed image through the headless ResNet50 to get its features.
# NOTE: this rebinds X_data from the image batch to the extracted features, so the
# cell is not idempotent -- re-run the image-loading cell before running it again.
X_data = resnet_model.predict(X_data)



## Scaling and PCA

Reducing the dimensionality of the extracted features allows for quicker training times.


In [9]:
# Apply standard scaler to output from resnet50 (fit and transform in one step).
ss = StandardScaler()
X_data = ss.fit_transform(X_data)

# Take PCA to reduce feature space dimensionality.
# random_state is fixed so the projection is reproducible across runs: for an
# input of this size PCA may select its randomized SVD solver, which otherwise
# draws from an unseeded random generator.
pca = PCA(n_components=512, whiten=True, random_state=0)
pca = pca.fit(X_data)
# The printed value is a fraction between 0 and 1 (despite the "percentage" label).
print('Explained variance percentage = %0.2f' % pca.explained_variance_ratio_.sum())
# NOTE: X_data is rebound to the reduced features, so this cell should not be
# re-run without first re-running the feature-extraction cells.
X_data = pca.transform(X_data)

Explained variance percentage = 0.86


## Save the data set

In [11]:
def make_dataset(X, Y, path="/media/msesia/Samsung/data/images_cars.csv", seed=None):
    """Join labels and features, shuffle the rows, and write them to a CSV file.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,)
        Labels; they become the first column of the output.
    path : str, optional
        Destination CSV file. Defaults to the location originally hard-coded here.
    seed : int or None, optional
        Seed for the row shuffle. With None (the default) the global NumPy
        random state is used, exactly as before; pass an int for a
        reproducible row order.

    Returns
    -------
    numpy.ndarray
        The shuffled (n_samples, 1 + n_features) array that was written. Since
        labels and features are concatenated into one array, NumPy promotes all
        entries to a common dtype (strings when the labels are strings).

    Raises
    ------
    ValueError
        If X and Y do not contain the same number of samples.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)  # lets callers pass a plain list of labels
    n_samples = len(Y)
    if len(X) != n_samples:
        raise ValueError(
            "X and Y must have the same number of samples (got %d and %d)"
            % (len(X), n_samples)
        )
    data = np.concatenate([Y.reshape(n_samples, 1), X], 1)
    if seed is None:
        # Unchanged legacy behaviour: draw from the global NumPy random state.
        idx_sample = np.random.choice(n_samples, n_samples, replace=False)
    else:
        idx_sample = np.random.RandomState(seed).permutation(n_samples)
    data = data[idx_sample]
    np.savetxt(path, data, delimiter=",", fmt='%s')
    return data

# Labels as a NumPy array; img_labels was built in parallel with img_paths, from
# which the rows of X_data were computed.
Y_data = np.array(img_labels)
# Shuffle the rows, write them to the CSV file, and keep the shuffled array.
data_save = make_dataset(X_data, Y_data)