In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [4]:
import os

In [5]:
dataset_folder = os.path.join(os.environ['HOME'], 'hse/data/hotdogs_dataset')

In [6]:
test_folder = os.path.join(dataset_folder, 'test')
train_folder = os.path.join(dataset_folder, 'train')

In [7]:
os.listdir(train_folder)

['hotdog', 'not_hotdog']

In [8]:
def get_hotdog_class_dirs(root_dir):
    return os.path.join(root_dir, 'hotdog'), os.path.join(root_dir, 'not_hotdog')

In [9]:
hotdogs_folder, not_hotdogs_folder = get_hotdog_class_dirs(train_folder)

In [10]:
os.listdir(hotdogs_folder)[:10]

['00651.jpg',
 '00337.jpg',
 '00287.jpg',
 '00506.jpg',
 '00064.jpg',
 '00132.jpg',
 '00351.jpg',
 '00334.jpg',
 '00601.jpg',
 '00107.jpg']

In [11]:
def path_join(left, right):
    return os.path.join(left, right)

In [20]:
from PIL import Image

def fix_size(orig_size, needed_size):
    """calculate size keeping the aspect ratio"""
    # 1. find small side
    smaller_side_index = np.argmin(orig_size)
    bigger_side_index = 1 - smaller_side_index
    # 2. scale by small side
    aspect_ratio = orig_size[smaller_side_index] / needed_size[smaller_side_index]
    # 3. return new size with correct aspect ratio
    size = [0, 0]
    size[smaller_side_index] = needed_size[smaller_side_index]
    size[bigger_side_index] = int(needed_size[bigger_side_index] / aspect_ratio)
    return size

def square(size):
    return size[0] == size[1]

def roi(size_with_ratio, out_size):
    """calculate region of interest"""
    # 1. find small side
    smaller_side_index = np.argmin(size_with_ratio)
    bigger_side_index = 1 - smaller_side_index
    # 2. calculate region of interest
    #    a. small side is untouched
    #    b. bigger side is cropped
    box = [0, 0, 0, 0]
    # 2.a
    box[smaller_side_index] = 0
    box[smaller_side_index+2] = out_size[smaller_side_index]
    # 2.b
    bigger_side_center = int(size_with_ratio[bigger_side_index] / 2)
    box[bigger_side_index] = bigger_side_center - out_size[bigger_side_index] / 2
    # + whole number to eliminate problems with non-integer centers:
    box[bigger_side_index+2] = box[bigger_side_index] + out_size[bigger_side_index]
    return box

def preprocess_images(image_dir, out_size):
    """preprocess images"""
    if not square(out_size):
        raise NotImplementedError

    image_files = os.listdir(image_dir)
    good_images = []
    bad_files = []
    # 1. load image - if fails, skip
    for file in image_files:
        fp = open(path_join(image_dir, file), "rb")
        try:
            img = Image.open(fp).convert('RGB')
            size_with_ratio = fix_size(img.size, out_size)
            img = img.resize(size_with_ratio, Image.HAMMING)
            if not square(size_with_ratio):  # crop center if image is not a square
                img = img.crop(box=roi(size_with_ratio, out_size))
            # 2. create tuple of image name and resized image object
            good_images.append((file, img))
        except IOError:  # bad image/not an image
            bad_files.append(file)
    # 3. return tuples list and list of bad images (for debug primarily)
    return good_images, bad_files

In [1]:
import img_processing

In [21]:
hotdog_images, hotdog_errors = preprocess_images(hotdogs_folder, (224, 224))
not_hotdog_images, not_hotdog_errors = preprocess_images(not_hotdogs_folder, (224, 224))

In [43]:
def to_array_multi(images):
    """convert each image into an array and return a list of those"""
    names = []
    data = []
    for name, img in images:
        img.load()
        arr = np.asarray(img, dtype="uint8")
        arr = arr.reshape(-1)
        names.append(name)
        data.append(arr)
    return names, data

In [44]:
hotdog_names, hotdog_arrays = to_array_multi(hotdog_images)

In [45]:
not_hotdog_names, not_hotdog_arrays = to_array_multi(not_hotdog_images)

In [47]:
print(hotdog_names[:5], hotdog_arrays[:5])
print(not_hotdog_names[:5], not_hotdog_arrays[:5])

['00651.jpg', '00337.jpg', '00287.jpg', '00506.jpg', '00064.jpg'] [array([18, 18, 26, ...,  0,  0,  0], dtype=uint8), array([37, 13,  0, ...,  0,  0,  0], dtype=uint8), array([8, 8, 8, ..., 0, 0, 0], dtype=uint8), array([247, 224, 244, ...,   0,   0,   0], dtype=uint8), array([214, 136, 139, ...,   0,   0,   0], dtype=uint8)]
['00651.jpg', '00825.jpg', '00337.jpg', '00287.jpg', '00506.jpg'] [array([113, 108,  73, ...,   0,   0,   0], dtype=uint8), array([73, 66, 41, ...,  0,  0,  0], dtype=uint8), array([2, 1, 0, ..., 0, 0, 0], dtype=uint8), array([194, 205, 247, ...,   0,   0,   0], dtype=uint8), array([255, 255, 255, ...,   0,   0,   0], dtype=uint8)]


In [54]:
hotdog_names_p = pd.Series(hotdog_names)
hotdog_names_p.head()

0    00651.jpg
1    00337.jpg
2    00287.jpg
3    00506.jpg
4    00064.jpg
dtype: object

In [50]:
hotdog_data = pd.DataFrame(hotdog_arrays)
hotdog_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,18,18,26,15,15,23,15,16,21,14,...,0,0,0,0,0,0,0,0,0,0
1,37,13,0,42,15,2,41,13,1,39,...,0,0,0,0,0,0,0,0,0,0
2,8,8,8,8,8,8,9,9,9,9,...,0,0,0,0,0,0,0,0,0,0
3,247,224,244,245,223,242,244,223,242,243,...,0,0,0,0,0,0,0,0,0,0
4,214,136,139,195,115,118,195,115,118,197,...,0,0,0,0,0,0,0,0,0,0


In [51]:
not_hotdog_names_p = pd.Series(not_hotdog_names)
not_hotdog_names_p.head()

0    00651.jpg
1    00825.jpg
2    00337.jpg
3    00287.jpg
4    00506.jpg
dtype: object

In [52]:
not_hotdog_data = pd.DataFrame(not_hotdog_arrays)
not_hotdog_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,113,108,73,91,85,45,84,54,33,57,...,0,0,0,0,0,0,0,0,0,0
1,73,66,41,66,54,34,59,44,29,52,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,2,1,0,3,1,1,2,...,0,0,0,0,0,0,0,0,0,0
3,194,205,247,193,207,249,194,207,250,195,...,0,0,0,0,0,0,0,0,0,0
4,255,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0


In [56]:
hotdog_data.to_csv('hotdog.csv', sep=',')
not_hotdog_data.to_csv('not_hotdog.csv', sep=',')

In [55]:
hotdog_labels = pd.Series([1]*len(hotdog_arrays))
not_hotdog_labels = pd.Series([0]*len(not_hotdog_arrays))

train_hotdogs_pd = pd.DataFrame(
    hotdog_names_p,
    hotdog_labels,
    hotdog_data,
#     columns=['name', 'hotdog']
)
train_not_hotdogs_pd = pd.DataFrame(
    not_hotdog_names_p,
    not_hotdog_labels,
    not_hotdog_data,
#     columns=['name', 'hotdog']
)

ValueError: Shape of passed values is (1, 708), indices imply (708, 708)

In [None]:
train_df = pd.concat([train_hotdogs_pd, train_not_hotdogs_pd])
train_df = train_df.sample(frac=1).reset_index(drop=True)  # shuffle data

In [None]:
train_df.head()

In [None]:
train_df.to_csv('train_hotdogs.csv', sep=',')