In [2]:
import os
import sys
from pathlib import Path
import multiprocessing as mp
from skimage.data import imread
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from itertools import chain
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import StratifiedShuffleSplit
from keras.preprocessing.image import ImageDataGenerator

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


In [None]:
class IEEEPreprocessor:
    def __init__(self, train_dir, test_dir, img_resize=(32, 32), validation_split=0.2, process_count=cpu_count()):
        """
        Store the configuration used to preprocess the data for the classifier.

        NOTE(review): the original docstring asked callers to call an init() method
        after construction; no such method is defined in the visible code.

        :param train_dir: string
            The directory of training files

        :param test_dir: string
            The directory of test files

        :param img_resize: tuple(int, int)
            The size the original images are resized to

        :param validation_split: float
            Presumably the fraction of the training data held out for validation;
            it is only stored here - TODO confirm against the code that consumes it

        :param process_count: int
            The number of processes you want to use to process the data.
            If you run into issues, lower this number. Its default value is equal to the number
            of cores of your CPU (evaluated once, when the class is defined)
        """

        self.process_count = process_count
        self.img_resize = img_resize
        self.train_dir = train_dir
        self.test_dir = test_dir
        # Previously an unfinished `self.` statement (a syntax error); the only
        # constructor argument that was not yet stored is validation_split.
        self.validation_split = validation_split

In [7]:
# Directory holding the training images. The default is machine-specific, so it
# can be overridden with the IEEE_CAMERA_TRAIN_DIR environment variable instead
# of editing the notebook.
train_dir = Path(os.environ.get(
    'IEEE_CAMERA_TRAIN_DIR',
    '/home/abanihi/Documents/deep-data/kaggle/IEEE-camera-model/train',
))

In [8]:
# Directory holding the test images. The default is machine-specific, so it can
# be overridden with the IEEE_CAMERA_TEST_DIR environment variable instead of
# editing the notebook.
test_dir = Path(os.environ.get(
    'IEEE_CAMERA_TEST_DIR',
    '/home/abanihi/Documents/deep-data/kaggle/IEEE-camera-model/test',
))

In [10]:
# Each entry of train_dir is listed again as a directory further down, so keep
# only sub-directories here: a stray file in train_dir would otherwise make that
# per-camera os.listdir() call fail. Listing order is left unchanged.
cameras = [
    entry for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
]
cameras

['HTC-1-M7',
 'iPhone-6',
 'iPhone-4s',
 'Samsung-Galaxy-Note3',
 'Motorola-Nexus-6',
 'Motorola-Droid-Maxx',
 'LG-Nexus-5x',
 'Sony-NEX-7',
 'Samsung-Galaxy-S4',
 'Motorola-X']

In [11]:
# (camera, file name) pairs: cameras in listing order, files sorted within each camera.
train_images = [
    (camera_name, image_name)
    for camera_name in cameras
    for image_name in sorted(os.listdir(train_dir / camera_name))
]

In [12]:
# One row per training image: the camera folder it came from and its file name.
train_data = pd.DataFrame(train_images, columns=['camera', 'fname'])

In [13]:
train_data.head()

Unnamed: 0,camera,fname
0,HTC-1-M7,(HTC-1-M7)1.jpg
1,HTC-1-M7,(HTC-1-M7)10.jpg
2,HTC-1-M7,(HTC-1-M7)100.jpg
3,HTC-1-M7,(HTC-1-M7)101.jpg
4,HTC-1-M7,(HTC-1-M7)102.jpg


In [14]:
train_data.shape

(2750, 2)

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2750 entries, 0 to 2749
Data columns (total 2 columns):
camera    2750 non-null object
fname     2750 non-null object
dtypes: object(2)
memory usage: 43.0+ KB


In [16]:
# File names found directly in test_dir, in sorted order.
test_images = sorted(os.listdir(test_dir))

In [17]:
# One row per test image; only the file name is known for these.
test_data = pd.DataFrame(test_images, columns=['fname'])

In [18]:
test_data.head()

Unnamed: 0,fname
0,img_0002a04_manip.tif
1,img_001e31c_unalt.tif
2,img_00275cf_manip.tif
3,img_0034113_unalt.tif
4,img_00344b7_unalt.tif


In [19]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640 entries, 0 to 2639
Data columns (total 1 columns):
fname    2640 non-null object
dtypes: object(1)
memory usage: 20.7+ KB


In [20]:
def color_stats(queue, iolock):
    """
    Worker loop that reads image descriptors from ``queue`` and records
    per-channel color statistics in the module-level ``color_info`` mapping.

    Each item taken from the queue is one of:
      * a tuple ``(camera, fname)`` - the image is read from ``train_dir / camera / fname``
      * any other value - treated as a file name and read from ``test_dir``
      * ``None`` - sentinel; the worker stops.

    For every image the entry ``color_info[fname]`` is set to a 6-tuple: the means
    of channels 0, 1 and 2 followed by the standard deviations of channels 0, 1 and 2.

    :param queue: queue-like object
        Source of work items; ``get()`` is called until ``None`` is received

    :param iolock: lock
        Accepted as an argument but not used inside this function
    """
    while True:
        item = queue.get()
        if item is None:
            return

        # Training items are tuples; everything else is a test file name.
        if type(item) is tuple:
            key = item[1]
            pixels = imread(train_dir / item[0] / key)
        else:
            key = item
            pixels = imread(test_dir / item)

        # Some reads come back as a length-2 container; only its first element is used.
        if pixels.shape == (2, ):
            pixels = pixels[0]

        channels = [pixels[:, :, idx] for idx in range(3)]
        means = tuple(channel.mean() for channel in channels)
        stds = tuple(channel.std() for channel in channels)
        color_info[key] = means + stds

            
            

In [22]:
# Names of the six statistic columns added to both frames.
cols = ['a0', 'a1', 'a2', 's0', 's1', 's2']

# Create the columns up front as empty (None) placeholders.
for frame in (train_data, test_data):
    for col in cols:
        frame[col] = None

In [23]:
train_data.head()

Unnamed: 0,camera,fname,a0,a1,a2,s0,s1,s2
0,HTC-1-M7,(HTC-1-M7)1.jpg,,,,,,
1,HTC-1-M7,(HTC-1-M7)10.jpg,,,,,,
2,HTC-1-M7,(HTC-1-M7)100.jpg,,,,,,
3,HTC-1-M7,(HTC-1-M7)101.jpg,,,,,,
4,HTC-1-M7,(HTC-1-M7)102.jpg,,,,,,


In [24]:
# Number of CPU cores reported by the system; used below to size the queue and the worker pool.
NCORE = cpu_count()
NCORE

4

In [25]:
# Manager-backed dict proxy that color_stats writes its results into, keyed by image file name.
color_info = mp.Manager().dict()
color_info

<DictProxy object, typeid 'dict' at 0x7f101f9c81d0>

In [26]:
# Use a queue since the image reading is a bottleneck. The queue is bounded to
# NCORE items, so put() blocks while it is full.
queue = mp.Queue(maxsize=NCORE)
# NOTE(review): iolock is handed to color_stats but is never acquired there.
iolock = mp.Lock()

In [27]:
# color_stats is passed as the pool *initializer*: each of the NCORE workers runs
# it on start-up and keeps pulling items from the queue until it reads a None
# sentinel. No tasks are submitted to the pool in the visible cells.
pool = mp.Pool(NCORE, initializer=color_stats, initargs=(queue, iolock))

In [None]:
# Hand every (camera, fname) training tuple to the workers.
for image in train_images:
    queue.put(image) # Blocks while the queue is at its maximum size

In [None]:
# Hand every test file name to the workers (put() blocks while the queue is full).
for image in test_images:
    queue.put(image)

In [None]:
# Tell workers we are done: one None sentinel per worker, since each worker
# stops after reading a single None.
for _ in range(NCORE):
    queue.put(None)

In [None]:
# Stop accepting work and wait for the worker processes to exit.
pool.close()
pool.join()

In [None]:
# Copy the results out of the manager proxy into a plain dict.
# NOTE: this rebinds color_info, so re-running this cell operates on the plain dict.
color_info = dict(color_info)

In [None]:
# Fill each statistic column from position n of the 6-tuple stored per file name.
for n, col in enumerate(cols):
    for frame in (train_data, test_data):
        frame[col] = frame['fname'].apply(lambda name: color_info[name][n])