## Loading data from CSV file:

In [1]:
from pathlib import Path
import json
import pandas as pd

# Root of the dataset plus the directories that hold the training
# images and the per-image JSON metadata files.
data = Path("../pet_finder", "data")
train_images = data.joinpath("train_images")
train_metadata = data.joinpath("train_metadata")

# Training table indexed by pet id; 'AdoptionSpeed' is the column
# the rest of the notebook uses as target.
train = pd.read_csv(data.joinpath("train", "train.csv")).set_index('PetID')
target = train['AdoptionSpeed']

## Useful functions:

In [2]:
# !pip3 install Pillow
from PIL import Image

def open_img(pet_id, string, img_number):
    """
    Function for internal use which opens an image
    and loads the associated metadata.

    Parameters
    ----------
    pet_id : str
        pet identifier, used as prefix of the file names
    string : str
        dataset name; the module-level variables ``<string>_images`` and
        ``<string>_metadata`` must hold the corresponding directory paths
    img_number : int
        number of the pet's image (suffix of the file names)

    Returns
    -------
    img : PIL image
        the opened ``<pet_id>-<img_number>.jpg`` file
    metadata : object
        parsed content of ``<pet_id>-<img_number>.json``

    Raises
    ------
    KeyError
        if one of the two directory variables is not defined
    FileNotFoundError
        if the metadata or the image file does not exist
    """
    namespace = globals()
    images = namespace[string + '_images']  # images directory path
    metadatas = namespace[string + '_metadata']  # metadata directory path
    stem = f'{pet_id}-{img_number}'  # file name shared by image and metadata
    with open(metadatas / (stem + '.json'), 'r') as f:
        metadata = json.load(f)  # image's metadata
    # Read the image from the directory selected by `string` (it used to be
    # hard-coded to the training images whatever `string` was).
    img = Image.open(images / (stem + '.jpg'))  # pet's image
    return img, metadata

In [3]:
def crop_img(pet_id, string, img_number):
    """
    Function for internal use which returns the image cropped
    to the first crop hint stored in its metadata.

    Parameters
    ----------
    pet_id : str
        pet identifier
    string : str
        dataset name, forwarded to ``open_img``
    img_number : int
        number of the pet's image

    Returns
    -------
    PIL image
        region of the image delimited by the crop hint's bounding polygon

    Raises
    ------
    FileNotFoundError
        propagated from ``open_img`` when the image or metadata is missing
    KeyError
        if the metadata has no crop hint or its third vertex lacks
        an 'x' or 'y' coordinate
    """
    img, metadata = open_img(pet_id, string, img_number)
    crop_hint = metadata['cropHintsAnnotation']['cropHints'][0]
    vertices = crop_hint['boundingPoly']['vertices']
    # NOTE(review): assumes the vertices are ordered top-left, top-right,
    # bottom-right, bottom-left -- confirm against the metadata files.
    top_left, bottom_right = vertices[0], vertices[2]
    # The box used to be anchored at (0, 0) regardless of the first vertex.
    # A missing coordinate is read as 0 (presumably zero values are omitted
    # from the JSON -- TODO confirm), which keeps the previous result
    # whenever the hint starts at the image origin.
    left = top_left.get('x', 0)
    top = top_left.get('y', 0)
    right = bottom_right['x']
    bottom = bottom_right['y']
    return img.crop((left, top, right, bottom))  # cropped image

In [4]:
import numpy as np

def img2matrix(img):
    """
    Function for internal use which
    converts an image to a numpy array
    """
    return np.array(img)

In [5]:
def maximum_img_size(string):
    """
    Function for internal use which scans every cropped image of a
    dataset and reports how many were counted together with the
    largest extent met along each of the three array axes.

    Parameters
    ----------
    string : str
        name of the module-level DataFrame whose index lists the pet ids;
        it is also forwarded to ``crop_img``

    Returns
    -------
    tuple, size = 4
        (number of counted images,
         maximum of shape[0], maximum of shape[1], maximum of shape[2])
    """
    largest = np.array([0, 0, 0])
    counted = 0
    pet_ids = (globals()[string]).index
    for pet_id in tqdm(pet_ids):
        img_number = 1
        # Images are numbered 1, 2, ...; stop at the first missing file.
        while True:
            try:
                cropped = crop_img(pet_id=pet_id,
                                   string=string,
                                   img_number=img_number)
                img_number += 1
                try:
                    largest = np.maximum(largest, img2matrix(cropped).shape)
                except ValueError:
                    # The shape tuple cannot be combined with the 3-element
                    # running maximum (e.g. a 2-D array): not counted.
                    continue
                counted += 1
            except FileNotFoundError:
                break
    dim0, dim1, dim2 = largest
    return (counted, dim0, dim1, dim2)

In [8]:
from tqdm.notebook import tqdm
#!pip3 install h5py
import h5py

def prep_data(string, data_size=None):
    """
    Data preparation function for the keras neural network.
    Collects one target per usable image of the dataset.

    For every pet id in the index of the DataFrame named ``string`` the
    images numbered 1, 2, ... are opened until a file is missing. The pet's
    entry of the module-level ``target`` is appended once for every image
    whose array has 3 dimensions; other images are skipped.

    N.B: the part recording the images themselves in the 'train.hdf5' file
         is currently commented out (see the notes in the body), so this
         function only builds the list of targets. HDF5 was chosen because
         the images cannot be stored in memory due to their large
         dimensionality, and Keras can run directly on h5py files.

    Parameters
    ----------
    string : str
        name of the module-level DataFrame whose index lists the pet ids;
        it is also forwarded to ``crop_img``
    data_size : int or None
        number of images in the training set.
        if data_size is None then all images
        are taken in the dataset.

    Returns
    -------
    train_Y : list
        targets, in the order the images were visited
    """
    # --- disabled HDF5 export, kept for reference --------------------------
    # f = h5py.File('train.hdf5', 'w')  # open hdf5 file in write mode
    # n, n1, n2, n3 = maximum_img_size("train")
    # train_X = f.create_dataset('train_images',
    #                            (data_size or n, n1, n2, n3),
    #                            dtype='float32')  # dataset
    train_Y = []
    for pet_id in tqdm((globals()[string]).index):
        img_number = 1
        while True:
            try:
                img = crop_img(pet_id=pet_id,
                               string=string,
                               img_number=img_number)  # cropped image
            except FileNotFoundError:
                break  # no more images for this pet
            # Always move on to the next image. The counter used to advance
            # only for 3-dimensional images, so any other image was retried
            # forever.
            img_number += 1
            matrix = img2matrix(img)  # image converted on digital form
            if matrix.ndim != 3:
                continue
            # --- disabled HDF5 export (row index is len(train_Y)) ----------
            # n4, n5, _ = matrix.shape
            # train_X[len(train_Y)] = np.pad(matrix,
            #                                (((n1 - n4) // 2, (n1 - n4 + 1) // 2),
            #                                 ((n2 - n5) // 2, (n2 - n5 + 1) // 2),
            #                                 (0, 0)),
            #                                'constant',
            #                                constant_values=0)  # pad matrix
            train_Y.append(target[pet_id])
            # data_size=None means "no limit"; previously that case failed
            # because the limit variable was never assigned.
            if data_size is not None and len(train_Y) == data_size:
                # f.close()
                return train_Y
    # f.close()
    return train_Y

In [9]:
from keras.utils.io_utils import HDF5Matrix

data_size = 1000
split_at = int(0.8 * data_size)  # first 80% for training, the rest for validation

Y = prep_data(string="train", data_size=data_size)
targets = np.array(Y)

# Both image sets are windows over the same 'train_images' dataset
# of the 'train.hdf5' file.
train_X = HDF5Matrix('train.hdf5', 'train_images', start=0, end=split_at)
test_X = HDF5Matrix('train.hdf5', 'train_images', start=split_at, end=data_size)
train_Y, test_Y = targets[:split_at], targets[split_at:]


HBox(children=(FloatProgress(value=0.0, max=14993.0), HTML(value='')))

  f = h5py.File(datapath)


In [10]:
# !pip3 install tensorflow
# !pip3 install keras
from keras.applications import ResNet50
from keras import Sequential
from keras.layers import Dense

## ResNet50 without its top layers; with average pooling
## it outputs one 2048-long vector per image
resnet = ResNet50(include_top=False,
                  pooling="avg")
## Create model
model = Sequential()
model.add(resnet)
model.add(Dense(1))  # single output unit on top of the ResNet50 features

## Freeze ResNet50 so that only the Dense layer is trained
model.layers[0].trainable = False

## summary() prints the table itself and returns None,
## so wrapping it in print() only added a stray "None"
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Model)             (None, 2048)              23587712  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2049      
Total params: 23,589,761
Trainable params: 2,049
Non-trainable params: 23,587,712
_________________________________________________________________
None


In [18]:
import tensorflow as tf  # `tf` is not imported by any cell visible above

tf.test.is_gpu_available()
## As the dataset is so heavy and
## no GPU is available on my computer,
## training such a model is tricky

False

In [None]:
from keras.optimizers import Adam

## Compile model: mean squared error, Adam with its default settings
model.compile(optimizer=Adam(),
              loss='mean_squared_error')

## Fit the model
model.fit(x=train_X,
          y=train_Y,
          validation_data=(test_X, test_Y),
          batch_size=32,
          epochs=30,
          shuffle="batch")

## serialize the model architecture to JSON
model_json = model.to_json()

with open("NN-regressor.json", "w") as f:
    f.write(model_json)

## serialize the weights to HDF5
model.save_weights("NN-regressor.h5")
print("Saved model to disk")

Train on 800 samples, validate on 200 samples
Epoch 1/30
