# I - Imports


In [2]:
import os, os.path
import numpy as np
import pandas as pd
from PIL import Image
from keras.preprocessing.image import (img_to_array, 
                                       #load_img
                                       )

Using TensorFlow backend.


# II - Define auxiliary functions

## 2.1 - Read the images data on the disk to pandas dataframe

In [3]:
def read_from_disk(path, npixel = (30, 40)):
    """
        Reads the images data stored on the disk
        
        Arguments:
            *path: the path of the disk directory containing the images data
            *npixel: a tuple (image_width, image_height) in number of pixels
        
        Returns:
            *data: a pandas dataframe where each example is one image 
    """
    #imgs = np.empty(shape = (1, npixel[1]*npixel[0]*3 + 1) )
    imgs = pd.DataFrame(index = os.listdir(path), dtype = np.uint8,
                        columns = np.arange(npixel[1]*npixel[0]*3))
    #image_list = map(Image.open, glob('your/path/*.gif'))
    assert os.path.exists(path)
    print("Attention! Chargement de {} images! \n".format(len(os.listdir(path))))
    print("Cette opération peut prendre du temps.")
    
    for f in os.listdir(path):
        #print("Ajout de l'image {} au dataset".format(f))
        img = Image.open(os.path.join(path,f))
        img = img.resize((npixel[0],npixel[1]))
        x = img_to_array(img)
        x = x.astype(np.uint8)
        x = x.reshape(1,-1)
        if x.shape != (1, npixel[1]*npixel[0]*3):
            #The image is black
            x = np.zeros((1, npixel[1]*npixel[0]*3))
            print("The image {} is black.".format(f))
        imgs.loc[f,:] = x
    
    return imgs

## 2.2 Reads the data stored as one unique csv table on the disk

In [None]:
def read_csv_data_from_disk(path):
    """
        Reads a csv file stored on disk
        
        Arguments:
            *path: the path of the csv data on the disk
        
        Returns:
            *data: as a numpy array
    """
    assert os.path.exists(path)
    data = np.genfromtxt(path, delimiter=',')
    return data  

## 2.3 Writes a pandas dataframe on a disk under csv format

In [None]:
def write_data_csv(data, directory, name):
    """
        Writes data to path
        
        Arguments: 
            *data: a pandas dataframe
            *directory: the destination folder
            *name: name under which we save the csv file
        
        Returns:
            Nothing. 
    """
    data.to_csv(os.path.join(directory, name), sep = ',', index = True)
  

# III - Simulations

## 3.1 Initialize parameters

In [4]:
images_path = "/home/kevin/Desktop/OpenFoodFacts/OpenFoodFacts/data/images"
csv_directory = "/home/kevin/Desktop/OpenFoodFacts/OpenFoodFacts/data/csv"
npixel = (30,40)

## 3.2 Data

In [5]:
# Load images from disk to single pandas dataframe
data = read_from_disk(images_path)
#Write pandas dataframe to csv file
write_data_csv(data, csv_directory, 'data.csv')
#Read data from csv
csv_path = os.path.join(csv_directory, 'data.csv')
#data2 = read_csv_data_from_disk(csv_path)

Attention! Chargement de 3145 images! 

Cette opération peut prendre du temps.
The image 3245414172081front.4.400.jpg is black.


NameError: name 'write_data_csv' is not defined

In [7]:
import pickle

In [9]:
!ls

data.ipynb   data.py		 read.ipynb  vgg16.ipynb
data.pickle  deeplearning.ipynb  read.py


In [8]:
with open('data.pickle', 'wb') as f:
    # Pickle the 'data' pandas dataframe using the highest protocol available.
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
f.close()

In [None]:
import pprint
pkl_file = open('data.pickle', 'rb')

data = pickle.load(pkl_file)
pprint.pprint(data)

pkl_file.close()