In [1]:
reset -fs

In [2]:
# import all necessary modules
import json
import seaborn as sns
import datalab.storage as storage
import datalab.bigquery as bq
import pandas as pd
import numpy as np
import urllib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import PIL.Image as img
import glob
import os, sys
from IPython.display import Image, display
from scipy.misc import imread
from keras.preprocessing.image import array_to_img, img_to_array, load_img
%matplotlib inline

Using TensorFlow backend.


### The number of images of each class we want to download.

In [3]:
NUMBER_OF_IMAGES = 10000

### Getting the URLs of the good images from BigQuery.

In [4]:
%%sql --module epicurious_images_data
SELECT images
FROM [wellio-kadaif:recipes.recipes]
WHERE substring(url, 1, instr(url, '.com')+4) = 'http://www.epicurious.com/'
AND images IS NOT NULL
LIMIT 10000

### Executing the above query.

In [5]:
%%bigquery execute -q epicurious_images_data

images
http://assets.epicurious.com/photos/560df93d7b55306961bfe935/master/pass/108893.jpg
http://assets.epicurious.com/photos/560df9317b55306961bfe911/master/pass/108831.jpg
http://assets.epicurious.com/photos/560d7921f3a00aeb2f1cb721/6:4/w_620%2Ch_413/350226_hires.jpg
http://assets.epicurious.com/photos/5640f1459978979816f029fe/6:4/w_620%2Ch_413/sweet-potato-mac-and-cheese.jpg
http://assets.epicurious.com/photos/560d79a1f3a00aeb2f1cb83f/6:4/w_620%2Ch_413/351035_hires.jpg
http://assets.epicurious.com/photos/573e308fd44d8a8c0ea68d18/6:4/w_620%2Ch_413/51251020_cinnamon-rolls_6x4.jpg
http://assets.epicurious.com/photos/560df28ef3a00aeb2f1d5d4c/master/pass/230451.jpg
http://assets.epicurious.com/photos/560347257bdffb1205b62eaa/master/pass/51189610.jpg
http://assets.epicurious.com/photos/54b29493a801766f773fb2a0/6:4/w_620%2Ch_413/367722_tangerine-souffle_1x1.jpg
http://assets.epicurious.com/photos/560df8b7f9a84192308a8c62/master/pass/108132.jpg


### Converting query into a Pandas dataframe.

In [6]:
epicurious_images_df = bq.Query(epicurious_images_data).to_dataframe()

### Creating a new Pandas dataframe with a sample of 10,000 URLs.

In [7]:
sample_epicurious_images_df = epicurious_images_df.sample(n=10000)

### Creating a list out of the sampled Pandas dataframe.

In [8]:
sample_epicurious_images_list = sample_epicurious_images_df['images'].tolist()

### Location of the good images.

In [9]:
folder_ok = 'data/downloads/ok'

### Removing current images.

In [10]:
# Delete images to download again.
dirPath = folder_ok
fileList = os.listdir(dirPath)
for fileName in fileList:
  os.remove(dirPath + '/' + fileName)

### Downloading images.

In [11]:
i = 1
for item in sample_epicurious_images_list:
  if i < (NUMBER_OF_IMAGES + 1):
    urllib.urlretrieve(str(item), folder_ok + '/' + str(i).zfill(5) + '.jpg')
    i += 1

### Now, proceeding exactly as above for bad images.

In [12]:
%%sql --module fooddotcom_images_data
SELECT images
FROM [wellio-kadaif:recipes.recipes]
WHERE substring(url, 1, instr(url, '.com')+4) = 'http://www.food.com/'
AND images IS NOT NULL
LIMIT 10000

In [13]:
%%bigquery execute -q fooddotcom_images_data

images
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/49/81/92/pictf9Haa.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/48/05/62/picxLoBBP.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/38/12/61/picjqAKLt.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/47/20/36/picnzMkEV.jpg"
http://pictures.food.com/api/file/ikuPuw3ITxW3KTWY0Sn9-baked-potatoes.png/convert?loc=/pictures.food.com/recipes/51/44/26/2Mnv6Y58SxWkPXgbUQ6e_baked%20potatoes.png&width=614&height=461&fit=crop
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/38/01/58/picQ2oaM4.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/38/01/58/picBmb7Ir.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/46/93/99/picODQyGX.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/20/51/28/picVNtpEz.jpg"
"http://img.sndimg.com/food/image/upload/w_614,h_461,c_fit/v1/img/recipes/46/29/89/picBnoVvK.jpg"


In [14]:
fooddotcom_images_df = bq.Query(fooddotcom_images_data).to_dataframe()

In [15]:
sample_fooddotcom_images_df = fooddotcom_images_df.sample(n=10000)

In [16]:
sample_fooddotcom_images_list = sample_fooddotcom_images_df['images'].tolist()

In [17]:
folder_nok = 'data/downloads/nok'

In [18]:
# Delete images to download again.
dirPath = folder_nok
fileList = os.listdir(dirPath)
for fileName in fileList:
  os.remove(dirPath + '/' + fileName)

In [19]:
i = 1
for item in sample_fooddotcom_images_list:
  if i < (NUMBER_OF_IMAGES + 1):
    urllib.urlretrieve(str(item), folder_nok + '/' + str(i).zfill(5) + '.jpg')
    i += 1

### Copying downloaded images to a bucket.

In [None]:
!gsutil cp -r 'data' 'gs://wellio-kadaif-tasty-images-project-images'

Copying file://data/downloads/ok/05763.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/04257.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/06938.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/04467.jpg [Content-Type=image/jpeg]...
- [4 files][260.6 KiB/260.6 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m -o ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://data/downloads/ok/02872.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/04146.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/09387.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/05421.jpg [Content-Type=image/jpeg]...
Copying file://data/downloads/ok/02734.jpg [Content-Type=image/jpeg]...
Copying file://data