In [24]:
import argparse
import os
import sys
from threading import Thread
from time import sleep

import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import cv2

# Cravl and preprocess

In [2]:
def __download_and_save_image(link, directory, src='fotolia'):
    print("Attempting to download: " + link)
    r = requests.get(link)
    if r.status_code == 200:

        # depends on source
        if src == 'fotolia':
            try:
                filename = r.headers['Content-Disposition'].split('filename="')[1][:-2]
            except:
                print("No Content-Disposition header present.")
                return
        elif src == 'istock':
            try:
                filename = r.headers['Content-Disposition'].split('filename=')[1]
            except:
                print("No Content-Disposition header present.")
                return

        filename = os.sep.join([directory, filename])
        print("Saving to filename: %s " % (filename))
        with open(filename, 'wb') as f:
            f.write(r.content)
    else:
        print("Couldn't download from link: " + link)

## Fotolia

In [7]:
fotolia_download_button = 'comp-download-buttons row-large'

def _get_image_url_fotolia(base_url, minVal, directory, index=0, num_retries=5):
    img_url = ""
    retries = 0
    while retries < num_retries:
        # try
        r = requests.get(base_url + str(minVal + index))
        if r.status_code == 200:
            soup = bs(r.content)
            row = soup.find_all(attrs={'class': fotolia_download_button})
            # check row
            if len(row) > 0:
                link = row[0].findChildren()[0]
                if 'href' in link.attrs:
                    img_url = link.attrs['href']
                    __download_and_save_image(img_url, directory)
                else:
                    print("Error, check: ")
                    print(link)
            else:
                print("There is no image download button.")

            break
        else:
            retries += 1

    return img_url

# function to scrape from fotolia
def fotolia_scrape(directory, minVal=137840645, n_images=100):
    # make the dir first
    if not os.path.isdir(directory):
        os.mkdir(directory)

    base_url = "https://www.fotolia.com/Content/Comp/"
    image_url_list = []
    index = 0

    # check thread list
    thread_list = []

    # start threads
    for index in range(n_images):
        th = Thread(target=_get_image_url_fotolia, args=(base_url, minVal, directory, index))
        thread_list.append(th)
        th.start()

    # join
    for th in thread_list:
        th.join()

In [19]:
directory = './images/fotolia'
fotolia_scrape(directory, n_images=200)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840645_ixMxoXghtHBthFfd6AQi9au48HaGfl6D_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840645.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840646_UdyEeJz5LevW36AeVUkhpDEr9qUVkVe4_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840646.jpg 
There is no image download button.
There is no image download button.
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840649_BStlsq7JkcxkDs1IICixX0aYoUCjmKlo_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840649.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840650_k0AXaJDlzemgyNgbEBesgtpUCDPehRJI_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840650.jpg 
There is no image download button.
Attempting to download: https://download.fotolia.com/Content/CompImage500/Fotol

Saving to filename: ./images/fotolia/fotolia_137840702.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840703_NoSloAoIv4CF8AV5ZlqHHrdDuXzrKJXY_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840703.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840704_EF4fN8l8qVyjlTCqHQFu48O4e5Fip2It_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840704.jpg 
There is no image download button.
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840706_VOvbA4pxvemLvu6nyZTRlm6JWO8jtqKE_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840706.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840707_jK5qsMTDG6aNjLqENvHjRJyUltyIQeM0_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840707.jpg 
There is no image download button.
There is no image download button.
Attempting to downl

Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840759_G37i6LZtCyxpGyk7vvR9gOhV3Tbi91dF_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840759.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840760_OwskdA0CRzkn0z5jWkMJUBTrHR8NHLOV_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840760.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840761_pJBz2yETGj3MzHmYak281ZuolMkeVEPB_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840761.jpg 
There is no image download button.
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840763_68TGv5qzqtXMSLkB7BRkQVQwpnRZ8h64_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840763.jpg 
There is no image download button.
There is no image download button.
There is no image download button.
Attempting to download: https://download.fot

Saving to filename: ./images/fotolia/fotolia_137840817.jpg 
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840818_SgvPW5XemojuSkIRyzgaDwK28u87A7CS_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840818.jpg 
Attempting to download: https://v.ftcdn.net/01/37/84/08/700_F_137840819_XrlkHL9WYwyxSht64oAmXnmWYvMr82YW.mp4
No Content-Disposition header present.
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840820_JpAzPPHt8HQ4Z2Wk5L79zhhzmOf1SpYb_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840820.jpg 
There is no image download button.
There is no image download button.
There is no image download button.
There is no image download button.
Attempting to download: https://download.fotolia.com/Content/CompImage500/FotoliaComp_137840825_iqhcRJxj6QdNerXpwZNYgH06gf34XOj5_W95?download=1
Saving to filename: ./images/fotolia/fotolia_137840825.jpg 
Attempting to download: https://download

## Istock

In [17]:
istock_base_download_button = 'asset-link draggable'

def _get_istock_page_and_download(link, directory):
    _media_url = "media.istockphoto.com"
    r = requests.get(link)
    if r.status_code == 200:
        soup = bs(r.content)
        img = []
        img = list(filter(lambda x: _media_url in x.attrs['src'],
                     list(filter(lambda x: 'src' in x.attrs, soup.find_all('img')))))
        if img == []:
            print("Cannot find image.")
        else:
            img_link = img[0].attrs['src']
            __download_and_save_image(img_link, directory, src='istock')
    else:
        print("Cannot connect to : " + link)

def istock_scrape(directory, topic="abstract", n_images=100):
    ## iStock blocks you, be careful
    # raise NotImplementedError("iStockPhotos blocks you, be careful.")

    webpage = "https://www.istockphoto.com"
    base_search_url = "http://www.istockphoto.com/in/photos/%s" % topic

    r = requests.get(base_search_url)
    links_list = []
    if r.status_code == 200:
        soup = bs(r.content)
        links = map(lambda x: webpage + x.attrs['href'], soup.find_all(attrs={'class': istock_base_download_button}))
        links_list += links

        nextPageLink = soup.find_all(attrs={'id': 'next-gallery-page'})
        print("Moving to next page.")
        sleep(0.5)

        while (nextPageLink != [] and len(links_list) < n_images):
            href = webpage + nextPageLink[0].attrs['href']
            r = requests.get(href)
            if r.status_code == 200:
                soup = bs(r.content)
                links = map(lambda x: webpage + x.attrs['href'],
                            soup.find_all(attrs={'class': istock_base_download_button}))
                links_list += links
                nextPageLink = soup.find_all(attrs={'id': 'next-gallery-page'})
                print("Moving to next page.")
            else:
                nextPageLink = []
                print("No next page found.")

        thread_list = []
        ## we have the list of link, go to each link and download it
        for link in links_list:
            th = Thread(target=_get_istock_page_and_download, args=(link, directory))
            thread_list.append(th)
            th.start()
            th.join()
            sleep(1)

    # for th in thread_list:
    # 	th.join()


In [18]:
directory = './images/istock'
istock_scrape(directory, n_images=150, topic='mountains')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Moving to next page.
Moving to next page.
Moving to next page.




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Attempting to download: https://media.istockphoto.com/photos/starry-night-picture-id519760984
Saving to filename: ./images/istock/519760984.jpg 
Attempting to download: https://media.istockphoto.com/photos/hase-and-panoramic-view-swiss-alps-picture-id517866756
Saving to filename: ./images/istock/517866756.jpg 
Attempting to download: https://media.istockphoto.com/photos/late-autumn-sunset-on-alpine-pastures-and-mountains-in-austria-picture-id500369068
Saving to filename: ./images/istock/500369068.jpg 
Attempting to download: https://media.istockphoto.com/photos/misty-summer-mountain-hills-landscape-picture-id509636590
Saving to filename: ./images/istock/509636590.jpg 
Attempting to download: https://media.istockphoto.com/photos/alberta-wilderness-near-banff-picture-id583809524
Saving to filename: ./images/istock/583809524.jpg 
Attempting to download: https://media.istockphoto.com/photos/misty-blue-mountains-on-sunrise-picture-id613111906
Saving to filename: ./images/istock/613111906.jp

Saving to filename: ./images/istock/537215344.jpg 
Attempting to download: https://media.istockphoto.com/photos/silhouettes-of-hikers-at-sunset-picture-id483629308
Saving to filename: ./images/istock/483629308.jpg 
Attempting to download: https://media.istockphoto.com/photos/winter-landscape-picture-id637900284
Saving to filename: ./images/istock/637900284.jpg 
Attempting to download: https://media.istockphoto.com/photos/moraine-lake-in-banff-national-park-canada-picture-id500177214
Saving to filename: ./images/istock/500177214.jpg 
Attempting to download: https://media.istockphoto.com/photos/rock-mountain-on-white-background-picture-id510843026
Saving to filename: ./images/istock/510843026.jpg 
Attempting to download: https://media.istockphoto.com/photos/reflection-of-mountains-and-clouds-in-the-lake-picture-id512829000
Saving to filename: ./images/istock/512829000.jpg 
Attempting to download: https://media.istockphoto.com/photos/loneley-camper-under-milky-way-at-matterhorn-picture-id

Attempting to download: https://media.istockphoto.com/photos/austrian-alps-starting-famous-krimml-waterfalls-picture-id611622826
Saving to filename: ./images/istock/611622826.jpg 
Attempting to download: https://media.istockphoto.com/photos/young-hiker-drinking-stream-water-picture-id537405077
Saving to filename: ./images/istock/537405077.jpg 
Attempting to download: https://media.istockphoto.com/photos/young-couple-in-piggyback-ride-on-the-snow-mountain-picture-id587944098
Saving to filename: ./images/istock/587944098.jpg 
Attempting to download: https://media.istockphoto.com/photos/female-rock-climber-hanging-over-the-abyss-picture-id509492031
Saving to filename: ./images/istock/509492031.jpg 
Attempting to download: https://media.istockphoto.com/photos/young-couple-skiers-having-fun-and-jumping-in-the-air-picture-id610862276
Saving to filename: ./images/istock/610862276.jpg 
Attempting to download: https://media.istockphoto.com/photos/adventures-on-the-dolomites-with-dog-picture-id4

Saving to filename: ./images/istock/514773355.jpg 
Attempting to download: https://media.istockphoto.com/photos/teamwork-couple-climbing-helping-hand-picture-id475294806
Saving to filename: ./images/istock/475294806.jpg 
Attempting to download: https://media.istockphoto.com/photos/night-mountains-before-sunrise-in-the-egypt-picture-id618428174
Saving to filename: ./images/istock/618428174.jpg 
Attempting to download: https://media.istockphoto.com/photos/bierstadt-lake-reflection-picture-id539962652
Saving to filename: ./images/istock/539962652.jpg 
Attempting to download: https://media.istockphoto.com/photos/group-four-people-mountains-travel-concept-picture-id857293320
Saving to filename: ./images/istock/857293320.jpg 
Attempting to download: https://media.istockphoto.com/photos/group-of-people-on-peak-mountain-picture-id537417802
Saving to filename: ./images/istock/537417802.jpg 
Attempting to download: https://media.istockphoto.com/photos/road-in-colorado-picture-id469775383
Saving 

## Preprocess

In [38]:
def preprocess(foldername, size=500, suffix="_processed"):
    dest_folder = foldername + suffix
    processed = os.path.abspath(dest_folder)

    if os.path.exists(processed):
        print ("Directory %s already exists." % (processed))
        return None

    os.mkdir(dest_folder)

    for root, dirs, files in os.walk(foldername):
        for file in files:
            path = (os.sep.join([os.path.abspath(root), file]))
            img = cv2.imread(path)
            if img is not None:
                m, n, p = img.shape
                m_t, n_t = (size - m) // 2, (size - n) // 2
                final_img = np.pad(img, ((m_t, size - m - m_t), (n_t, size - n - n_t), (0, 0)), mode='constant')
                cv2.imwrite(os.sep.join([dest_folder, file]), final_img)
                print("Saved to : %s" % (file))
                print(final_img.shape)

In [40]:
preprocess('./images/fotolia', size=500)
preprocess('./images/istock', size=1024)

Saved to : fotolia_137840725.jpg
(500, 500, 3)
Saved to : fotolia_137840731.jpg
(500, 500, 3)
Saved to : fotolia_137840686.jpg
(500, 500, 3)
Saved to : fotolia_137840645.jpg
(500, 500, 3)
Saved to : fotolia_137840679.jpg
(500, 500, 3)
Saved to : fotolia_137840650.jpg
(500, 500, 3)
Saved to : fotolia_137840693.jpg
(500, 500, 3)
Saved to : fotolia_137840718.jpg
(500, 500, 3)
Saved to : fotolia_137840730.jpg
(500, 500, 3)
Saved to : fotolia_137840724.jpg
(500, 500, 3)
Saved to : fotolia_137840726.jpg
(500, 500, 3)
Saved to : fotolia_137840646.jpg
(500, 500, 3)
Saved to : fotolia_137840652.jpg
(500, 500, 3)
Saved to : fotolia_137840653.jpg
(500, 500, 3)
Saved to : fotolia_137840733.jpg
(500, 500, 3)
Saved to : fotolia_137840737.jpg
(500, 500, 3)
Saved to : fotolia_137840723.jpg
(500, 500, 3)
Saved to : fotolia_137840680.jpg
(500, 500, 3)
Saved to : fotolia_137840694.jpg
(500, 500, 3)
Saved to : fotolia_137840657.jpg
(500, 500, 3)
Saved to : fotolia_137840656.jpg
(500, 500, 3)
Saved to : fo