In [None]:
# default_exp images

# Duckduckgo Image Downloader 
> Provides an image downloader using duckduckgo image search.
>
> This library requires selenium and chromedriver
> in order to fetch image urls
> and reuses the fastai image utilities to download the images.
>
> Because all the urls come from the duckduckgo site,
> a slow version of download_images is provided in order
> to not bombard the site with simultaneous requests.

In [None]:
#hide
from nbdev.showdoc import *

**(Optional) Run the following command to setup fastai in Colab**

In [None]:
# colab
#!curl https://course.fast.ai/setup/colab | bash

### Setup Instructions 

1. Install Chrome Driver (done one-time per environment only)

**Make sure that chrome and chromedriver are both accessible in the PATH variable**

In [None]:
# chrome_linux_install
# install chromium, its driver
!apt-get update
!apt install chromium-chromedriver


2. Install python libraries (done one-time per environment only)

In [None]:
# pip_install
# install python libraries and fastai2
!pip install selenium
!pip install beautifulsoup4
!pip install html5lib
!pip install fastcore
!pip install fastai

**In actual usage, the following pip install pulls in all the requirements, so the separate library installation step above becomes redundant.**
```
pip install git+https://github.com/butchland/ddg_images_downloader.git
```

In [None]:
#hide
#!pip install git+https://github.com/butchland/ddg_images_downloader.git

### Setup selenium webdriver to use headless Chrome

In [None]:
# exporti
# set options to be headless, ..
from selenium import webdriver
# Module-level Chrome options; used as the default `options` argument of the search helpers in this file.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without opening a window
chrome_options.add_argument('--no-sandbox')  # NOTE(review): presumably required when running as root (Colab/containers) — confirm
chrome_options.add_argument('--disable-dev-shm-usage')  # NOTE(review): presumably avoids crashes when /dev/shm is small — confirm

In [None]:
# exporti
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#### Automate search via selenium accessing DDG Image Search

> The following methods fetch the image urls retrieved from DDG Image Search into files

In [None]:
# exporti
import time
DDG_SEARCH_URL = 'https://start.duckduckgo.com'
# DDG_IMAGE_SEARCH_URL = 'https://duckduckgo.com/?q=%s&t=h_&iar=images&iax=images&ia=images'

def get_browser_page(search_string, options=chrome_options, scroll_count=20, sleep=3):
    """Retrieve the page source of a ddg image search for `search_string`.

    Opens `DDG_SEARCH_URL` in a Chrome instance configured with `options`, submits
    `search_string` through the homepage search form, switches to the images tab and
    scrolls to the bottom of the page `scroll_count` times, pausing `sleep` seconds
    after each scroll so that more results can load.

    Returns the page source (a string) as it stands after the last scroll.
    Each element lookup waits up to 10 seconds; whatever the wait or the clicks raise
    is propagated to the caller, after the browser has been shut down.
    """
    browser = webdriver.Chrome(options=options)
    # Always quit the browser, even when a wait times out or a click fails;
    # otherwise every failed search would leave a Chrome process running.
    try:
        browser.get(DDG_SEARCH_URL)
        wait = WebDriverWait(browser,10)
        input_elt = wait.until(
            EC.presence_of_element_located((By.ID, "search_form_input_homepage"))
        )
        # `find_element(By.ID, ...)` instead of the `find_element_by_id` shortcut,
        # which newer selenium releases no longer provide
        submit_elt = browser.find_element(By.ID, 'search_button_homepage')
        input_elt.send_keys(search_string)
        submit_elt.click()
        images_link = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".js-zci-link--images"))
        )
        images_link.click()
        # wait for the first image tile before starting to scroll
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".tile--img__img"))
        )
        for _ in range(scroll_count):
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(sleep)
        return browser.page_source
    finally:
        browser.quit()

In [None]:
from fastcore.test import *
import operator

In [None]:
page_source = get_browser_page('coconut fruit',scroll_count=3,sleep=1);len(page_source)

447170

In [None]:
test_ne(page_source,None)
test_eq(type(page_source),type('a string'))
test_ne(len(page_source),0)

In [None]:
# slow
page_source = get_browser_page('teddy bear');len(page_source)

843251

In [None]:
# exporti
from bs4 import BeautifulSoup as bs
def extract_images(page_source):
    "Parse `page_source` with the html5lib parser and return all `img` elements carrying the `tile--img__img` class"
    return bs(page_source,"html5lib").find_all("img",attrs={"class":"tile--img__img"})

In [None]:
img_elts = extract_images(page_source);len(img_elts)

736

In [None]:
test_ne(img_elts,None)
test_ne(len(img_elts),0)

In [None]:
# exporti
import re
from urllib.parse import unquote
# `data-src` values look like `//external-content.duckduckgo.com/iu/?u=<quoted url>`;
# the single capture group is everything after `u=`
DATA_SRC_URL_PATTERN = r'^//external-content\.duckduckgo\.com/iu/\?u=(.*)$'

def extract_url(img_elt):
    """Return the unquoted image url stored in the `data-src` attribute of `img_elt`.

    `img_elt` is any object exposing an `attrs` mapping (such as the elements returned
    by `extract_images`). Returns None when the attribute is missing or empty, or when
    its value does not match `DATA_SRC_URL_PATTERN`.
    """
    # `.get` rather than indexing: an element without `data-src` gives None instead of raising KeyError
    data_src = img_elt.attrs.get('data-src')
    if not data_src:
        return None
    url_match = re.match(DATA_SRC_URL_PATTERN,data_src)
    return unquote(url_match.group(1)) if url_match else None

def extract_img_urls(img_elts):
    "Return the url extracted by `extract_url` from each element of `img_elts`, leaving out elements that give None"
    found = []
    for elt in img_elts:
        url = extract_url(elt)
        if url is None:
            continue
        found.append(url)
    return found


In [None]:
img_urls = extract_img_urls(img_elts);len(img_urls)

736

In [None]:
(img_urls[0],img_urls[50], img_urls[99])

('https://tse2.mm.bing.net/th?id=OIP.1U8UEYu_GHNSEKHqWHKRzgHaHa&pid=Api&f=1',
 'https://tse3.mm.bing.net/th?id=OIP.XDaRobDwpBbfJfVIBotOfQHaIM&pid=Api&f=1',
 'https://tse4.mm.bing.net/th?id=OIP.o4v5D8fRplBh1ExN0Wf9_AHaJ4&pid=Api&f=1')

In [None]:
test(len(img_urls),0,operator.gt)
test_eq('https://',img_urls[0][:8])

#### Get Image URLs

> get image urls given a search string while scrolling down (with sleep in between scrolls)

In [None]:
# export
# combine everything into 1 
def get_image_urls(search_string,options=chrome_options,scroll_count=20,sleep=3):
    "search ddg images for `search_string`, scrolling down `scroll_count` times with `sleep` seconds between scrolls, and return the image urls found as a list"
    html = get_browser_page(search_string,options=options,scroll_count=scroll_count,sleep=sleep)
    # page source -> img elements -> decoded urls
    return extract_img_urls(extract_images(html))
  


In [None]:
img_urls = get_image_urls('apple fruit', scroll_count=3,sleep=3)

In [None]:
test(len(img_urls),0, operator.gt)
test_eq('https://', img_urls[0][:8])

In [None]:
# exporti
from pathlib import Path
# from fastcore.utils import *

#### Download search terms retrieved image urls into files

> This saves the image urls found for each search term into a separate file

In [None]:
# export
def download_search_url(search_item, file_path, options=chrome_options, scroll_count=20, sleep=3):
    """Save the image urls found for `search_item` into the text file `file_path`.

    The urls come from `get_image_urls` (called with `options`, `scroll_count` and
    `sleep`) and are written one per line, replacing any existing content of
    `file_path`. The search item and the number of urls found are printed.
    """
    results = get_image_urls(search_item, options=options, scroll_count=scroll_count, sleep=sleep)
    print(search_item, len(results))
    # explicit utf-8: the urls have been unquoted and may hold non-ascii characters,
    # which the platform default encoding is not guaranteed to be able to write
    with open(file_path,'w',encoding='utf-8') as f:
        f.writelines("%s\n" % line for line in results)

In [None]:
# export
def download_search_urls(path,search_terms, search_pattern, file_pattern='{0}.txt',
                      options=chrome_options,scroll_count=20,sleep=3):
    """Save the image search results for each of `search_terms` into its own file under `path`.

    For every term, the search string is `search_pattern.format(term)` and the urls are
    written to `path/file_pattern.format(term)` by `download_search_url`, which also
    receives `options`, `scroll_count` and `sleep`.
    `path` may be a string or a `Path`; it is created (with parents) when missing.
    """
    # accept plain strings too, and create missing parent folders instead of failing
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)
    for term in search_terms:
        download_search_url(search_pattern.format(term),path/file_pattern.format(term),
                            options=options,scroll_count=scroll_count, sleep=sleep)

#### Example
> Download Bear Images for fastai lesson on creating your own dataset using bears

In [None]:
from fastcore.utils import * # decorate path

In [None]:
# slow
!rm -rf bears
bear_types = ['grizzly','black','teddy']
path = Path('bears')
download_search_urls(path, bear_types, '{0} bear', scroll_count=11)

In [None]:
bear_types = ['panda']
path = Path('bears')
download_search_urls(path, bear_types, '{0} bear', scroll_count=3)

panda bear 360


In [None]:
bear_path = path.ls();bear_path

(#3) [Path('bears/.DS_Store'),Path('bears/panda.txt'),Path('bears/panda')]

In [None]:
test_is(Path('bears/panda.txt') in bear_path,True)

### Download images from files stored in path

> The following functions download the images from the urls stored in a file.

In [None]:
# exporti
# externalized version of fastai2 internal method _download_image (copied exactly)
# see https://github.com/fastai/fastai2/blob/f9231256e2a8372949123bda36e44cb0e1493aa2/fastai2/vision/utils.py#L11
from fastai.data.external import download_url
def download_image_inner(dest, inp, timeout=4):
    """Download a single image into the folder `dest`.

    `inp` is an `(index, url)` pair. The file is named with the index zero-padded to
    8 digits, followed by the first dot-plus-word-characters run in `url` that is
    directly followed by `?` or the end of the url; `.jpg` is used when there is none.
    `timeout` is passed on to `download_url`.

    A failed download is reported with a printed message and never raised, so one bad
    url does not stop the remaining downloads.
    """
    i,url = inp
    suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
    suffix = suffix[0] if len(suffix)>0  else '.jpg'
    try: download_url(url, dest/f"{i:08d}{suffix}", overwrite=True, show_progress=False, timeout=timeout)
    # the message used to be a bare string expression, i.e. it was built and thrown away
    except Exception: print(f"Couldn't download {url}.")

#### Slow version of download images 
> This avoids overwhelming the duckduckgo server with simultaneous requests:
> images are fetched one at a time, with a configurable pause between batches (3 secs per batch by default)

In [None]:
# export
def download_images_slowly(dest,url_file=None, urls=None, max_pics=1000, delay=3, batch_size=10,timeout=4):
    """Download images one at a time into `dest`, pausing `delay` secs after every `batch_size` images.

    The urls are taken from `urls` when given. Otherwise they are read from `url_file`
    (one url per line), keeping at most the first `max_pics`; `max_pics` is not applied
    to an explicit `urls` list. `dest` is created (with parents) when missing.
    Each image is fetched by `download_image_inner` with `timeout`, and progress is
    printed at the end of every batch.
    """
    # NOTE(review): relies on `url_file` having a `.read()` method; a plain pathlib.Path has none,
    # so this is presumably supplied by fastcore's Path extensions — confirm
    if urls is None: urls = url_file.read().strip().split("\n")[:max_pics]
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    for inp in enumerate(urls):
        download_image_inner(dest,inp,timeout=timeout)
        # count of images handled so far; testing the 0-based index instead made the
        # first "batch" a single image and reported one image too few
        done = inp[0] + 1
        if done % batch_size == 0:
            print('downloaded: ', done, ' dest: ', dest)
            time.sleep(delay)

In [None]:
# export
def download_search_images_slowly(path, search_terms, file_pattern='{0}.txt', 
                                  max_pics=1000, delay=3, batch_size=10,timeout=4):
    "For each item in `search_terms`, download the images listed in the url file `path`/`file_pattern.format(item)` into the folder `path`/item, passing `max_pics`, `delay`, `batch_size` and `timeout` on to `download_images_slowly`"
    # the same throttling settings are used for every search term
    settings = dict(max_pics=max_pics, delay=delay, batch_size=batch_size, timeout=timeout)
    for term in search_terms:
        image_dir = path/term
        url_file = path/file_pattern.format(term)
        download_images_slowly(image_dir, url_file, **settings)

In [None]:
# slow
download_search_images_slowly(path,bear_types)

downloaded:  0  dest:  bears/panda
downloaded:  10  dest:  bears/panda
downloaded:  20  dest:  bears/panda
downloaded:  30  dest:  bears/panda
downloaded:  40  dest:  bears/panda
downloaded:  50  dest:  bears/panda
downloaded:  60  dest:  bears/panda
downloaded:  70  dest:  bears/panda
downloaded:  80  dest:  bears/panda
downloaded:  90  dest:  bears/panda
downloaded:  100  dest:  bears/panda
downloaded:  110  dest:  bears/panda
downloaded:  120  dest:  bears/panda
downloaded:  130  dest:  bears/panda
downloaded:  140  dest:  bears/panda
downloaded:  150  dest:  bears/panda
downloaded:  160  dest:  bears/panda
downloaded:  170  dest:  bears/panda
downloaded:  180  dest:  bears/panda
downloaded:  190  dest:  bears/panda
downloaded:  200  dest:  bears/panda
downloaded:  210  dest:  bears/panda
downloaded:  220  dest:  bears/panda
downloaded:  230  dest:  bears/panda
downloaded:  240  dest:  bears/panda
downloaded:  250  dest:  bears/panda


KeyboardInterrupt: 

In [None]:
# slow
bear_path = path.ls()
test_is(Path('bears/panda') in bear_path,True)
panda_bears = (path/'panda').ls()
test(len(panda_bears),0,operator.gt)
test_eq(panda_bears[0].as_posix()[:14],'bears/panda/00')

In [None]:
from nbdev.export import *
notebook2script()