In [1]:
%matplotlib inline

In [2]:
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from PIL import Image
from io import BytesIO
import json
import slugify
import os
import tqdm
import pandas as pd

import logging
import time

In [3]:
# Cache for the request headers: stays None until get_headers() builds the dict.
headers = None


def get_with_retry(url, headers=None, params=None, max_retries=10, timeout=30):
    """Issue a GET request, retrying when the request itself fails.

    Parameters
    ----------
    url : str
        Address to fetch.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.
    max_retries : int
        Maximum number of attempts before giving up.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get`` so that a stalled
        connection counts as a failed attempt instead of blocking forever.

    Returns
    -------
    requests.Response or None
        The first response obtained, or ``None`` when every attempt failed.
        The HTTP status code is not inspected here.
    """
    response = None

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(
                url, headers=headers, params=params, timeout=timeout)
            break
        except requests.exceptions.RequestException as exc:
            # Only request-level failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being silently eaten.
            logging.warning(
                'GET %s failed (attempt %d/%d): %s',
                url, attempt, max_retries, exc)

            # No point in pausing once there is no attempt left.
            if attempt < max_retries:
                time.sleep(0.1)

    return response


def get_headers():
    """Return the request headers, building and caching them on first use.

    The headers are parsed from a ``name:value`` listing (one per line) and
    stored in the module-level ``headers`` variable; later calls return that
    same dict object.
    """
    global headers

    # Already built (or preset by someone else): hand back the cached object.
    if headers is not None:
        return headers

    raw_listing = """accept:*/*
accept-language:en-US,en;q=0.9
user-agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"""

    parsed = {}
    for entry in raw_listing.split('\n'):
        # Split on the first colon only; the value keeps any later colons.
        name, _, value = entry.partition(':')
        parsed[name] = value

    headers = parsed
    return headers


def process_rg_bx(rg_str):
    """Build one image record from a serialized ``rg_bx`` result box.

    ``rg_str`` is the HTML of the box as a string. The JSON found in its
    first ``<div class="rg_meta">`` supplies the metadata, and the image
    behind the ``tu`` link is downloaded and opened.

    Returns a dict with the keys ``title``, ``image``, ``caption``,
    ``image_link``, ``record_id`` and ``image_type``. Everything except the
    image is also written to the log as JSON.
    """
    # The box is passed as a string and re-parsed here because, per the
    # original author's note, joblib raises a runtime error when handed a
    # bs4 object directly.
    box = BeautifulSoup(rg_str, "html.parser")

    meta_node = box.find_all('div', {"class": "rg_meta"})[0]
    metadata = json.loads(meta_node.text)

    image_link = metadata.get("tu")  # Link to small image
    download = get_with_retry(image_link, headers=get_headers())

    data = {
        'title': metadata.get("pt"),
        'image': Image.open(BytesIO(download.content)),
        'caption': metadata.get("s"),
        'image_link': image_link,
        'record_id': metadata.get("id"),
        'image_type': metadata.get("ity"),
    }

    # The image object is not JSON-serializable, so it is left out of the log.
    loggable = {key: value for key, value in data.items() if key != 'image'}
    logging.info(json.dumps(loggable))

    return data


def search_image(query, num_pages=1, batch=100, n_jobs=5):
    '''
    Scrape Google image-search result pages for a query.

    query: query terms
    num_pages: number of result pages to request; page numbers are sent as
        the ``ijn`` request parameter (0 .. num_pages - 1)
    batch: page size used to compute the ``start`` request parameter
        (``start = ijn * batch``)
    n_jobs: number of joblib workers used to process the result boxes of a page

    Returns a list with one record (as built by process_rg_bx) for every
    ``rg_bx`` box found on the pages that could be fetched. Pages whose
    request failed, or that contain no result box, are skipped with a
    warning in the log.
    '''

    params = dict(
        ei="wX7tWs60EpfGvwTnzrrwCg",
        yv="3",
        tbm="isch",
        q=query,
        vet="10ahUKEwiOmYSbpu7aAhUX448KHWenDq4QuT0IlQIoAQ.wX7tWs60EpfGvwTnzrrwCg.i",
        ved="0ahUKEwiOmYSbpu7aAhUX448KHWenDq4QuT0IlQIoAQ",
        ijn=0,
        start=0,
        asearch="ichunk",
        # async="_id:rg_s,_pms:s,_fmt:pc",
    )

    url = "https://www.google.com.ph/search"
    dataset = []

    for ijn in range(0, num_pages):
        loginfo = 'Query: {} Page: {}'.format(query, ijn)
        logging.info(loginfo)
        print(loginfo)

        params["ijn"] = ijn
        params["start"] = ijn * batch

        response = get_with_retry(url, params=params, headers=get_headers())
        if response is None:
            # get_with_retry gave up: drop this page but keep what was
            # collected so far instead of failing on ``None.json()``.
            logging.warning(
                'No response for query %r page %d; skipping page', query, ijn)
            continue

        # The HTML fragment with the results sits at [1][1] of the JSON payload.
        html_doc = response.json()[1][1]

        b = BeautifulSoup(html_doc, "html.parser")
        rg_bx = b.find_all("div", {"class": "rg_bx"})

        if len(rg_bx) == 0:
            logging.warning(
                'No results for query %r page %d', query, ijn)
            print(ijn)
            # Nothing to process; do not start workers for an empty page.
            continue

        # Boxes are passed as strings; see the note in process_rg_bx.
        dataset.extend(
            Parallel(n_jobs=n_jobs)(
                delayed(process_rg_bx)(str(rg)) for rg in rg_bx))

    return dataset

In [4]:
key_search = 'common-animals'

# Fetch the Google results page for the key search; the dashes of the key
# become '+' in the query string.
ca_response = requests.get(
    'https://www.google.com.ph/search?q={}'.format(key_search.replace('-', '+')),
    headers=get_headers())
ca = BeautifulSoup(ca_response.content, 'html.parser')

# The text of every <div class="kltat"> element is used as one search term.
kltats = ca.find_all('div', {'class': 'kltat'})

# A real list (not a lazy ``map``) so the terms can be iterated more than
# once, and no ``unicode`` call, which does not exist on Python 3; ``.text``
# is already a text string.
search_terms = [kltat.text.lower() for kltat in kltats]

In [5]:
# Send INFO-and-above log records to a file named after the key search.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
    filename='google-scraper-{}.log'.format(key_search),
)

In [6]:
# Root directory for the downloaded images, named after key_search with '-' replaced by '_'.
image_dir = '../../data/img_{}/'.format(key_search.replace('-', '_'))

In [7]:
# Accumulates the records of every search term; the scraping loop appends to it.
full_dataset = []

In [8]:
%%time
# estimated_images_per_class = 5000
# num_pages_per_class = estimated_images_per_class // 100  # 100 -> batch per query

num_pages_per_class = 7

# NOTE: records are appended to full_dataset, so running this cell again
# without resetting that list adds every record a second time.
for st in search_terms:
    logging.info('Scraping search term: {}'.format(st))
    dataset = search_image(st, num_pages=num_pages_per_class)

    # Every image of a search term goes into one directory named after its slug.
    st_dir = os.path.join(image_dir, slugify.slugify(st))

    for record in dataset:
        image = record['image']
        fmt = image.format.lower()

        # File name is "<record_id>.<image format>".
        r_fname = record['record_id'] + '.' + fmt

        # Created lazily, so a term without results leaves no empty directory.
        if not os.path.isdir(st_dir):
            os.makedirs(st_dir)

        r_fname = os.path.join(st_dir, r_fname)
        image.save(r_fname)

        # From here on the record carries the path on disk instead of the
        # in-memory image.
        record['image'] = r_fname
        full_dataset.append(record)

Query: amphibians Page: 0
Query: amphibians Page: 1
Query: amphibians Page: 2
Query: amphibians Page: 3
Query: amphibians Page: 4
Query: amphibians Page: 5
Query: amphibians Page: 6
Query: dog Page: 0
Query: dog Page: 1
Query: dog Page: 2
Query: dog Page: 3
Query: dog Page: 4
Query: dog Page: 5
Query: dog Page: 6
Query: cat Page: 0
Query: cat Page: 1
Query: cat Page: 2
Query: cat Page: 3
Query: cat Page: 4
Query: cat Page: 5
Query: cat Page: 6
Query: bird Page: 0
Query: bird Page: 1
Query: bird Page: 2
Query: bird Page: 3
Query: bird Page: 4
Query: bird Page: 5
Query: bird Page: 6
Query: deer Page: 0
Query: deer Page: 1
Query: deer Page: 2
Query: deer Page: 3
Query: deer Page: 4
Query: deer Page: 5
Query: deer Page: 6
Query: bear Page: 0
Query: bear Page: 1
Query: bear Page: 2
Query: bear Page: 3
Query: bear Page: 4
Query: bear Page: 5
Query: bear Page: 6
Query: snakes Page: 0
Query: snakes Page: 1
Query: snakes Page: 2
Query: snakes Page: 3
Query: snakes Page: 4
Query: snakes Page: 5


In [None]:
# Images are saved as: <image_dir>/<slugified-search-term>/<record_id>.<image-format>

In [9]:
# One row per record collected in full_dataset.
full_data_df = pd.DataFrame(full_dataset)

In [10]:
# Preview the first rows of the collected records.
full_data_df.head()

Unnamed: 0,caption,image,image_link,image_type,record_id,title
0,Collage of amphibians,../../data/img_common_animals/amphibians/tOSgy...,https://encrypted-tbn0.gstatic.com/images?q=tb...,png,tOSgylwuq-zi7M:,Amphibian - Wikipedia
1,Amphibians,../../data/img_common_animals/amphibians/BA01z...,https://encrypted-tbn0.gstatic.com/images?q=tb...,jpg,BA01zBROtA6QgM:,Amphibians | San Diego Zoo Animals & Plants
2,,../../data/img_common_animals/amphibians/TKxsc...,https://encrypted-tbn0.gstatic.com/images?q=tb...,jpg,TKxsciksiYOBcM:,amphibians-hero.jpg
3,Snake Head Pops Out of Frog's Maw in Mesmerizi...,../../data/img_common_animals/amphibians/mgGpN...,https://encrypted-tbn0.gstatic.com/images?q=tb...,,mgGpNhsvyK7htM:,Amphibians - 2018 News and Scientific Articles...
4,,../../data/img_common_animals/amphibians/a4k5F...,https://encrypted-tbn0.gstatic.com/images?q=tb...,jpg,a4k5Fu5KAIe9UM:,reticulated glass frog eggs.ngsversion.1465493...


In [11]:
# Persist the records table to HDF5, in a file named after key_search.
# ``key`` is passed by keyword: newer pandas versions deprecate passing it
# positionally, and the keyword form works on old versions as well.
full_data_df.to_hdf(
    '../../data/full_data_{}.hdf'.format(key_search.replace('-', '_')),
    key='full_data_df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block0_values] [items->['caption', 'image', 'image_link', 'image_type', 'record_id', 'title']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [12]:
# Number of distinct directories (second-to-last path component) among the saved image paths.
full_data_df['image'].map(lambda path: path.split('/')[-2]).nunique()

51

In [12]:
# %%time
# dataset = search_image('dog', num_pages=7, batch=100)

Query: dog Page: 0
Query: dog Page: 1
Query: dog Page: 2
Query: dog Page: 3
Query: dog Page: 4
Query: dog Page: 5
Query: dog Page: 6
CPU times: user 903 ms, sys: 367 ms, total: 1.27 s
Wall time: 49.8 s
