#### Pipeline to download images from Google

In [1]:
# Uncomment to install the package that provides `load_dotenv`
# (`%pip` installs into the environment of the running kernel):
# %pip install python-dotenv

In [2]:
import os
import pickle
import time
from itertools import cycle
from urllib.parse import quote_plus

import pandas as pd
import requests
from dotenv import load_dotenv

In [3]:
# Read key=value pairs from a local .env file into the process environment
# so that the os.getenv lookups below can find them.
load_dotenv()

True

Specify all paths

In [4]:
# Root folder for downloaded images. The trailing "/" is required because the
# per-model folder path is built by plain string concatenation.
data_images_path = "data/images/"

# CSV file listing the car models that still need images.
labels_to_download = "data/labels/models_missed_in_datasets.csv"

You may load your secret data (API key and search engine ID) from a `.env` file

In [5]:
# API key and Custom Search Engine ID, looked up in the process environment.
# Each value is None when the corresponding variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
CX_ID = os.environ.get("CX_ID")

You may provide several API keys (with their matching search engine IDs) to be more effective, in case you have several accounts

In [8]:
# Several API keys and their Custom Search Engine IDs, paired by position.
# NOTE: this cell replaces the single values read from the environment above.
# Fill in your own values and keep real keys out of version control.
GOOGLE_API_KEY = [
    # API KEY 1,
    # API KEY 2,
]
CX_ID = [
    # CX ID 1,
    # CX ID 2,
]


In [6]:
def download_images_for_a_query(url, query, count=0, row=0, topic=" car photo"):
    """Fetch one page of image search results for ``query`` and save the images.

    Images are written to ``<data_images_path><make>/<row>_<model>/`` where
    ``make`` is the first word of ``query`` and ``model`` is the remaining
    words joined with underscores. Files are numbered continuing from ``count``.

    Args:
        url: Search URL template containing ``{q}`` and ``{start}`` placeholders.
        query: Search phrase, e.g. ``"AUDI 100"``.
        count: Number of images already saved for this query; the search
            starts at result ``count + 1``.
        row: Row number of the label, used as a prefix of the folder name.
        topic: Text appended to ``query`` to refine the search.

    Returns:
        int: The updated total number of saved images on success. This equals
        ``count`` when the page had no results or no image could be fetched.
        ``0`` is returned when the search request fails (for example HTTP 429,
        quota reached), when an image host answers 429, or when an unexpected
        error occurs.
    """
    # URL-encode the search phrase so characters such as "&" or "#" inside a
    # model name cannot corrupt the query string.
    url = url.format(q=quote_plus(query + topic), start=count + 1)

    pieces = query.split()
    make = pieces[0]
    model = '_'.join(pieces[1:])
    query = query.replace(' ', '_')

    # The result is returned from inside the try block, not from a `finally`
    # clause: a `return` in `finally` would override the failure returns below
    # and would also swallow KeyboardInterrupt.
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            # 429 means the API access limit is reached; any other status is
            # treated as a failure of this request as well.
            return 0

        data = response.json()
        if 'items' not in data:
            # No results on this page: nothing new was downloaded.
            return count

        folder_path = data_images_path + make + '/' + f"{row}_{model}" + "/"
        os.makedirs(folder_path, exist_ok=True)

        # Walk the results from last to first, the same order the items were
        # consumed in before, so file numbering stays stable.
        for item in reversed(data['items']):
            image_url = item['link']
            try:
                response = requests.get(image_url, timeout=10)
            except requests.exceptions.RequestException:
                # Unreachable or slow image host: skip this image only.
                continue
            if response.status_code == 429:  # Access limit is reached, we have to wait
                return 0
            if response.status_code != 200:  # Most probably 403 - Forbidden user, no access
                continue

            count += 1
            with open(f"{folder_path}{query}_{count:05d}.jpg", "wb") as file:
                file.write(response.content)

        return count  # Amount of downloaded images
    except requests.exceptions.Timeout:
        print("Timeout reached")
        return 0
    except Exception as e:
        print("Exception: ", e)
        return 0

In [7]:
# Load the list of car models to download; the search phrases are later taken
# from the "models" column.
labels = pd.read_csv(labels_to_download)
labels.head(2)  # preview the first two rows

Unnamed: 0,models
0,AUDI 100
1,AUDI 80


We can specify the number of images needed.

In [9]:
# Target number of images per label: download_missing_data keeps requesting
# result pages for a label while fewer than this many images are saved.
AMOUNT_OF_IMAGES = 50

For our case we needed real photos from different sides

In [11]:
def get_topic(n):
    """Return the search-phrase suffix to use once ``n`` images are downloaded.

    The suffix selects the viewing angle requested from the image search:
    below 30 images the top-left view, below 40 the side view, below 50 the
    back view, and the camera view from 50 images on.
    """
    topic_by_upper_bound = (
        (30, " car Top-left photo"),
        (40, " car Side photo"),
        (50, " car Back photo"),
    )
    for upper_bound, topic in topic_by_upper_bound:
        if n < upper_bound:
            return topic
    return " car Camera photo"

In [12]:
def download_missing_data(labels, continue_with=1, url=None):
    """Download images for every label starting at position ``continue_with``.

    For each label, result pages are requested until ``AMOUNT_OF_IMAGES``
    images are saved or a page adds no new image.

    Args:
        labels: Sequence of search phrases (one per row), sliceable by position.
        continue_with: Position of the first label to process.
        url: Search URL template passed on to ``download_images_for_a_query``.

    Returns:
        ``row - 1`` (the row before the one that failed) when a download
        reports a failure, so the caller can resume at the failed row.
        ``None`` when every remaining label was processed.
    """
    for row, label in enumerate(labels[continue_with:], start=continue_with):
        count_of_downloaded = 0

        while count_of_downloaded < AMOUNT_OF_IMAGES:
            topic = get_topic(count_of_downloaded)
            downloaded = download_images_for_a_query(
                url=url, query=label, count=count_of_downloaded, row=row, topic=topic
            )
            if downloaded is None or downloaded <= 0:
                print(f"STOPPED DUE TO REACHED LIMITS AT LABEL: {row}")
                return row - 1
            if downloaded <= count_of_downloaded:
                # The page added nothing new. The next request would use the
                # same start index and topic, so repeating it would loop
                # forever; move on to the next label instead.
                print(f"NO MORE IMAGES FOUND FOR LABEL: {row}")
                break
            count_of_downloaded = downloaded
        print(f"{count_of_downloaded} of {label} car are downloaded. Label number is {row}")
        time.sleep(3)  # short pause between labels

In [None]:
def run_for_all_APIs(api_keys: list, IDs: list, retry_delay: float = 60.0):
    """Download images for all labels, rotating through the given credentials.

    Progress is checkpointed in ``number_of_downloaded.pickle`` as the row
    number of the last finished label. Each key is used until
    ``download_missing_data`` reports a failure, then the next key takes over
    from the checkpoint. The loop ends once every label is processed.

    Args:
        api_keys: API keys; a single key may also be given as a plain string.
        IDs: Custom Search Engine IDs, paired with ``api_keys`` by position.
        retry_delay: Seconds to wait after every key failed in a row without
            making progress, before trying the keys again.
    """
    checkpoint_path = "number_of_downloaded.pickle"

    # A single key / ID passed as a string would otherwise be zipped
    # character by character.
    if isinstance(api_keys, str):
        api_keys = [api_keys]
    if isinstance(IDs, str):
        IDs = [IDs]
    credentials = [
        (api, cx_id) for api, cx_id in zip(api_keys or [], IDs or []) if api and cx_id
    ]
    if not credentials:
        print("No API key / CX ID pairs were provided, nothing to download.")
        return

    models = labels['models']
    last_row = len(models) - 1
    failures_in_a_row = 0

    for api, cx_id in cycle(credentials):  # Repeats until all labels are done
        url = (
            f"https://www.googleapis.com/customsearch/v1?key={api}&cx={cx_id}&q="
            + "{q}&start={start}&num=10&searchType=image"
        )

        try:
            # NOTE: pickle.load can execute arbitrary code; only load a
            # checkpoint file that this notebook wrote itself.
            with open(checkpoint_path, "rb") as file:
                row_stopped_at = pickle.load(file)
        except FileNotFoundError:
            # Nothing is finished yet. -1 makes the download start at row 0;
            # starting from 0 would silently skip the first label.
            row_stopped_at = -1

        # A stored None means an earlier run already went through every label.
        if row_stopped_at is None or row_stopped_at >= last_row:
            print("All labels are already downloaded.")
            break

        result = download_missing_data(models, continue_with=row_stopped_at + 1, url=url)
        finished = result is None  # None: every remaining label was processed
        new_row = last_row if finished else result

        # Save the result
        with open(checkpoint_path, "wb") as file:
            pickle.dump(new_row, file)

        if finished:
            break

        if new_row > row_stopped_at:
            failures_in_a_row = 0
        else:
            failures_in_a_row += 1
            if failures_in_a_row >= len(credentials):
                # Every key failed without progress: wait instead of
                # hammering the API in a tight loop.
                print(f"ALL API KEYS FAILED, WAITING {retry_delay} SECONDS")
                time.sleep(retry_delay)
                failures_in_a_row = 0



In [None]:
# Start the download loop with the credentials defined above; progress is
# checkpointed to number_of_downloaded.pickle.
run_for_all_APIs(GOOGLE_API_KEY, IDs=CX_ID)

You can stop the script at any time. If it didn't save the last result (the row number of the last fully downloaded label), you may update it manually. Use the cells below:

In [None]:
# Read the saved checkpoint; fall back to 0 when no checkpoint file exists yet.
# run_for_all_APIs resumes at the row after the stored value.
try:
    with open("number_of_downloaded.pickle", "rb") as file:
        number_of_downloaded = pickle.load(file)
except FileNotFoundError:
    number_of_downloaded = 0

In [None]:
# Uncomment and set the row number to resume after
# (the download restarts at the following row):
# number_of_downloaded = 42

In [None]:
# Write the (possibly edited) value back so the next run picks it up.
with open("number_of_downloaded.pickle", "wb") as file:
    pickle.dump(number_of_downloaded, file)

In [None]:
# Display the checkpoint value currently held in memory.
number_of_downloaded