In [2]:
# !pip install python-dotenv

In [3]:
import os
import time
import requests
from dotenv import load_dotenv
import pandas as pd

In [4]:
# Load key/value pairs from a local .env file into the process environment so
# that the os.getenv() lookups for GOOGLE_API_KEY and CX_ID can find them.
load_dotenv()

True

In [5]:
# Root folder for downloaded pictures; a <make>/<model>/ sub-folder is created per query.
data_images_path = "data/images/"
# CSV file whose `models` column holds the labels to download.
labels_to_download = "data/labels/models_missed_in_datasets.csv"

In [6]:
# Credentials for the search request, read from the environment.
# Both values are None when the corresponding variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
CX_ID = os.environ.get("CX_ID")

In [7]:
def download_images_for_a_query(url, query, count=0):
    """Fetch one page of image search results for `query` and save the images.

    The images are written to ``<data_images_path>/<make>/<model>/`` where
    ``make`` is the first word of `query` and ``model`` is the remaining words
    joined with underscores. Files are named ``<query_with_underscores>_<NNNNN>.jpg``
    and numbered consecutively starting at ``count + 1``.

    Parameters
    ----------
    url : str
        Search URL template containing ``{q}`` and ``{start}`` placeholders.
    query : str
        Search text, e.g. ``"AUDI 100"``. Must contain at least one word.
    count : int, default 0
        Number of images already saved for this query. It is used both as the
        search offset (``start = count + 1``) and as the file-name counter.

    Returns
    -------
    int
        The updated number of saved images. ``0`` is returned when the search
        request itself cannot make progress (non-200 status such as 429, or a
        response without results) so that callers can stop instead of retrying
        forever. If the search request raises (timeout, connection error) the
        unchanged `count` is returned, which lets the caller retry.

    Raises
    ------
    IndexError
        If `query` is empty or contains only whitespace.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    pieces = query.split()
    make = pieces[0]
    model = '_'.join(pieces[1:])
    file_stem = query.replace(' ', '_')

    # Percent-encode the query so characters such as '&', '#' or '+' cannot
    # change the meaning of the request URL.
    search_url = url.format(q=quote(query, safe=''), start=count + 1)

    # NOTE: there is deliberately no `return` inside a `finally` clause here.
    # Such a return overrides every other return value and also swallows
    # KeyboardInterrupt, which made the early exits below ineffective.
    try:
        response = requests.get(search_url, timeout=30)
        if response.status_code != 200:
            # 429 means the API quota is exhausted; other codes are equally
            # unusable. Report "no progress" so the caller can stop.
            return 0

        items = response.json().get('items')
        if not items:
            return 0  # no (more) results for this query

        folder_path = data_images_path + make + '/' + model + "/"
        os.makedirs(folder_path, exist_ok=True)

        # Walk the page from the last result to the first, one image at a time.
        for item in reversed(items):
            # A failure of a single image host must not discard the rest of
            # the page: every page costs one quota-limited search request.
            try:
                image_response = requests.get(item['link'], timeout=10)
            except requests.exceptions.Timeout:
                print("Timeout reached")
                continue
            except requests.exceptions.RequestException as e:
                print("Exception: ", e)
                continue

            if image_response.status_code != 200:
                # Typically 403 (hot-link protection) or a 429 from the image
                # host; neither says anything about the search API quota.
                continue

            next_index = count + 1
            with open(f"{folder_path}{file_stem}_{next_index:05d}.jpg", "wb") as file:
                file.write(image_response.content)
            count = next_index  # only count images that were actually written
    except requests.exceptions.Timeout:
        print("Timeout reached")
    except Exception as e:
        # Best effort: report the problem and keep whatever was saved so far.
        print("Exception: ", e)

    return count  # Amount of downloaded images

In [8]:
# Read the list of labels to download from the CSV configured above.
labels = pd.read_csv(labels_to_download)
# Preview the first two rows. The displayed frame has an 'Unnamed: 0' column
# next to 'models' (presumably a saved index); only 'models' is used later.
labels.head(2)

Unnamed: 0,models
0,AUDI 100
1,AUDI 80


In [9]:
# Target number of images per label: the download loop keeps requesting
# result pages for a label until at least this many images have been saved.
AMOUNT_OF_IMAGES: int = 50

In [10]:
# Request template for the image search. The credentials are filled in now;
# the literal {q} and {start} placeholders are left for a later str.format()
# call that supplies the query text and the result offset.
URL = (
    "https://www.googleapis.com/customsearch/v1"
    f"?key={GOOGLE_API_KEY}&cx={CX_ID}"
    "&q={q}&start={start}&num=10&searchType=image"
)

In [11]:
def download_missing_data(labels, continue_with=1, max_stalled_attempts=5):
    """Download at least AMOUNT_OF_IMAGES images for every label.

    Parameters
    ----------
    labels : sliceable sequence of str
        Search queries, one per car model (e.g. a pandas Series or a list).
    continue_with : int, default 1
        Position of the first label to process; earlier labels are skipped.
        Note that the default of 1 skips the very first label.
    max_stalled_attempts : int, default 5
        How many consecutive result pages may add no new image before the run
        is aborted. Without this bound a page that yields nothing (quota
        exhausted, every image host refusing, repeated timeouts) would be
        requested forever.

    Returns
    -------
    str or None
        A "STOPPED ..." message naming the label at which the run was aborted,
        or None when all labels were processed.
    """
    for row, label in enumerate(labels[continue_with:], start=continue_with):
        count_of_downloaded = 0
        stalled_attempts = 0
        while count_of_downloaded < AMOUNT_OF_IMAGES:
            downloaded = download_images_for_a_query(url=URL, query=label, count=count_of_downloaded)

            # The helper reports "cannot continue" with 0 (a missing value is
            # treated the same way instead of raising on the comparison).
            if downloaded is None or downloaded <= 0:
                return f"STOPPED DUE TO REACHED LIMITS AT LABEL: {row}"

            if downloaded > count_of_downloaded:
                count_of_downloaded = downloaded
                stalled_attempts = 0
            else:
                # Same total as before: nothing new was saved by this attempt.
                stalled_attempts += 1
                if stalled_attempts >= max_stalled_attempts:
                    return f"STOPPED DUE TO REACHED LIMITS AT LABEL: {row}"

        print(f"{count_of_downloaded} of {label} car are downloaded. Label number is {row}")
        time.sleep(10)  # pause between labels

In [12]:
# Resume the download at position 25 of the `models` column; labels before
# that position are skipped.
download_missing_data(labels['models'], continue_with=25)


58 of CHERY TIGGO T11 car are downloaded. Label number is 20
Timeout reached
Timeout reached
Timeout reached
Timeout reached
Timeout reached
Timeout reached
Timeout reached
Timeout reached
Timeout reached
52 of CHEVROLET AVEO car are downloaded. Label number is 21
Timeout reached
Timeout reached
50 of CHEVROLET CAPRICE car are downloaded. Label number is 22
Timeout reached
Timeout reached
Timeout reached
54 of CHEVROLET LACETTI car are downloaded. Label number is 23
54 of CHEVROLET LANOS car are downloaded. Label number is 24
Exception:  HTTPSConnectionPool(host='www.freshnessmag.com', port=443): Max retries exceeded with url: /.image/t_share/MTM1OTkxNjAzNzE3NDAyNTk0/chevy-niva-concept-01.jpg (Caused by SSLError(CertificateError("hostname 'www.freshnessmag.com' doesn't match either of 'www.saydaily.com', 'airows.com', 'artsandcraftshomes.com', 'axleaddict.com', 'beautyeditor.ca', 'bestpethomeremedies.com', 'cupcakesandcashmere.com', 'daytimeconfidential.com', 'delishably.com', 'dengard

KeyboardInterrupt: 

СКАЧАНО: 9+4 = 21