In [1]:
import base64
import hashlib
import os
import re
from urllib.parse import quote_plus, unquote_to_bytes
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def download_image(url, directory, filename, timeout=30):
    """Save the image referenced by ``url`` into ``directory``.

    Two kinds of ``url`` are handled:

    * ``data:image/...`` URIs: the payload is decoded (Base64 when the header
      ends with ``;base64``, percent-decoding otherwise) and written to
      ``<filename>.<subtype>``. The subtype is read from the URI header and a
      structured suffix is dropped (``svg+xml`` becomes ``svg``). The
      extension is appended to ``filename`` exactly as given.
    * Any other value is fetched with ``requests.get`` and the response body
      is written to ``filename`` unchanged.

    Args:
        url: A ``data:image`` URI or a URL that ``requests`` can fetch.
        directory: Directory the file is written into; it is not created here.
        filename: Target file name (base name for data URIs, see above).
        timeout: Seconds to wait on the HTTP server before giving up.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If a data URI has no comma separator, no image subtype,
            or an undecodable Base64 payload.
        requests.RequestException: If the HTTP request fails, times out, or
            answers with an error status.
    """
    if url.startswith("data:image"):
        header, separator, encoded_data = url.partition(",")
        subtype = re.match(r"data:image/([\w.+-]+)", header)
        if not separator or subtype is None:
            raise ValueError(f"Malformed image data URI: {url[:40]!r}")

        # "svg+xml" -> "svg": keep only the part usable as a file extension.
        image_extension = subtype.group(1).split("+", 1)[0]

        if header.lower().endswith(";base64"):
            image_data = base64.b64decode(encoded_data)
        else:
            # Without the ";base64" flag the payload is percent-encoded.
            image_data = unquote_to_bytes(encoded_data)

        file_path = os.path.join(directory, f"{filename}.{image_extension}")
    else:
        # A timeout keeps one stalled server from hanging the whole run.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an error page under an image name.
        response.raise_for_status()
        image_data = response.content
        file_path = os.path.join(directory, filename)

    with open(file_path, "wb") as file:
        file.write(image_data)

    return file_path

In [5]:
def scrape_images(
    query,
    num_images,
    download_location,
    chromedriver_path=r"./assets\chromedriver_win32\chromedriver.exe",
    max_scrolls=10,
):
    """Download thumbnails from a Google Images search with headless Chrome.

    The results page is opened, scrolled to the bottom up to ``max_scrolls``
    times, and the ``src`` of every element with class ``rg_i`` is passed to
    ``download_image``. Files are named ``<query with spaces as '+'>_<n>.jpg``
    where ``n`` counts successful downloads from 0. Sources already seen in
    this run are skipped, as are thumbnails whose download fails.

    Args:
        query: Free-text search phrase.
        num_images: Maximum number of images to save; values <= 0 save nothing
            and do not start a browser.
        download_location: Directory for the files; created if missing.
        chromedriver_path: Path to the chromedriver executable.
        max_scrolls: Upper bound on scroll-to-bottom passes.

    Returns:
        The number of images actually saved.
    """
    if num_images <= 0:
        return 0

    # File names keep the plain "+"-joined phrase; only the URL is encoded,
    # so characters such as "&" or "#" cannot break the query string.
    file_prefix = query.replace(" ", "+")
    url = f"https://www.google.com/search?q={quote_plus(query)}&tbm=isch"

    os.makedirs(download_location, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    downloaded_images = 0
    try:
        driver.get(url)

        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "rg_i"))
                )
            except TimeoutException:
                # No thumbnails appeared within the wait window: stop scrolling.
                break

        unique_images = set()
        for image_element in driver.find_elements(By.CLASS_NAME, "rg_i"):
            image_url = image_element.get_attribute("src")
            if not image_url:
                continue

            if image_url.startswith("/"):
                # urljoin resolves both "/path" and protocol-relative "//host/path".
                image_url = urljoin("https://www.google.com", image_url)

            image_hash = hashlib.md5(image_url.encode()).hexdigest()
            if image_hash in unique_images:
                continue
            unique_images.add(image_hash)

            filename = f"{file_prefix}_{downloaded_images}.jpg"
            try:
                download_image(image_url, download_location, filename)
            except (requests.RequestException, OSError, ValueError, IndexError) as exc:
                # Network, filesystem, or malformed-source failures affect one
                # thumbnail only; report it and keep going.
                print(f"Skipped image {filename}: {exc}")
                continue
            print(f"Downloaded image: {filename}")

            downloaded_images += 1
            if downloaded_images >= num_images:
                break

        print("Image scraping complete!")
    finally:
        # Always release the browser process, even if scraping failed midway.
        driver.quit()

    return downloaded_images


In [8]:
# Search phrase and output folder for this run; fetch at most 40 thumbnails.
query = "kucing munchkin lucu"
download_path = "./file"
scrape_images(query=query, num_images=40, download_location=download_path)

Downloaded image: kucing+munchkin+lucu_0.jpg
Downloaded image: kucing+munchkin+lucu_1.jpg
Downloaded image: kucing+munchkin+lucu_2.jpg
Downloaded image: kucing+munchkin+lucu_3.jpg
Downloaded image: kucing+munchkin+lucu_4.jpg
Downloaded image: kucing+munchkin+lucu_5.jpg
Downloaded image: kucing+munchkin+lucu_6.jpg
Downloaded image: kucing+munchkin+lucu_7.jpg
Downloaded image: kucing+munchkin+lucu_8.jpg
Downloaded image: kucing+munchkin+lucu_9.jpg
Downloaded image: kucing+munchkin+lucu_10.jpg
Downloaded image: kucing+munchkin+lucu_11.jpg
Downloaded image: kucing+munchkin+lucu_12.jpg
Downloaded image: kucing+munchkin+lucu_13.jpg
Downloaded image: kucing+munchkin+lucu_14.jpg
Downloaded image: kucing+munchkin+lucu_15.jpg
Downloaded image: kucing+munchkin+lucu_16.jpg
Downloaded image: kucing+munchkin+lucu_17.jpg
Downloaded image: kucing+munchkin+lucu_18.jpg
Downloaded image: kucing+munchkin+lucu_19.jpg
Downloaded image: kucing+munchkin+lucu_20.jpg
Downloaded image: kucing+munchkin+lucu_21.jp