In [6]:
import os
import asyncio
import aiohttp
import hashlib
from bs4 import BeautifulSoup
from urllib.parse import quote
import logging
from typing import List, Set
import time
import nest_asyncio
nest_asyncio.apply()

In [7]:
class AsyncImageScraper:
    """Download images found on Bing and Shutterstock search-result pages.

    Each downloaded image is written to ``output_dir`` as
    ``<source>_<md5 of the image URL>.jpg`` (the ``.jpg`` suffix is fixed and
    is not derived from the response). At most ``max_images`` files are
    written per instance, and each image URL is attempted at most once.

    Typical use is ``await AsyncImageScraper(out_dir).scrape_all(terms)``,
    which opens and closes the HTTP session itself.
    """

    # Total time budget, in seconds, for fetching one image.
    DOWNLOAD_TIMEOUT_SECONDS = 30
    # Pause, in seconds, after each attempted image download.
    REQUEST_DELAY_SECONDS = 1

    def __init__(self, output_dir: str, max_images: int = 1000):
        """Create the scraper and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory the image files are written to; created if
                missing.
            max_images: Upper bound on the number of files this instance
                writes.
        """
        self.output_dir = output_dir
        self.max_images = max_images
        self.downloaded_count = 0
        self.seen_urls: Set[str] = set()
        self.session = None

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

    async def init_session(self):
        """Open the shared aiohttp session if one is not already open."""
        if not self.session:
            self.session = aiohttp.ClientSession()

    async def close_session(self):
        """Close the shared session (if any) and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    async def download_image(self, url: str, source: str) -> bool:
        """Fetch one image URL and store it in ``output_dir``.

        Args:
            url: Image URL to fetch.
            source: Short tag used as the filename prefix (e.g. ``'bing'``).

        Returns:
            True if a file was written; False if the URL was already
            attempted, the ``max_images`` cap is reached, the response was not
            a non-empty HTTP 200, or any error occurred (errors are logged,
            never raised).
        """
        if self.downloaded_count >= self.max_images:
            return False
        if url in self.seen_urls:
            return False

        # Recorded before the request so a URL is never attempted twice,
        # even when the first attempt fails.
        self.seen_urls.add(url)
        try:
            timeout = aiohttp.ClientTimeout(total=self.DOWNLOAD_TIMEOUT_SECONDS)
            async with self.session.get(url, timeout=timeout) as response:
                if response.status != 200:
                    self.logger.warning(f"Skipping {url}: HTTP {response.status}")
                    return False
                content = await response.read()

            if not content:
                self.logger.warning(f"Skipping {url}: empty response body")
                return False

            # Re-check the cap: scrapers run concurrently, so another download
            # may have filled the quota while this one was awaiting the
            # network. No await occurs between this check and the increment.
            if self.downloaded_count >= self.max_images:
                return False

            file_hash = hashlib.md5(url.encode()).hexdigest()
            filename = f"{source}_{file_hash}.jpg"
            filepath = os.path.join(self.output_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(content)

            self.downloaded_count += 1
            self.logger.info(f"Downloaded {filename}")
            return True

        except Exception as e:
            self.logger.error(f"Error downloading {url}: {e}")
        return False

    async def _scrape_page(self, page_url: str, source: str, label: str, img_attrs: dict):
        """Fetch one result page and download every matching ``<img src>``.

        Args:
            page_url: Search-result page to fetch.
            source: Filename prefix passed on to ``download_image``.
            label: Site name used in log messages.
            img_attrs: Attribute filter handed to ``find_all('img', ...)``.
        """
        try:
            async with self.session.get(page_url) as response:
                if response.status != 200:
                    self.logger.warning(
                        f"{label} search returned HTTP {response.status} for {page_url}"
                    )
                    return
                html = await response.text()

            soup = BeautifulSoup(html, 'html.parser')
            for img in soup.find_all('img', img_attrs):
                if self.downloaded_count >= self.max_images:
                    break

                image_url = img.get('src')
                if image_url:
                    await self.download_image(image_url, source)
                    await asyncio.sleep(self.REQUEST_DELAY_SECONDS)

        except Exception as e:
            self.logger.error(f"Error scraping {label}: {e}")

    async def scrape_bing(self, query: str):
        """Download images from the Bing image-search page for ``query``."""
        encoded_query = quote(query)
        url = f"https://www.bing.com/images/search?q={encoded_query}"
        await self._scrape_page(url, 'bing', 'Bing', {'class': 'mimg'})

    async def scrape_shutterstock(self, query: str):
        """Download images from the Shutterstock search page for ``query``."""
        encoded_query = quote(query)
        url = f"https://www.shutterstock.com/search/{encoded_query}"
        await self._scrape_page(
            url, 'shutterstock', 'Shutterstock', {'data-testid': 'asset-image'}
        )

    async def scrape_all(self, search_terms: List[str]):
        """Scrape both sites for each term until ``max_images`` is reached.

        For every term the Bing and Shutterstock scrapers run concurrently.
        The HTTP session is opened on entry and always closed on exit.

        Args:
            search_terms: Queries to search for, processed in order.
        """
        await self.init_session()
        try:
            for term in search_terms:
                if self.downloaded_count >= self.max_images:
                    break

                tasks = [
                    self.scrape_bing(term),
                    self.scrape_shutterstock(term)
                ]
                await asyncio.gather(*tasks)

        finally:
            await self.close_session()

# Usage
async def main():
    """Scrape images for the two pink-disease queries into 'pink_disease_images'."""
    queries = ['pink disease on coffee leaf', 'bệnh nấm hồng cây cà phê']
    await AsyncImageScraper('pink_disease_images').scrape_all(queries)

# Script-style entry point. NOTE(review): inside a notebook an event loop is
# presumably already running, so asyncio.run() here appears to depend on the
# nest_asyncio.apply() call made in the import cell — confirm before moving
# or removing that call.
if __name__ == "__main__":
    asyncio.run(main())

INFO:__main__:Downloaded bing_ad1aa30f4820d15457e552a6c99ac6de.jpg
INFO:__main__:Downloaded bing_0fcdae9f483bf7c75d0ced7290b0832d.jpg
INFO:__main__:Downloaded bing_d945f89d70f648ee959e07c3c7bba556.jpg
INFO:__main__:Downloaded bing_6317a9e1b8d1ab139f7f8731d1f1c287.jpg
INFO:__main__:Downloaded bing_22e54241043976b87c154ff54eeaa6ac.jpg
INFO:__main__:Downloaded bing_791338a827d60b368f625a785e389c6d.jpg
INFO:__main__:Downloaded bing_1a1dad14b11b4089a06fa8efd5944939.jpg
INFO:__main__:Downloaded bing_d3456f947594cb37cb346c186f0192ed.jpg
INFO:__main__:Downloaded bing_8655c0cfd2275bd1262c2bb57513c484.jpg
INFO:__main__:Downloaded bing_198b0562f52c14e57487b34b399fa54d.jpg
INFO:__main__:Downloaded bing_d41c883e4794335c122012323eeecf39.jpg
INFO:__main__:Downloaded bing_cf85cda72428022c32cb96dae4987916.jpg
INFO:__main__:Downloaded bing_b7c3a3299fda99428d46af1e013811a0.jpg
INFO:__main__:Downloaded bing_6fddafe4504ee2373d9c1e33c30923e1.jpg
INFO:__main__:Downloaded bing_95745f62853b32377e9b34e14e761589