In [1]:
import os
import sys
import requests
import argparse
from time import sleep
from bs4 import BeautifulSoup as bs
from threading import Thread

# Constants
# Value of the HTML ``class`` attribute searched for on a Fotolia page to
# locate the element that wraps the download link.
FOTOLIA_DOWNLOAD_BUTTON = 'comp-download-buttons row-large'
# Value of the ``class`` attribute of the ``<a>`` tags collected from an
# iStock search results page.
ISTOCK_BASE_DOWNLOAD_BUTTON = 'asset-link draggable'
# Substring that an ``<img>`` ``src`` must contain for the image to be
# downloaded from an iStock page.
ISTOCK_MEDIA_URL = "media.istockphoto.com"

def get_image_url_fotolia(base_url, min_val, directory, index=0, num_retries=5, timeout=30):
    """Fetch one Fotolia page, download the image it links to, and return the link.

    The page URL is ``base_url`` followed by ``min_val + index``. The page is
    searched for an element whose class is ``FOTOLIA_DOWNLOAD_BUTTON``; the
    first ``<a href=...>`` inside the first match is handed to
    ``download_and_save_image``.

    Args:
        base_url: URL prefix to which the numeric image id is appended.
        min_val: Base numeric id.
        directory: Folder passed through to ``download_and_save_image``.
        index: Offset added to ``min_val`` to form the id.
        num_retries: Maximum number of page requests to attempt. A request
            that raises or answers with a non-200 status consumes one attempt.
        timeout: Seconds to wait for the server before a request is treated
            as failed. Without it a stalled connection would block the
            calling thread indefinitely.

    Returns:
        The ``href`` of the download link, or ``""`` when no link was found
        or every attempt failed.
    """
    page_url = f'{base_url}{min_val + index}'
    for _ in range(num_retries):
        # Only the network call can raise RequestException, so keep the
        # guarded region that small.
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            continue
        if response.status_code != 200:
            continue

        soup = bs(response.content, 'lxml')
        button = soup.find_all(attrs={'class': FOTOLIA_DOWNLOAD_BUTTON})
        if button:
            link = button[0].find("a", href=True)
            if link:
                download_and_save_image(link['href'], directory)
                return link['href']
        # The page loaded but has nothing to download; retrying will not help.
        print("No download button found.")
        break
    else:
        # Every attempt raised or returned a non-200 status.
        print(f"Giving up on {page_url} after {num_retries} attempts.")
    return ""

def get_istock_page_and_download(link, directory, timeout=30):
    """Download the first iStock-hosted image found on the page at ``link``.

    The page is scanned for ``<img>`` tags whose ``src`` contains
    ``ISTOCK_MEDIA_URL``; the first such image is passed to
    ``download_and_save_image`` with ``src='istock'``.

    Args:
        link: URL of the iStock page to fetch.
        directory: Folder passed through to ``download_and_save_image``.
        timeout: Seconds to wait for the server before the request is
            treated as failed, so a stalled connection cannot hang the run.

    Returns:
        None. Failures are reported with ``print``.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to process request: {e}")
        return

    if response.status_code != 200:
        print(f"Cannot connect to: {link}")
        return

    soup = bs(response.content, 'lxml')
    img_link = next(
        (img['src'] for img in soup.find_all('img')
         if 'src' in img.attrs and ISTOCK_MEDIA_URL in img.attrs['src']),
        None,
    )
    if img_link is None:
        print("Cannot find image.")
        return

    # A relative or protocol-relative ``src`` cannot be requested as-is;
    # resolving against the page URL leaves absolute URLs unchanged.
    download_and_save_image(urljoin(link, img_link), directory, src='istock')

def _filename_for_download(content_disposition, link):
    """Choose a local file name from a Content-Disposition value or the URL path.

    Returns an empty string when neither source yields a name.
    """
    from urllib.parse import unquote, urlsplit

    name = ''
    _, marker, tail = content_disposition.partition('filename=')
    if marker:
        # Drop any parameters that follow the file name, then the quotes.
        name = tail.split(';', 1)[0].strip().strip('"')
    if not name:
        # No usable header: fall back to the last segment of the URL path.
        name = unquote(urlsplit(link).path)
    # Keep only the final component so a server-supplied name such as
    # "../../x" cannot escape the target directory.
    return os.path.basename(name.replace('\\', '/'))


def download_and_save_image(link, directory, src='fotolia', timeout=30):
    """Download ``link`` and write the response body to a file in ``directory``.

    The file name comes from the ``filename=`` part of the response's
    ``Content-Disposition`` header. When that header is absent or has no
    file name, the last segment of the URL path is used instead.

    Args:
        link: URL to download.
        directory: Existing folder in which the file is created.
        src: Label of the originating service. It is accepted for
            compatibility with callers but is not used.
        timeout: Seconds to wait for the server before the request is
            treated as failed.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    print(f"Attempting to download: {link}")
    try:
        response = requests.get(link, timeout=timeout)
        if response.status_code != 200:
            print(f"Couldn't download from link: {link}")
            return

        filename = _filename_for_download(
            response.headers.get('Content-Disposition', ''), link
        )
        if not filename:
            print(f"Couldn't determine a filename for: {link}")
            return

        filepath = os.path.join(directory, filename)
        print(f"Saving to filename: {filepath}")
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError, ValueError) as e:
        # Network failures, file-system failures, and malformed URLs are
        # reported and skipped so one bad image does not abort the batch.
        print(f"Error downloading image: {e}")

def scrape_from_fotolia(directory, min_val=137840645, n_images=100, max_workers=16):
    """Download ``n_images`` consecutive Fotolia images into ``directory``.

    One ``get_image_url_fotolia`` task is run for each offset in
    ``range(n_images)``, starting from id ``min_val``. The function returns
    once every task has finished.

    Args:
        directory: Output folder; created if it does not exist.
        min_val: Numeric id of the first image.
        n_images: Number of consecutive ids to try.
        max_workers: Upper bound on simultaneous downloads, so a large
            ``n_images`` does not open that many connections at once.

    Returns:
        None.
    """
    from concurrent.futures import ThreadPoolExecutor

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    if n_images <= 0:
        return

    base_url = "https://www.fotolia.com/Content/Comp/"
    worker_count = max(1, min(max_workers, n_images))
    # Leaving the ``with`` block waits for all submitted tasks to complete.
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        futures = [
            pool.submit(get_image_url_fotolia, base_url, min_val, directory, i)
            for i in range(n_images)
        ]

    # Surface unexpected worker errors instead of dropping them silently.
    for future in futures:
        error = future.exception()
        if error is not None:
            print(f"Download task failed: {error}")

def scrape_from_istock(directory, topic="abstract", n_images=100, timeout=30):
    """Download up to ``n_images`` images from an iStock search for ``topic``.

    The search page URL is built by replacing spaces in ``topic`` with
    hyphens. Every ``<a>`` with class ``ISTOCK_BASE_DOWNLOAD_BUTTON`` on that
    page is followed, in order, via ``get_istock_page_and_download``, pausing
    one second after each.

    Args:
        directory: Output folder; created if it does not exist.
        topic: Search phrase.
        n_images: Maximum number of result links to follow.
        timeout: Seconds to wait for the search page before giving up.

    Returns:
        None. Failures are reported with ``print``.
    """
    from urllib.parse import urljoin

    # Match scrape_from_fotolia: make sure the target folder exists before
    # any download tries to write into it. An empty string means the
    # current directory, which needs no creation.
    if directory:
        os.makedirs(directory, exist_ok=True)

    base_url = "https://www.istockphoto.com/photos"
    search_url = f"{base_url}/{topic.replace(' ', '-')}"
    try:
        response = requests.get(search_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve search page: {e}")
        return

    if response.status_code != 200:
        print("Failed to retrieve search page.")
        return

    soup = bs(response.content, 'lxml')
    # Result hrefs may be site-relative; resolve them against the search
    # page. Absolute hrefs pass through urljoin unchanged.
    links = [
        urljoin(search_url, anchor['href'])
        for anchor in soup.find_all('a', class_=ISTOCK_BASE_DOWNLOAD_BUTTON, href=True)
    ]
    for link in links[:n_images]:
        get_istock_page_and_download(link, directory)
        sleep(1)

def main():
    """Parse command-line arguments and run the matching scraper.

    ``--url`` selects the service by substring: a value containing
    "fotolia" runs ``scrape_from_fotolia`` and one containing "istock" runs
    ``scrape_from_istock``. Any other value is rejected with a usage error
    rather than reported as a successful run.

    ``--topic`` and ``--n-images`` are optional. When omitted, iStock is
    searched for "mountains" with a limit of 150, and Fotolia uses the
    default count of ``scrape_from_fotolia``.
    """
    parser = argparse.ArgumentParser(description='Scrape images from stock photo services')
    parser.add_argument('-f', '--folder', required=True, help='Folder to save images')
    parser.add_argument('-u', '--url', required=True, help='URL keyword to determine the service')
    parser.add_argument('-t', '--topic', default='mountains',
                        help='Search topic (iStock only)')
    parser.add_argument('-n', '--n-images', type=int, default=None,
                        help='Number of images to fetch')
    args = parser.parse_args()

    service = args.url.lower()
    if "fotolia" in service:
        if args.n_images is None:
            scrape_from_fotolia(args.folder)
        else:
            scrape_from_fotolia(args.folder, n_images=args.n_images)
    elif "istock" in service:
        count = 150 if args.n_images is None else args.n_images
        scrape_from_istock(args.folder, n_images=count, topic=args.topic)
    else:
        # parser.error prints the usage text and exits with status 2.
        parser.error(f"unsupported service in --url: {args.url!r} "
                     "(expected a value containing 'fotolia' or 'istock')")
    print("Done.")

if __name__ == "__main__":
    # Script entry point: run main() and convert any ordinary exception into
    # a printed message plus exit status 1.
    try:
        main()
    except Exception as failure:
        print(f"An error occurred: {failure}")
        raise SystemExit(1)


usage: ipykernel_launcher.py [-h] -f FOLDER -u URL
ipykernel_launcher.py: error: the following arguments are required: -u/--url


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
