# Imports

In [236]:
# paths
import os
import sys
from pathlib import Path

# display
from tqdm.notebook import tqdm

# fastai
from fastdownload import download_url
from fastai.vision.utils import verify_images
from fastai.data.transforms import get_image_files

# scraping
import time
import requests
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

# Imagescraper

A class for scraping images from sites like **[ArtStation](https://www.artstation.com)**, **[Pinterest](https://www.pinterest.com/)**, **[pixiv](https://www.pixiv.net/en/)**, **[reddit-memes](https://www.reddit.com/r/memes/)**, **[Memebase](https://memebase.cheezburger.com/)**, etc. Only ArtStation scraping (`artstation_scrape`) is implemented so far.

Create the root data directory. Each image set (here, the anime girl images) is saved in its own subdirectory of it.

In [55]:
# Root folder that every scraped dataset is written under.
download_dir = Path("./data")
# Single idempotent call: no separate exists() check, and safe to re-run the cell.
download_dir.mkdir(parents=True, exist_ok=True)

Let's start with ArtStation.

ImageScraper class.

In [273]:
class ImageScraper():
    """
    Collects image URLs with a Selenium-driven Chrome browser and downloads them.

    Attributes
    ----------
    driver_path
        Value returned by ``ChromeDriverManager().install()``; passed to the
        Chrome ``Service`` as its executable path.
    img_urls : set
        Image URLs gathered so far (a set, so duplicates collapse).
    """

    def __init__(self):
        # Save WebDriver to project root in dir `.wdm`
        os.environ['WDM_LOCAL'] = '1'
        self.driver_path = ChromeDriverManager().install()

        self.img_urls = set() # set so no duplicates

    def artstation_scrape(self, n_images, url):
        """
        Takes an ArtStation search url, opens each result in turn and stores the
        image urls it finds in ``self.img_urls`` (in memory only).

        Parameters
        ----------
        n_images : int
            Maximum number of unique image urls to collect during this call.
        url : str
            ArtStation search-results url to scrape.

        Notes
        -----
        Stops early, without raising, when the results page has no further
        thumbnail to click; check ``len(self.img_urls)`` afterwards. The browser
        is always closed, even if scraping fails part-way.

        Examples
        --------
        >>> scraper = ImageScraper()
        >>> scraper.artstation_scrape(n_images=10,
        ...     url="https://www.artstation.com/search?sort_by=relevance&query=girls&category_ids=3,38&medium_ids=1")
        """
        img_xpath = (
            "/html/body/div[2]/app-root/app-layout/search-artwork/project-view/div/div/main/div/project-asset"
            "//img[@class='img img-responsive block-center img-fit']"
        )

        # start up chrome driver
        service = Service(executable_path=self.driver_path)
        driver = webdriver.Chrome(service=service)
        try:
            driver.get(url)
            time.sleep(5)

            collected = set()  # unique urls gathered during this call
            t = 1              # 1-based position of the next thumbnail to open
            while len(collected) < n_images:
                # 1. Click on next thumbnail
                thumb_xpath = (
                    "/html/body/div[2]/app-root/app-layout/search-artwork/projects-list/div/projects-list-item[{}]/a"
                    .format(t)
                )
                # find_elements returns an empty list instead of raising when
                # nothing matches, which lets us stop cleanly at the last result.
                thumbnails = driver.find_elements(By.XPATH, thumb_xpath)
                if not thumbnails:
                    break
                thumbnails[0].click()
                time.sleep(2)

                # 2. Get all img elements
                imgs = driver.find_elements(By.XPATH, img_xpath)

                # 3. Add up to n_images but not over; only real, new urls count
                for img in imgs:
                    if len(collected) >= n_images:
                        break
                    src = img.get_attribute('src')
                    if src:  # skip elements whose src is missing/empty
                        collected.add(src)
                        self.img_urls.add(src)

                # 4. Go back to click on next thumbnail
                t += 1
                driver.back()
        finally:
            # Always release the browser, even when a lookup or click fails.
            driver.quit()

    def save_urls(self, download_path):
        """
        Appends the current ``img_urls`` to ``image_urls.txt`` in `download_path`,
        one url per line.

        Parameters
        ----------
        download_path : str or Path
            Target directory; created (including parents) when missing.
        """
        download_path = Path(download_path)
        download_path.mkdir(parents=True, exist_ok=True)

        with open(download_path / "image_urls.txt", "a") as f:
            # Terminate every line so a later append starts on a fresh line
            # instead of being glued to the last url of the previous save.
            f.writelines(f"{img_url}\n" for img_url in sorted(self.img_urls))

    def save_images(self, download_path):
        """
        Downloads all images from current instance's img_urls.

        Each image is written to `download_path` as ``<stem of the url>.jpg``.

        Parameters
        ----------
        download_path : str or Path
            Target directory; created (including parents) when missing.
        """
        download_path = Path(download_path)
        download_path.mkdir(parents=True, exist_ok=True)

        for img_url in tqdm(self.img_urls):
            img_path = download_path / (Path(img_url).stem + ".jpg")
            download_url(img_url, img_path, show_progress=False)

In [274]:
# Instantiating resolves the ChromeDriver path via ChromeDriverManager().install() (see __init__).
scraper = ImageScraper()

In [262]:
# Opens Chrome on the ArtStation search results and collects image urls into scraper.img_urls.
scraper.artstation_scrape(n_images=100, 
                          url="https://www.artstation.com/search?sort_by=relevance&query=girls&category_ids=3,38&medium_ids=1")

Now let's save the urls and images.

In [275]:
# `urls` is not defined anywhere in this notebook; it only exists as a leftover
# variable from an earlier kernel session. Restore from it when it is present,
# otherwise keep the urls collected by the scrape cell above, so that
# Restart & Run All does not fail with a NameError.
if "urls" in globals():
    scraper.img_urls = set(urls)  # keep img_urls a set so duplicates collapse

In [276]:
# Number of image urls currently held by the scraper.
len(scraper.img_urls)

100

In [277]:
# Appends the urls to image_urls.txt inside ./data/anime_girls (the folder is created if missing).
scraper.save_urls(download_dir / "anime_girls")

TODO
> Add a `load_urls` method that restores `self.img_urls` from a saved `image_urls.txt` file

In [278]:
# Downloads every url in scraper.img_urls into ./data/anime_girls as <url stem>.jpg.
scraper.save_images(download_dir / "anime_girls")

  0%|          | 0/100 [00:00<?, ?it/s]

TODO
> Make a function to view the images saved

To scale this you will need to consider RAM and cloud storage.
- You will need to save urls every 100 or 1000 images
- You will need to change the save images method to save to gcs