In [30]:
# Notebook setup magics.
# autoreload is useful if you are modifying source code from a module that you are using in the notebook:
# with `%autoreload 2`, edits to imported modules are picked up without restarting the kernel.
# Re-running this cell on a live kernel prints an "already loaded" notice, which is harmless.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from mask_detection import settings

Using settings for ENVIRONMENT=dev


In [4]:
from mask_detection.infrastructure import FileManager

# File helper rooted at the data directory configured in the project settings.
files = FileManager(settings.DATA_DIR)
# NOTE(review): exploratory call that prints the class documentation below;
# consider removing it (and its long output) from the final notebook.
help(files)

Help on FileManager in module mask_detection.infrastructure.files object:

class FileManager(builtins.object)
 |  Utility class to handle data files contained in a root folder
 |  
 |  Parameters
 |  ----------
 |  root: str
 |      absolute path to the root data folder
 |  
 |  
 |  >>> files = FileManager('/path/to/data/')
 |  >>> df = files.load('raw/iris.csv')
 |  # Loads dataframe from /path/to/data/raw/iris.csv
 |  >>> files.save(df, 'clean/iris.pickle')
 |  # saves to /path/to/data/clean/iris.pickle
 |  # end ensures 'clean' folder exists
 |  
 |  Methods defined here:
 |  
 |  __init__(self, root)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ensure_folder_exists(self, abspath)
 |      creates subfolders if necessary
 |  
 |  get_filepath(self, relative_path, create_folders=False)
 |      Returns absolute path from relative to ``data/`` folder
 |  
 |  load(self, relative_path, *args, **kwargs)
 |      Loads .pickle and .csv from file relative 

# Library imports

In [1]:
import hashlib
import io
import os
import time

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver

# Function definitions

In [2]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Chrome", sleep_between_interactions:float=1):
    """Collect image URLs from a Google Images search driven through Selenium.

    The function loads the search page, scrolls to the bottom, clicks every
    thumbnail and records the ``src`` of the image(s) shown after each click.
    It keeps scrolling (and clicking the "load more" button when one is
    present) until enough links were collected or no new thumbnails appear.

    Parameters
    ----------
    query : str
        Search term, e.g. ``"Covid mask"``. It is URL-encoded before use.
    max_links_to_fetch : int
        Number of distinct links to collect before stopping. The result may
        hold a few more links when a single click reveals several images.
    wd : selenium webdriver instance
        An already instantiated driver; it is not closed by this function.
    sleep_between_interactions : float, optional
        Pause (in seconds) after each scroll, click and "load more" request.

    Returns
    -------
    set of str
        The collected image URLs. Always a set, possibly smaller than
        ``max_links_to_fetch`` (or empty) when the search runs out of results.
    """
    # Local import so this definition does not depend on the notebook's import cell.
    from urllib.parse import quote_plus

    # Number of consecutive scroll rounds without any new thumbnail after which
    # we consider the result list exhausted (guards against an endless loop).
    max_stalled_rounds = 3

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (the query is encoded so that '&', '#', spaces... stay part of the search term)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_rounds = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        # stop once scrolling / "load more" no longer brings new thumbnails
        if number_results <= results_start:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f"Found: {len(image_urls)} image links, no more results available.")
                break
        else:
            stalled_rounds = 0

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: a thumbnail that cannot be clicked is simply skipped
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # every visible thumbnail was processed and we still need more links
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements returns an empty list (instead of raising) when the button is absent
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls

#The function fetch_image_urls expects four input parameters:
#query : Search term, like Covid mask
#max_links_to_fetch : Number of links the scraper is supposed to collect
#wd : instantiated Webdriver
#sleep_between_interactions : pause in seconds after each scroll and click (optional, defaults to 1)



def persist_image(folder_path:str,url:str):
    """Download one image and store it as a JPEG file in ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the downloaded
    bytes, so downloading the same content twice overwrites the same file.
    Failures are reported with a printed message and never raised.

    Parameters
    ----------
    folder_path : str
        Existing folder in which the image is written.
    url : str
        Address of the image to download.

    Returns
    -------
    str or None
        Path of the saved file, or ``None`` when the download or the
        conversion failed.
    """
    try:
        # a timeout prevents one unresponsive host from blocking the whole run
        response = requests.get(url, timeout=30)
        # turn HTTP error statuses (404, 500, ...) into a download error
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # nothing to save: stop here instead of failing on an undefined variable below
        return None

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
        return file_path
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return None

def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=100):
    """Search Google Images for ``search_term`` and save the results locally.

    Images are written to ``<target_path>/<search term in lower case, spaces
    replaced by underscores>``; the folder is created when missing.

    Parameters
    ----------
    search_term : str
        Search term, also used to name the target sub-folder.
    driver_path : str
        Path to the ChromeDriver executable.
    target_path : str, optional
        Root folder for the downloaded images.
    number_images : int, optional
        Number of image links to collect.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race and is a no-op when the folder exists
    os.makedirs(target_folder, exist_ok=True)

    # the browser is only needed to collect the links; it is closed before downloading
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # tolerate a fetcher that returns nothing (None or an empty collection)
    for elem in res or ():
        persist_image(target_folder,elem)

In [52]:
# Put the path for your ChromeDriver here, or set the CHROMEDRIVER_PATH
# environment variable to override it without editing the notebook.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/louiserodriguez/Documents/Formation/Scraping/chromedriver',
)
# Starts a Chrome session driven by Selenium.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [58]:
# Navigate the Selenium-driven browser to the Google home page.
wd.get('https://google.com')

In [61]:
# Locate the input with CSS class 'gLFyf' and type a query into it.
# NOTE(review): presumably this is Google's search box; the class name is
# site-generated and may change — confirm before relying on it.
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('Masque Covid19')

In [62]:
# Close the driver
wd.quit()

In [4]:
# Scrape 100 images for a single search term.
search_term = "masque covid19"
# Path to the ChromeDriver executable; set the CHROMEDRIVER_PATH environment
# variable to override it without editing the notebook.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/louiserodriguez/Documents/Formation/Scraping/chromedriver',
)

search_and_download(
    search_term= search_term,
    driver_path= DRIVER_PATH,
    target_path='./images',
    number_images=100
)

Found: 200 search results. Extracting links from 0:200
Found: 100 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRxGvObAWCRP8_oerAz2WSjZxRB7jwSCGJ2wX_9fvxLUBsQn5gy&usqp=CAU - as ./images/masque_covid19/40585c677e.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRek6inBdCCHL4A8T-V1_MGARG04_5Pn_RPHcabRgAtSDAwes_C&usqp=CAU - as ./images/masque_covid19/5a4be2efdd.jpg
SUCCESS - saved https://static.actu.fr/uploads/2020/03/masques-coronavirus-manche-psychologue-854x641-1.jpg - as ./images/masque_covid19/9428bf2be4.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTvqHfxE0mzPaq6YVWPHLmjOYhvVwrBbLOe2RtRuOPRHkc4Xq1D&usqp=CAU - as ./images/masque_covid19/0b817a9348.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTJ5p64_drIuT6IgNDFs-oYLwB487uaiycLYvVKABmensmnPGlB&usqp=CAU - as ./images/masque_covid19/dd647c354f.jpg
SUCCESS - saved https://lh3.googleusercontent.com/proxy/zZv

In [5]:
# For multiple keywords: run the scraper once per search term,
# reusing the DRIVER_PATH defined in an earlier cell.

search_terms = ["masque covid19", "masque", "crowd coronavirus mask"]

for term in search_terms:
    search_and_download(term, DRIVER_PATH, target_path='./images', number_images=100)

ue/1354b735e0.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQmZdwTQRuBSSQhOlU5C08asvWtPtDvcs4KRBzdY_uNrZMq1ack&usqp=CAU - as ./images/masque/0a0261c843.jpg
SUCCESS - saved https://www.printoclock.com/media/cache/app_itemblock_medium/b5/0a/b802212878280ac73707ab571567.jpeg - as ./images/masque/1d88967ae3.jpg
SUCCESS - saved https://www.bastideleconfortmedical.com/project/resources/img/original/masquee2.jpg - as ./images/masque/f78c6ce3d9.jpg
SUCCESS - saved https://www.lecyclo.com/components/com_virtuemart/shop_image/product/masque-anti-pollution-n99-double-valves-noir-vogmask_full_3.jpg - as ./images/masque/8fd25fb748.jpg
SUCCESS - saved https://img-4.linternaute.com/DHyJZ82jpWqSbP_I-1XqL_p7ow0=/1240x/smart/a541f05d6e6b411c9742235dfa311a39/ccmcms-linternaute/15413476.jpg - as ./images/masque/97458eaf15.jpg
SUCCESS - saved https://photos.lci.fr/images/613/344/innovation-le-masque-reutilisable-a-l-infini-de-michelin-20200425-2340-4d7587-0@1x.jpeg - as ./imag