In [1]:
import hashlib
import io
import os
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Download the Wikipedia page holding the country / continent code table.
wikipage = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent_(data_file)"
# A timeout keeps the cell from hanging forever when the server does not answer.
result = requests.get(wikipage, timeout=30)

# Stop here with a clear message instead of hitting a NameError on `soup` further down.
if result.status_code != 200:
    raise RuntimeError(f"Could not download {wikipage}: HTTP {result.status_code}")

# Parse the download into a BeautifulSoup object, which allows easy manipulation.
soup = BeautifulSoup(result.content, "html.parser")

# Find the table whose HTML class attribute is "wikitable sortable".
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    raise RuntimeError("No table with class 'wikitable sortable' found on the page")

# Loop through all rows after the first one and pull the text of every data cell.
new_table = []
for row in table.find_all('tr')[1:]:
    columns = row.find_all('td')
    # Rows without any <td> cell would only add an empty record, so skip them.
    if not columns:
        continue
    new_table.append([column.get_text() for column in columns])

df = pd.DataFrame(new_table, columns=['ContinentCode','Alpha2','Alpha3','PhoneCode','Name'])
# Remove the newline characters from the Name column text.
df['Name'] = df['Name'].str.replace('\n','')
df

Unnamed: 0,ContinentCode,Alpha2,Alpha3,PhoneCode,Name
0,AS,AF,AFG,004,"Afghanistan, Islamic Republic of"
1,EU,AL,ALB,008,"Albania, Republic of"
2,AN,AQ,ATA,010,Antarctica (the territory South of 60 deg S)
3,AF,DZ,DZA,012,"Algeria, People's Democratic Republic of"
4,OC,AS,ASM,016,American Samoa
...,...,...,...,...,...
257,AS,YE,YEM,887,Yemen
258,AF,ZM,ZMB,894,"Zambia, Republic of"
259,AS,XD,,,United Nations Neutral Zone
260,AS,XS,,,Spratly Islands


In [2]:
import selenium
from selenium import webdriver
# Location of the ChromeDriver executable.
# Set the CHROMEDRIVER_PATH environment variable to point at your own copy;
# when it is not set, the path below is used.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/alexa/Documents/GitHub/anic0077.github.io/misc/chromedriver',
)

# Quick check that the driver works: open Google and type a query into the search box.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
wd.get('https://google.com')
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('Dogs')

In [3]:
# Quit the browser session held in `wd`.
wd.quit()

In [4]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:float=0.5,
                     load_more_wait:float=30, max_empty_rounds:int=3):
    """Collect image source URLs from a Google image search.

    The search page for ``query`` is opened in ``wd``. The page is scrolled,
    every new thumbnail (``img.Q4LuWd``) is clicked, and the ``src`` of each
    ``img.n3VNCb`` element present after the click is collected.

    Args:
        query: Search term; it is URL-encoded before being put into the search URL.
        max_links_to_fetch: Collection stops once at least this many distinct
            URLs were gathered. One click can add several URLs, so the result
            may contain more than this number.
        wd: Selenium driver used to load and interact with the page.
        sleep_between_interactions: Seconds to wait after each scroll and click.
        load_more_wait: Seconds to wait before triggering the "load more"
            button (``.mye4qd``) when the current batch was not enough.
        max_empty_rounds: Number of consecutive rounds without any new
            thumbnail after which the search gives up instead of looping forever.

    Returns:
        A set of URL strings (possibly fewer than requested when the search
        runs out of results).
    """
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query; quote_plus keeps spaces, '&', '#', ... from breaking the URL
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    empty_rounds = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        # Guard against an endless loop: give up when repeated rounds bring no new thumbnails.
        if number_results <= results_start:
            empty_rounds += 1
            if empty_rounds >= max_empty_rounds:
                print(f"Found: {len(image_urls)} image links, no more results available, stopping.")
                break
        else:
            empty_rounds = 0

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it;
            # a thumbnail that cannot be clicked is skipped on purpose (best effort)
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls; only sources containing 'http' are kept
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # The for loop ended without reaching the target: ask the page for more results.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(load_more_wait)
            # The plural lookup returns an empty list instead of raising when the button is absent.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

In [5]:
# Every URL handed to persist_image is recorded here, whether or not the download succeeds.
urls = []
def persist_image(folder_path:str,url:str,timeout:float=30):
    """Download one image and store it as a JPEG file in ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus ``.jpg``. Progress and failures are reported with
    ``print``; no exception is propagated to the caller.

    Args:
        folder_path: Existing directory the image is written to.
        url: Address of the image; it is also appended to the module-level ``urls`` list.
        timeout: Seconds to wait for the server before the download is abandoned.

    Returns:
        None.
    """
    urls.append(url)
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error answers as failed downloads rather than trying to decode an error page.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing to save; returning avoids a second, misleading "Could not save" message.
        return

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so that images with other modes can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [7]:
def search_and_download(search_term:str,driver_path:str,number_images:int,target_path='./images'):
    """Search for images of ``search_term`` and save them into a folder named after the term.

    The target folder is ``target_path`` joined with the lower-cased search
    term in which every space is replaced by an underscore; it is created when
    it does not exist yet. A Chrome driver started from ``driver_path`` is used
    as a context manager around the link collection, and the images are
    downloaded one by one after that block has exited.
    """
    folder_name = search_term.lower().replace(' ', '_')
    target_folder = os.path.join(target_path, folder_name)

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome(executable_path=driver_path) as browser:
        found_urls = fetch_image_urls(search_term, number_images, wd=browser, sleep_between_interactions=1)

    for image_url in found_urls:
        persist_image(target_folder, image_url)

In [7]:
# Run the whole search-and-download pipeline once for a single term.
search_term = 'pewdiepie'
search_and_download(search_term=search_term, driver_path=DRIVER_PATH, number_images=20)

Found: 100 search results. Extracting links from 0:100
Found: 138 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 269 image links, looking for more ...
Found: 312 search results. Extracting links from 212:312
Found: 377 image links, looking for more ...
Found: 412 search results. Extracting links from 312:412
Found: 400 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTWVvRbOrL5_ahYRnh01D8iNMkdHdjRfoTN9w&usqp=CAU - as ./images\pewdiepie\72143ff4eb.jpg
SUCCESS - saved https://i.guim.co.uk/img/static/sys-images/Guardian/Pix/pictures/2013/11/8/1383904648819/d764ac9a-ef77-439d-afc0-275b0d59f37f-620x372.jpeg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctYWdlLTIwMTQucG5n&enable=upscale&s=420017eb32db30e87acd173fda8bdf69 - as ./images\pewdiepie\3bde74b9ec.jpg
SUCCESS - saved https://www.dexerto.com/wp-co

In [8]:
# Search terms to download images for, one call of search_and_download per term.
to_classify = ["cat","dog","frog","snake"]

for search_term in to_classify:
    search_and_download(search_term=search_term, driver_path=DRIVER_PATH, number_images=20)
    


Found: 100 search results. Extracting links from 0:100
Found: 20 image links, done!
SUCCESS - saved https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/article_thumbnails/slideshows/is_my_cat_normal_slideshow/1800x1200_is_my_cat_normal_slideshow.jpg - as ./images\cat\0cc50d149e.jpg
SUCCESS - saved https://c.files.bbci.co.uk/151AB/production/_111434468_gettyimages-1143489763.jpg - as ./images\cat\72c56919f9.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS6d87zy2l97Gbuz1xheO71Fzw31vhLFurSyg&usqp=CAU - as ./images\cat\250c131362.jpg
SUCCESS - saved https://images.unsplash.com/photo-1615789591457-74a63395c990?ixid=MnwxMjA3fDB8MHxzZWFyY2h8MXx8ZG9tZXN0aWMlMjBjYXR8ZW58MHx8MHx8&ixlib=rb-1.2.1&w=1000&q=80 - as ./images\cat\23545ddabd.jpg
SUCCESS - saved https://i.guim.co.uk/img/media/26392d05302e02f7bf4eb143bb84c8097d09144b/446_167_3683_2210/master/3683.jpg?width=1200&height=1200&quality=85&auto=format&fit=crop&s=49ed3252c0b2ffb49cf8b508892e452d - as .