# Selenium Image Scraper

In [81]:
# Functions below are modified from code at https://github.com/rmei97/shiba_vs_jindo/blob/master/image_scraper.ipynb

def null_count(list_):
    '''Count the null entries in a list of scraped image links.

    Parameters
    ----------
    list_ : iterable
        Scraped link values; entries may be None.

    Returns
    -------
    int
        The number of entries that are None.
    '''
    # Identity check: only None itself counts as null, so falsy values such
    # as an empty string or 0 are never miscounted.
    return sum(1 for element in list_ if element is None)


def get_img_links(driver, max_imgs):
    '''Collect up to max_imgs image links from the page loaded in the driver.

    Finds the elements matched by the class name 'rg_i.Q4LuWd' and reads both
    the 'src' and the 'data-src' attribute of each one. The attribute list
    with fewer nulls (per null_count) is used as the primary source; on a tie
    'src' is primary. Wherever the primary list has a null, the value at the
    same position in the other list is used instead.

    Elements for which both attributes are null are dropped, so the returned
    list never contains None.

    Parameters
    ----------
    driver : selenium webdriver
        Driver with the results page already loaded.
    max_imgs : int
        Maximum number of links to return.

    Returns
    -------
    list
        At most max_imgs links, in page order. Shorter if fewer usable links
        were found.
    '''
    image_elements = driver.find_elements_by_class_name('rg_i.Q4LuWd')

    src_links = [element.get_attribute('src') for element in image_elements]
    data_src_links = [element.get_attribute('data-src') for element in image_elements]

    # The list with fewer nulls is the primary source; a tie favours 'src'.
    if null_count(src_links) > null_count(data_src_links):
        primary, fallback = data_src_links, src_links
    else:
        primary, fallback = src_links, data_src_links

    # Fill each gap in the primary list from the same position of the other.
    merged = [
        first if first is not None else second
        for first, second in zip(primary, fallback)
    ]

    # Drop positions where neither attribute held a link: a None entry cannot
    # be downloaded and would only fail later.
    links = [link for link in merged if link is not None]

    # Slicing already handles lists shorter than max_imgs.
    return links[:max_imgs]

# def urllibfunction(link_list, data_name):
#     '''Takes in link list, uses urllib.request to retrieve each image from corresponding link, 
#         and renames it using data_name param entered in image_scrape function. 
#     '''
#     sleeps = [1,0.5,1.5,0.7]
#     for i,link in enumerate(link_list):

#         name = data_name+f'{i}.jpeg'

#         urllib.request.urlretrieve(link, name)
#         time.sleep(np.random.choice(sleeps))



def image_scrape(data_name, url, max_imgs=1000):
    """Scrape images from a results page into the current working directory.

    Opens `url` in a Chrome driver and scrolls to the bottom of the page until
    the page height stops growing. When it stops, the first element with class
    'mye4qd' is clicked (the "load more" control, per the original comment)
    and scrolling continues; once there is no such element, or it cannot be
    clicked, scrolling ends. The links are then gathered with get_img_links
    and each one is downloaded with a short random pause in between.

    data_name = which classifier you're scraping, enter as string; it is also
        the file-name prefix, files are saved as '<data_name><index>.jpeg'
    url = url to scrape, enter as string
    max_imgs = set max number of imgs you want, default is 1000

    Files are written relative to the current working directory, so change
    into the target folder before calling. Returns None.

    A link that fails to download is reported and skipped instead of aborting
    the whole run, and the browser is always closed, even on error.
    """

    import urllib.request
    import time
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    import numpy as np
    from tqdm import tqdm
    # Forget progress bars that tqdm still tracks (presumably leftovers from
    # earlier, interrupted runs in the same session).
    tqdm._instances.clear()

    # instantiate driver
    driver = webdriver.Chrome('chromedriver')

    # try/finally guarantees the browser is closed whatever happens below.
    try:
        driver.get(url)

        page_scroll_sleep = 2

        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(page_scroll_sleep)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")

            if new_height == last_height:
                # Page stopped growing: try the 'mye4qd' control once more.
                try:
                    buttons = driver.find_elements_by_class_name('mye4qd')  # returns list
                    buttons[0].click()
                except (IndexError, WebDriverException):
                    # No such element, or it cannot be clicked: nothing more
                    # to load. Other errors (e.g. KeyboardInterrupt) propagate.
                    break
            last_height = new_height

        # gets link list of images
        links = get_img_links(driver, max_imgs)

        sleeps = [1, 0.5, 1.5, 0.7]

        # urllib saves images into the working directory, named from data_name
        for i, link in enumerate(tqdm(links)):
            if link is None:
                # Nothing to download for this position.
                continue

            name = data_name + f'{i}.jpeg'

            try:
                urllib.request.urlretrieve(link, name)
            except (OSError, ValueError) as err:
                # URLError/HTTPError are OSError subclasses; ValueError covers
                # malformed links. Report and move on to the next image.
                tqdm.write(f'[!] skipped image {i}: {err}')

            # Random pause between requests.
            time.sleep(np.random.choice(sleeps))
    finally:
        driver.quit()

In [51]:
# Print the current working directory (IPython shell escape).
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/mod5-capstone-project


In [78]:
# Move up one level to the directory that will contain the image folder,
# then print the new working directory.
import os
os.chdir('..')
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone


In [43]:
# Create the folder that will hold the scraped images. The name is arbitrary,
# but the following cell changes into 'example_images', so keep both in sync.
!mkdir example_images

In [79]:
# Make the image folder the working directory, then print it. The class
# folders created by the scraping cell are made relative to this location.
import os
os.chdir('example_images')
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images


In [82]:
import os

# Scrape images for both classes.
# Each URL is a Google Images results page (tbm=isch). image_scrape saves
# files into the current working directory, so the cell changes into the
# class folder before each call and back out afterwards.
# NOTE(review): land_url carries q=map plus a 'tbs=simg:' filter -- presumably
# a similar-image search seeded from a map/aerial tile; confirm it still
# returns the intended images.
land_url = 'https://www.google.com/search?hl=en&tbs=simg:CAES6wEJKEu1gh2na2Ua3wELELCMpwgaYgpgCAMSKIce-grLFdIV0RWGHv8K4gvOFswW3TfyN5441TfTPtk31jflIvk-1j4aMA6IKyiDkAQpwLJ7MMmlUArNfMlSj7N1ZmonlXv7jSf5F-_1WX-X6HpwmniQPXpJaZiAEDAsQjq7-CBoKCggIARIE5QPwiQwLEJ3twQkaWAoVCgNtYXDapYj2AwoKCC9tLzA0X3RiCiUKEmFlcmlhbCBwaG90b2dyYXBoedqliPYDCwoJL20vMDF3NWNfChgKBWJlaWdl2qWI9gMLCgkvbS8wM2h6Y3MM&q=map&tbm=isch&sa=X&ved=2ahUKEwjW4tvRrN_qAhVTIjQIHf4YB7MQsw56BAgNEAE&biw=1430&bih=761'
delta_url = 'https://www.google.com/search?q=river+delta+satellite&hl=en&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj5u5ai8PrqAhWDLn0KHbjKDjMQ_AUoAXoECGIQAw&biw=1430&bih=761'

# make a folder for land tiles and work inside it
!mkdir land
os.chdir('land')

display('[i] SCRAPING LAND IMAGES')
# Files are saved as land0.jpeg, land1.jpeg, ... in the current folder.
image_scrape('land', land_url, max_imgs=1000)

# go back to images folder
# NOTE: if image_scrape raises, this line is not reached and the working
# directory stays inside 'land'; check with !pwd before re-running the cell.
os.chdir('..')

# create river_delta folder and work inside it
!mkdir river_delta
os.chdir('river_delta')

display('[i] SCRAPING DELTA IMAGES')
# Files are saved as river_delta0.jpeg, river_delta1.jpeg, ... in the current folder.
image_scrape('river_delta', delta_url, max_imgs=1000)

# go back to images folder
os.chdir('..')

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images/land


'[i] SCRAPING LAND IMAGES'

 39%|███▉      | 334/858 [07:34<11:52,  1.36s/it]
  3%|▎         | 29/866 [16:00<7:41:55, 33.11s/it]
 11%|█▏        | 97/858 [13:32<1:46:16,  8.38s/it]
100%|██████████| 858/858 [15:17<00:00,  1.07s/it]


/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images/river_delta


'[i] SCRAPING DELTA IMAGES'

100%|██████████| 545/545 [09:39<00:00,  1.06s/it]


# Move files into separate train and test folders

In [87]:
# Print the current working directory; the train/test folders below are
# created relative to it.
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images


In [88]:
# Create the train/ and test/ folders in the current working directory.
!mkdir train
!mkdir test

# Inside train/, create one sub-folder per class.
os.chdir('train')
!mkdir land
!mkdir river_delta

# Step back out and create the same class sub-folders inside test/.
os.chdir('..')
os.chdir('test')
!mkdir land
!mkdir river_delta

# Return to the starting folder and print it.
os.chdir('..')
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images


In [89]:
# Destination folders for the train/test split, one per class
# (relative paths, resolved against the current working directory).
train_dir_land, train_dir_delta = 'train/land', 'train/river_delta'
test_dir_land, test_dir_delta = 'test/land', 'test/river_delta'

In [90]:
import shutil
import os
import random

# Source class folders and their matching test/train destinations; the three
# lists are aligned by position.
list_ = ['land/', 'river_delta/']
test_list = [test_dir_land, test_dir_delta]
train_list = [train_dir_land, train_dir_delta]

for class_dir, test_dir, train_dir in zip(list_, test_list, train_list):

    class_files = os.listdir(class_dir)

    # A random quarter (rounded down) of the class goes to the test folder.
    test_files = random.sample(class_files, len(class_files) // 4)
    for file_name in test_files:
        shutil.move(class_dir + file_name, test_dir)

    # Re-list the folder: whatever was not moved above is the training share.
    for file_name in os.listdir(class_dir):
        shutil.move(class_dir + file_name, train_dir)

In [91]:
# Print the current working directory; the rmdir commands in the next cell
# are relative to it.
!pwd

/Volumes/HD storage/Documents/Flatiron/Modules/v2.1mod5_capstone/example_images


In [92]:
# Remove the original class folders, which the split above has emptied.
# rmdir only removes empty directories, so a folder that still holds files
# is left in place.
!rmdir land
!rmdir river_delta