# Selenium Web Scraping and Browsing Automation

In [9]:
import IPython
from IPython.display import IFrame, Javascript, display

# selenium-python docs
url_selenium_python_docs = IFrame(
    src='https://selenium-python.readthedocs.io/',
    width='100%',
    height='600'
)

display(url_selenium_python_docs)

In [10]:
import selenium.webdriver

dir(selenium.webdriver)

['ActionChains',
 'Android',
 'BlackBerry',
 'Chrome',
 'ChromeOptions',
 'DesiredCapabilities',
 'Edge',
 'Firefox',
 'FirefoxOptions',
 'FirefoxProfile',
 'Ie',
 'IeOptions',
 'Opera',
 'PhantomJS',
 'Proxy',
 'Remote',
 'Safari',
 'TouchActions',
 'WebKitGTK',
 'WebKitGTKOptions',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'android',
 'blackberry',
 'chrome',
 'common',
 'edge',
 'firefox',
 'ie',
 'opera',
 'phantomjs',
 'remote',
 'safari',
 'support',
 'webkitgtk']

In [11]:
# !which chromedriver geckodriver safaridriver operadriver msedgedriver

#### Note: In your machine, locate where your drivers were installed

/usr/local/bin/chromedriver
/usr/local/opt/ruby/bin/geckodriver
/usr/bin/safaridriver
/usr/local/bin/operadriver
/Users/rodelarenas/Desktop/DEV/edgedriver_mac64/msedgedriver


In [12]:
from selenium.webdriver import Chrome, Firefox, Safari, Opera, Edge

import time

In [13]:
from selenium import webdriver

def get_chrome():
    return Chrome(executable_path='/usr/local/bin/chromedriver')

def get_firefox(headless=False):
    if headless:
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        return Firefox(executable_path='/usr/local/opt/ruby/bin/geckodriver', options=options)
    return Firefox(executable_path='/usr/local/opt/ruby/bin/geckodriver')

def get_safari():
    return Safari(executable_path='/usr/bin/safaridriver')

def get_opera():
    return Opera(executable_path='/usr/local/bin/operadriver')

def get_msedge():
    return Edge(executable_path='/Users/rodelarenas/Desktop/DEV/edgedriver_mac64/msedgedriver')

In [14]:
# Basic Navigation
BASE_URL = 'https://google.com'

# open a webdriver
firefox = get_firefox()

# maximize, minimize
firefox.maximize_window()
time.sleep(1)

# browsing
firefox.get(BASE_URL)
time.sleep(2)

firefox.get('https://www.gmanmi.com')
time.sleep(2)

# F / B
firefox.back()
time.sleep(2)

firefox.forward()
time.sleep(5)

# firefox.close()
firefox.quit()

In [15]:
# Basis Interaction
from selenium.webdriver.common.keys import Keys


In [16]:
dir(Keys)

['ADD',
 'ALT',
 'ARROW_DOWN',
 'ARROW_LEFT',
 'ARROW_RIGHT',
 'ARROW_UP',
 'BACKSPACE',
 'BACK_SPACE',
 'CANCEL',
 'CLEAR',
 'COMMAND',
 'CONTROL',
 'DECIMAL',
 'DELETE',
 'DIVIDE',
 'DOWN',
 'END',
 'ENTER',
 'EQUALS',
 'ESCAPE',
 'F1',
 'F10',
 'F11',
 'F12',
 'F2',
 'F3',
 'F4',
 'F5',
 'F6',
 'F7',
 'F8',
 'F9',
 'HELP',
 'HOME',
 'INSERT',
 'LEFT',
 'LEFT_ALT',
 'LEFT_CONTROL',
 'LEFT_SHIFT',
 'META',
 'MULTIPLY',
 'NULL',
 'NUMPAD0',
 'NUMPAD1',
 'NUMPAD2',
 'NUMPAD3',
 'NUMPAD4',
 'NUMPAD5',
 'NUMPAD6',
 'NUMPAD7',
 'NUMPAD8',
 'NUMPAD9',
 'PAGE_DOWN',
 'PAGE_UP',
 'PAUSE',
 'RETURN',
 'RIGHT',
 'SEMICOLON',
 'SEPARATOR',
 'SHIFT',
 'SPACE',
 'SUBTRACT',
 'TAB',
 'UP',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__size

In [17]:
BASE_URL = 'https://trends.google.com/trends/trendingsearches/daily?geo=PH'

firefox = get_firefox()
firefox.get(BASE_URL)
time.sleep(.2)

firefox.maximize_window()

dataset = []
for _ in range(5):
    try:
        # pause
        time.sleep(2)

        # element should be visible from the screen before you apply your action/s
        firefox.execute_script('window.scrollBy(0, document.body.scrollHeight)', '') # need to scroll the page to find the element

        # locate the element
        load_more_class = 'feed-load-more-button'
        load_more_button = firefox.find_element_by_class_name(load_more_class)

        # do the interaction
        load_more_button.click()

        # get the feed wrapper
        feed_wrapper_class = 'feed-list-wrapper'
        feeds = firefox.find_elements_by_class_name(feed_wrapper_class)

        for fw in feeds:
            # dates
            dates_class = 'content-header-title'
            dates = fw.find_element_by_class_name(dates_class)
            feed_date = dates.text

            # titles
            titles_class = 'title'
            titles = fw.find_elements_by_class_name(titles_class)
            txt_titles = [t.text for t in titles]

            # summary texts
            summary_text_class = 'summary-text'
            summary_text =  fw.find_elements_by_class_name(summary_text_class)
            txt_summaries = [s.text for s in summary_text]

            # sources
            source_time_class = 'source-and-time'
            sources =  fw.find_elements_by_class_name(source_time_class)
            txt_sources = [sp.text for sp in sources]

            for idx, _ in enumerate(txt_titles):
                title = txt_titles[idx]
                summary_text = txt_summaries[idx]
                sources_text = txt_sources[idx]
                sources_text = sources_text.split()
                source_page = sources_text[0]
                posted_date_time = sources_text[2] + ' ' + sources_text[3]

                data = {
                    'date': feed_date,
                    'title': title,
                    'summary_text': summary_text,
                    'source_page': source_page,
                    'posted_date_time': posted_date_time
                }

                dataset.append(data)

    except:
        pass



import pandas as pd
df = pd.DataFrame(dataset)
df

Unnamed: 0,date,title,summary_text,source_page,posted_date_time
0,"Thursday, December 16, 2021",Mavericks vs Lakers,"How to watch Mavericks vs. Lakers: TV channel,...",CBSSports.com,7h ago
1,"Thursday, December 16, 2021",76ers vs Heat,"Heat vs. 76ers odds, line, spread: 2021 NBA pi...",CBSSports.com,7h ago
2,"Thursday, December 16, 2021",Cavaliers vs Rockets,How to watch Cavaliers vs. Rockets: NBA live s...,CBSSports.com,2h ago
3,"Thursday, December 16, 2021",Bucks vs Pacers,"Pacers Game Tonight: Pacers at Bucks Odds, Inj...",8,9 Seconds
4,"Thursday, December 16, 2021",Oppo Find N,Oppo's Find N is an impressive first folding p...,The,• 18h
...,...,...,...,...,...
335,"Saturday, December 11, 2021",Julia,'Titane' Filmmaker Julia Ducournau on How This...,Variety,5d ago
336,"Saturday, December 11, 2021",Raptors vs Knicks,"How to watch Raptors vs. Knicks: TV channel, N...",CBSSports.com,5d ago
337,"Saturday, December 11, 2021",Timberwolves vs Cavaliers,"Timberwolves vs. Cavaliers odds, line: 2021 NB...",CBSSports.com,5d ago
338,"Saturday, December 11, 2021",Miss Universe 2021 schedule Philippines,How to Watch Miss Universe 2021 Online,Billboard,3d ago


In [18]:
df.to_csv('scraped_data.csv', index=False)