# Web Scraping demo - for GIS Day 2021
A (mostly) brief demo of scraping web data for fun and profit

## To do
 - upgrade jupyter lab to 3.2.3 for celltags (do in venv)

## Why scrape?

# Scraping demo

### Prerequisite: install a webdriver extension

### Imports

In [1]:
import base64
import io
import json
import os
import subprocess
import time

import pandas
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from xvfbwrapper import Xvfb

### Settings

In [2]:
# Page that hosts the jobs table we want to scrape
pageurl = 'https://www.gjc.org/cgi-bin/listjobs.pl?view=table'
# Set to True to run Firefox without opening a visible window
headless = False

fp = webdriver.FirefoxProfile()
options = Options()
# Ask for headless mode with Firefox's own command-line flag; the
# `Options.headless` setter is deprecated in Selenium 4.
if headless:
    options.add_argument('-headless')

### Open the driver

In [3]:
# Attach the profile to the options object rather than passing it
# positionally; the positional profile argument is deprecated in Selenium 4.
options.profile = fp
driver = webdriver.Firefox(options=options)

### Open the page

In [4]:
# Point the browser at the job listings page configured in `pageurl`
driver.get(pageurl)

### Find the HTML element that contains what we want
In the browser's inspector, we find that the element we want is the `table` whose HTML `id` attribute is `jobs_table`

In [8]:
# Locate the table by its id (find_element_by_id is deprecated in Selenium 4),
# then grab its markup, including the <table> tag itself
table = driver.find_element(By.ID, 'jobs_table')
table_html = table.get_attribute('outerHTML')

### Read the table with pandas

In [19]:
# read_html creates a list of DFs, we need the first one.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in newer pandas releases.
df = pandas.read_html(io.StringIO(table_html))[0]

# Let's turn this into a loop

In [55]:
# First, shut down the driver we opened earlier. quit() ends the whole
# browser session and its driver process, whereas close() only closes
# the current window.
driver.quit()

In [81]:
# Empty DataFrame that will accumulate the rows scraped from every page
jobs_df = pandas.DataFrame(
    columns=[
        'Date',
        'Title',
        'Organization',
        'Location',
    ],
)

In [80]:
# Start a fresh browser session for the multi-page scrape. The profile is
# attached to the options object rather than passed positionally; the
# positional profile argument is deprecated in Selenium 4.
options.profile = fp
driver = webdriver.Firefox(options=options)
driver.get(pageurl)

## What are our page numbers?
We need to know when to stop hitting the "Next" button...

In [58]:
# Collect the visible label of every pagination button
# (find_elements_by_class_name is deprecated in Selenium 4)
pages_ribbon = [
    button.text
    for button in driver.find_elements(By.CLASS_NAME, 'paginate_button')
]
pages_ribbon

['Previous', '1', '2', '3', '4', '5', '12', 'Next']

In [59]:
# The ribbon's last label is 'Next', so the label just before it is the
# highest page number
last_page_label = pages_ribbon[-2]
total_pages = int(last_page_label)
total_pages

12

### Loop through the pages and save them to the dataframe

In [82]:
# Scrape every page -- including the last one -- and only click "Next"
# while there is still another page to move to.
page_tables = []
for page_number in range(1, total_pages + 1):
    table_html = driver.find_element(By.ID, 'jobs_table').get_attribute('outerHTML')
    # read_html returns a list of DataFrames; the jobs table is the first one.
    # StringIO avoids the deprecated "literal HTML string" input form.
    page_tables.append(pandas.read_html(io.StringIO(table_html))[0])

    if page_number < total_pages:
        driver.find_element(By.ID, 'jobs_table_next').click()

        # Delay a little bit so we can see it happen
        time.sleep(.5)

# pandas.concat expects an iterable of frames. Concatenating once at the end
# avoids re-copying the accumulated rows on every page, and ignore_index
# gives each row a unique index instead of repeating per-page row numbers.
jobs_df = pandas.concat([jobs_df, *page_tables], ignore_index=True)
print('done!')

done!


### Save the data to a CSV

In [87]:
# Write only the scraped columns; the DataFrame's row index is not part of
# the scraped data, so it is left out of the file.
jobs_df.to_csv('out.csv', index=False)