# Scraping Google Career Webpages

## Prerequisites
* selenium
* one of the following (depending on which browser you're using)
  * firefox: [geckodriver](https://github.com/mozilla/geckodriver/releases/)
  * chrome/chromium: [chromedriver](http://chromedriver.chromium.org/)
  
## Useful Tutorials
* https://huilansame.github.io/huilansame.github.io/archivers/sleep-implicitlywait-wait
* https://wangxin1248.github.io/python/2018/09/python3-spider-8.html

In [1]:
class GoogleJob:
    """ Wraps job title, location, minimum/preferred qualifications and responsibilities """
    def __init__(self, title, location, minimum_qual, preferred_qual, responsibilities):
        self.title = title
        self.location = location
        self.minimum_qual = minimum_qual
        self.preferred_qual = preferred_qual
        self.responsibilities = responsibilities

## 1. Scraping a single page into a GoogleJob object
https://careers.google.com/jobs/results/6163626811654144-front-end-software-engineer/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=software&sort_by=relevance

In [7]:
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import time
import csv

In [8]:
def _extract(class_name: str):
    """ Extracts the specified element by class name 
    :return: readable text in the element.
    """
    return driver.find_element_by_class_name(class_name).text

In [9]:
def scrape_url(url: str, wait: WebDriverWait):
    """ Scrape the job info from the specified Url. A broswer driver MUST be initialized beforehand.
    :param url: the url of a detailed google job page.
    :param wait: contains timeout.
    :return: a GoogleJob object.
    """
    driver.get(url)
    
    # Wait until all required elements are generated.
    wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'gc-card__title')))
    wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'gc-job-tags__location')))
    wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'gc-job-qualifications')))
    wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'gc-job-detail__section--responsibilities')))
    
    # Extract job information.
    title = _extract('gc-card__title')
    location = _extract('gc-job-tags__location')
    qualifications = _extract('gc-job-qualifications').split('\n\n')
    minimum_qual = qualifications[0].replace('Minimum qualifications:\n', '')
    preferred_qual = qualifications[1].replace('Preferred qualifications:\n', '')
    responsibilities = _extract('gc-job-detail__section--responsibilities').replace('Responsibilities\n', '', 1)
    
    return GoogleJob(title, location, minimum_qual, preferred_qual, responsibilities)

In [10]:
options = Options()
options.add_argument('-headless')
driver = Firefox(executable_path='/opt/firefox/geckodriver', options=options)

wait = WebDriverWait(driver, timeout=10)
job = scrape_url(r'https://careers.google.com/jobs/results/6163626811654144-front-end-software-engineer/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=software&sort_by=relevance', wait)
print(vars(job))

driver.quit()

{'title': 'Front End Software Engineer', 'location': 'Pittsburgh, PA, USA', 'minimum_qual': 'BA/BS degree or equivalent practical experience.\n1 year of work experience in software development.\nExperience with server-side web frameworks such as JSP or ASP.Net.\nDevelopment experience in C, C++ or Java and experience designing modular, object-oriented JavaScript.', 'preferred_qual': '4 years of relevant work experience, including web application experience or skills using AJAX, HTML, CSS or JavaScript.\nProgramming experience in GWT.\nExperience with user interface frameworks such as XUL, Flex, AJAX, and XAML.\nKnowledge of user interface design.', 'responsibilities': "Build next-generation web applications with a focus on the client side.\nRedesign UI's, Implement new UI's, and pick up Java as necessary.\nEngage with back-end systems."}


## Write the dict representation of a GoogleJob object to CSV

In [12]:
csv_file = 'google_jobs.csv'
job_dict = vars(job)

with open(csv_file, 'w') as f:
    w = csv.DictWriter(f, job_dict.keys())
    w.writeheader()
    w.writerow(job_dict)
    
print('File written: ' + csv_file)

File written: google_jobs.csv


## Csv to Pandas DataFrame

In [13]:
import pandas

In [14]:
dat = pandas.read_csv(csv_file)
dat

Unnamed: 0,title,location,minimum_qual,preferred_qual,responsibilities
0,Front End Software Engineer,"Pittsburgh, PA, USA",BA/BS degree or equivalent practical experienc...,"4 years of relevant work experience, including...",Build next-generation web applications with a ...


---

## 2. Search & Scrape All Relevant Jobs

In [15]:
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException

In [23]:
options = Options()
options.add_argument('-headless')
driver = Firefox(executable_path='/opt/firefox/geckodriver', options=options)

In [24]:
# Open Google job search page.
google_job_search_url = r'https://careers.google.com/jobs/results/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=software&sort_by=relevance'
driver.get(google_job_search_url);

# Type 'software' as the keyword in the searchbox and press RETURN.
searchbox_name = 'q'
driver.find_element_by_name(searchbox_name).send_keys('software')
driver.find_element_by_name(searchbox_name).send_keys(Keys.RETURN)

# Loop until there's no `next` hyperlink.
wait = WebDriverWait(driver, timeout=10)
urls = []

while True:
    try:
        wait.until(ec.presence_of_element_located((By.ID, 'search-results')))
        wait.until(ec.presence_of_element_located((By.XPATH, "//a[@data-gtm-ref='job-results-card']")))
        
        result_pane = driver.find_element_by_id('search-results')
        cards = result_pane.find_elements_by_xpath("//a[@data-gtm-ref='job-results-card']")
        
        for card in cards:
            urls.append(card.get_attribute('href'))
        
        # If `next` cannot be found after `timeout` seconds, it will throw a TimeoutException
        # then we can break the loop.
        wait.until(ec.presence_of_element_located((By.XPATH, "//a[@data-gtm-ref='search-results-next-click']")))
        
        # Click on `next`
        driver.find_element_by_xpath("//a[@data-gtm-ref='search-results-next-click']").send_keys(Keys.RETURN)
    except:
        break

print(urls)

['https://careers.google.com/jobs/results/6163626811654144-front-end-software-engineer/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=softwaresoftware&sort_by=relevance', 'https://careers.google.com/jobs/results/6343639863328768-software-engineer-html5-video-google-cloud-platform/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=softwaresoftware&sort_by=relevance', 'https://careers.google.com/jobs/results/5590805127561216-front-end-software-engineer-youtube/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=softwaresoftware&sort_by=relevance', 'https://careers.google.com/jobs/results/5175961349980160-software-engineer-google-home/?company=Google&company=YouTube&employment_type=FULL_TIME&hl=en_US&jlo=en_US&q=softwaresoftware&sort_by=relevance', 'https://careers.google.com/jobs/results/5629038607663104-software-engineer-front-end-development/?company=Google&company=YouTube&employment_type=FULL_TIME&h

In [25]:
# For each url in urls, scrape_url(). This could take a moment.
jobs = [scrape_url(url, wait) for url in urls]
driver.quit()
jobs

[<__main__.GoogleJob at 0x7fe12f7eb7f0>,
 <__main__.GoogleJob at 0x7fe12f7eb2e8>,
 <__main__.GoogleJob at 0x7fe12f7eb668>,
 <__main__.GoogleJob at 0x7fe12f7eb470>,
 <__main__.GoogleJob at 0x7fe12f7eb748>,
 <__main__.GoogleJob at 0x7fe12f7eb278>,
 <__main__.GoogleJob at 0x7fe12f7eb6a0>,
 <__main__.GoogleJob at 0x7fe12f7eb128>,
 <__main__.GoogleJob at 0x7fe12f7eb630>,
 <__main__.GoogleJob at 0x7fe12f7eb160>,
 <__main__.GoogleJob at 0x7fe12f7eb390>,
 <__main__.GoogleJob at 0x7fe12f7eb400>,
 <__main__.GoogleJob at 0x7fe12f7eb908>,
 <__main__.GoogleJob at 0x7fe12f7eb828>,
 <__main__.GoogleJob at 0x7fe12f7eb8d0>,
 <__main__.GoogleJob at 0x7fe12f7eb7b8>,
 <__main__.GoogleJob at 0x7fe12f7eb940>,
 <__main__.GoogleJob at 0x7fe12f7eb9b0>,
 <__main__.GoogleJob at 0x7fe12f7eba58>,
 <__main__.GoogleJob at 0x7fe12f7eb9e8>,
 <__main__.GoogleJob at 0x7fe12f7eba20>,
 <__main__.GoogleJob at 0x7fe12f7eb978>,
 <__main__.GoogleJob at 0x7fe12f7eba90>,
 <__main__.GoogleJob at 0x7fe12f7ebb38>,
 <__main__.Googl

In [26]:
csv_file = 'google_jobs.csv'

with open(csv_file, 'w') as f:
    w = csv.DictWriter(f, vars(jobs[0]).keys())
    w.writeheader()
    
    for job in jobs:
        w.writerow(vars(job))
    
print('File written: ' + csv_file)

File written: google_jobs.csv
