# Scraping Facebook Career Webpages

## Prerequisites
* selenium
* one of the following (depending on which browser you're using)
  * firefox: [geckodriver](https://github.com/mozilla/geckodriver/releases/)
  * chrome/chromium: [chromedriver](http://chromedriver.chromium.org/)
  
## Useful Tutorials
* https://huilansame.github.io/huilansame.github.io/archivers/sleep-implicitlywait-wait
* https://wangxin1248.github.io/python/2018/09/python3-spider-8.html

## 1. Scraping a Facebook job description page
Use `scrape_job()`, defined below, to scrape a single job posting given its URL.

Example target: https://www.facebook.com/careers/jobs/2288919011218389/

In [14]:
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import TimeoutException
import selenium.webdriver.support.ui as ui

import pandas
import time
import csv

In [82]:
def scrape_job(url: str, wait: WebDriverWait, retry=3):
    """Scrape the job info from the specified URL.

    A browser driver MUST be initialized beforehand and bound to the
    module-level name ``driver``.

    :param url: the url of a detailed Facebook job page.
    :param wait: explicit wait (carries the timeout) used to block until the
        title and location elements are present.
    :param retry: maximum number of attempts.
    :return: a dict with the keys ``title``, ``loc``, ``minimum_qual``,
        ``preferred_qual`` and ``resp``; None if the page timed out or every
        attempt failed.
    """
    for _ in range(retry):
        driver.get(url)

        try:
            # Wait until the required elements are generated. ``until``
            # returns the located element, so no second lookup is needed.
            title_el = wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, '_8lfl')))
            location_el = wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, '_8lfn')))

            # ``find_elements(By...)`` replaces the ``find_elements_by_*``
            # helpers, which were removed in Selenium 4.
            sections = driver.find_elements(By.CLASS_NAME, '_8lfy')

            # The first three '_8lfy' sections are read, in order, as
            # responsibilities, minimum and preferred qualifications.
            # A page with fewer sections raises IndexError and is retried.
            responsibilities = sections[0].text
            minimum_qual = sections[1].text
            preferred_qual = sections[2].text

            return {
                'title': title_el.text,
                'loc': location_el.text,
                'minimum_qual': minimum_qual,
                'preferred_qual': preferred_qual,
                'resp': responsibilities
            }
        except TimeoutException:
            # A timeout is not retried: give up on this url immediately.
            return None
        except Exception as e:
            # Any other failure is reported and the page is loaded again.
            print(e)
            continue

    # If all retries have failed, return None.
    return None

In [83]:
# Start a headless Chrome instance; `driver` is the module-level browser
# that scrape_job() relies on.
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = Chrome(executable_path='/home/aesophor/Downloads/chromedriver', options=chrome_options)

# Always shut the browser down, even if scraping raises, so that no
# headless Chrome process is left running.
try:
    wait = WebDriverWait(driver, timeout=10)
    job = scrape_job(r'https://www.facebook.com/careers/jobs/2288919011218389/', wait)

    if job is not None:
        print(job)
finally:
    driver.quit()

{'title': 'Robotics Software Engineer', 'loc': 'MENLO PARK, CA', 'minimum_qual': 'BS degree in Computer Science or equivalent degree or experience\n3+ years coding experience using Python, PHP, or C++\nExperience in developing software applications or algorithms independently\nExperience communicating technical concepts to non-technical audiences\nExperience logically analyzing problems, as well as identifying constraints and boundaries to develop solution options\nExperience defining use cases, articulating requirements, and challenging design and quality during feature development', 'preferred_qual': 'MS or above degrees in Computer Science or related technical field\nExperience working full-stack (Back-end development, Software development, ROS development, etc.)\nExperience with robotic platforms such as ROS or ROS2\nKnowledge and experience in any of the following areas: Robot control systems (position, velocity, force control), Computer vision and imaging (or SLAM), Robot arms, F

---

## 2. Search & Scrape All Relevant Jobs

Use `scrape_jobs(keyword, wait, urls)`, defined below, to scrape every job relevant to a specific keyword.

Example: all jobs related to the keyword `software`.

In [4]:
def _collect_urls(wait: WebDriverWait, urls: list, page_count, url_count):
    """Collect the urls of the job postings listed on the search result pages.

    Starts from the result page currently shown by the browser that ``wait``
    is bound to and follows the ``NEXT`` link at most ``page_count`` times.

    :param wait: explicit wait (carries the timeout) used for every lookup.
    :param urls: list that receives the job urls; it is extended in place.
    :param page_count: maximum number of result pages to visit.
    :param url_count: expected total number of urls; only used in the
        progress output.
    """
    next_link = (By.XPATH, "//a[text()='NEXT']")

    for _ in range(page_count):
        try:
            # Pause briefly so the page can load before it is queried;
            # the lookups below fail otherwise.
            time.sleep(2)

            result_pane = wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, '_2ynk')))
            wait.until(ec.presence_of_element_located((By.CLASS_NAME, '_69jm')))
            cards = result_pane.find_elements(By.CLASS_NAME, '_69jm')

            urls.extend(card.get_attribute('href') for card in cards)
            print('\rCollecting urls... {}/{}'.format(len(urls), url_count), end='')

            # Wait for the NEXT link itself. If it cannot be found after
            # `timeout` seconds a TimeoutException is thrown, which ends
            # the loop below. The condition object is passed directly:
            # wrapping it in a lambda would return it uncalled (always
            # truthy) and skip the wait.
            wait.until(ec.presence_of_element_located(next_link)).send_keys(Keys.RETURN)

        except Exception as e:
            # Best effort: report the problem and keep what was collected.
            print(e)
            break
    print()

In [5]:
def scrape_jobs(keyword: str, wait: WebDriverWait, urls: list, start=1):
    """Scrape info of all jobs related to the specified keyword.

    The job urls are collected into ``urls`` (unless it already holds as
    many entries as the search reports), then every job from record
    ``start`` onwards is scraped and written to ``facebook_jobs.csv``.

    :param keyword: Facebook job search keyword.
    :param wait: explicit wait (carries the timeout).
    :param urls: urls cache; cleared and refilled in place when its length
        does not match the number of search results.
    :param start: 1-based number of the record to start scraping from,
        counted over the complete url list. Values below 1 are treated as 1.
    """
    from urllib.parse import quote_plus

    items_per_page = 10
    first_index = max(start - 1, 0)

    # Open the first page of the Facebook job search. Collection always
    # starts at page 1 so that `urls` holds every result and `start` can
    # index into it directly. The keyword is URL-encoded so that spaces
    # or '&' cannot break the query string.
    driver.get(r'https://www.facebook.com/careers/jobs/?q={}&page={}'.format(quote_plus(keyword), 1))

    result_pane = wait.until(ec.presence_of_element_located((By.CLASS_NAME, '_2ynk')))

    # The total is taken from between the parentheses of the '_6ci_'
    # element's text; thousands separators are tolerated.
    count_text = result_pane.find_element(By.CLASS_NAME, '_6ci_').text
    url_count = int(count_text.split('(')[1].split(')')[0].replace(',', ''))

    # Ceiling division: an exact multiple of the page size needs no
    # extra page.
    page_count = (url_count + items_per_page - 1) // items_per_page

    # Loop until there's no `next` hyperlink.
    print('Collecting urls...', end='')

    if len(urls) != url_count:
        urls.clear()
        _collect_urls(wait, urls, page_count, url_count)

    # newline='' is what the csv module requires so that the multi-line
    # fields are written correctly; utf-8 keeps non-ASCII job text from
    # failing on platforms with a narrower default encoding.
    # NOTE: the file is rewritten on every call, so rows from an earlier
    # partial run are not kept when resuming with start > 1.
    with open('facebook_jobs.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f, fieldnames=['title', 'loc', 'minimum_qual', 'preferred_qual', 'resp'])
        writer.writeheader()

        for number, job_url in enumerate(urls[first_index:], start=first_index + 1):
            print('\rProcessing ({}/{}): {}'.format(number, len(urls), job_url), end='')
            job = scrape_job(job_url, wait)

            # Jobs that could not be scraped are skipped.
            if job is not None:
                writer.writerow(job)

In [6]:
# Cache of the job urls to scrape. It is kept at module level and passed to
# scrape_jobs(), which clears and refills it only when its length does not
# match the number of search results.
urls = []

In [7]:
# Start a headless Chrome instance; `driver` is the module-level browser
# that scrape_jobs() and scrape_job() rely on.
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = Chrome(executable_path='/home/aesophor/Downloads/chromedriver', options=chrome_options)

# Always shut the browser down, even if scraping raises, so that no
# headless Chrome process is left running.
try:
    wait = WebDriverWait(driver, timeout=10)
    scrape_jobs('software', wait, urls, start=1)
finally:
    driver.quit()

NameError: name 'Firefox' is not defined