In [7]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup as soup

# Selenium Tutorial: 
# https://medium.com/@hoppy/
# how-to-test-or-scrape-javascript-rendered-websites-with-python-selenium-a-beginner-step-by-c137892216aa

In [85]:
test_site = "https://www.glassdoor.com/Job/" +\
            "jobs.htm?suggestCount=0&suggestChosen=false" +\
            "&clickSource=searchBtn&typedKeyword=data+scientist" +\
            "&sc.keyword=data+scientist&locT=&locId=&jobType="

deloitte = "https://www.glassdoor.com/Job/" +\
           "jobs.htm?suggestCount=0&suggestChosen=false" +\
           "&clickSource=searchBtn&typedKeyword=data+scientist+booz+allen+hamilton" +\
           "&sc.keyword=data+scientist+booz+allen+hamilton&locT=C&locId=1138213&jobType="

In [86]:
def simple_get(url):
    try:
        with closing(get(url, stream=True)) as resp:
            print(resp)
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

def log_error(e):
    print(e)

In [97]:
import urllib

In [100]:
from urllib import request

In [102]:
request.urlopen(test_site)

HTTPError: HTTP Error 403: Forbidden

In [87]:
# raw_html = simple_get(test_site)
deloitte

'https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist+booz+allen+hamilton&sc.keyword=data+scientist+booz+allen+hamilton&locT=C&locId=1138213&jobType='

# Selenium Webdriver

In [132]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Create the link to load with webdriver
def build_link(term, locT="", locId="", jobType=""):
    domain = "https://www.glassdoor.com/Job/jobs.htm"
    header = "?suggestCount=0&suggestChosen=false&clickSource=searchBtn&"
    query = "typedKeyword={0}&sc.keyword={0}".format(term.replace(" ", "+"))
    location = "&locT{}=&locId{}=&jobType={}".format(locT, locId, jobType)
    return(domain + header + query + location)

### Webdriver Downloads

Downloads page on Selenium: https://www.seleniumhq.org/download/

In [121]:
chromedriver = "/Users/glenabastillas/Documents/BAH/Hackathon/Gender Parity/drivers/chromedriver"
firefoxdriver = "/Users/glenabastillas/Documents/BAH/Hackathon/Gender Parity/drivers/geckodriver"

In [122]:
driver = webdriver.Chrome(executable_path=chromedriver)

In [123]:
driver.get(build_link('data scientist'))

In [124]:
assert "Glassdoor" in driver.title

In [92]:
elem = driver.find_element_by_tag_name("div")

In [93]:
xpath = "//*[@class='jobDescriptionContent desc']"

In [125]:
elem = driver.find_element_by_xpath(xpath)

In [130]:
help(elem.get_attribute)

Help on method get_attribute in module selenium.webdriver.remote.webelement:

get_attribute(name) method of selenium.webdriver.remote.webelement.WebElement instance
    Gets the given attribute or property of the element.
    
    This method will first try to return the value of a property with the
    given name. If a property with that name doesn't exist, it returns the
    value of the attribute with the same name. If there's no attribute with
    that name, ``None`` is returned.
    
    Values which are considered truthy, that is equals "true" or "false",
    are returned as booleans.  All other non-``None`` values are returned
    as strings.  For attributes or properties which do not exist, ``None``
    is returned.
    
    :Args:
        - name - Name of the attribute/property to retrieve.
    
    Example::
    
        # Check if the "active" CSS class is applied to an element.
        is_active = "active" in target_element.get_attribute("class")



#### Practice Looping through Elements

In [127]:
xpath = "//*[@class='noPad']"
for e in driver.find_element_by_xpath(xpath):
    print(e)

TypeError: 'WebElement' object is not iterable

In [126]:
elem.text.split('\n')

["Work that matters, fueled by passion for pets! At Hill’s we have a purpose. Every day around the world, we transform the lives of millions of pet families through pioneering innovation, amazing nutrition, and the best and brightest people. Founded more than 75 years ago with an unwavering commitment to pet nutrition, Hills' mission is to help enrich and lengthen the special relationships between people and their pets.",
 '',
 "HILL'S® Prescription Diet® therapeutic pet foods, HILL'S® Science Diet® and HILL'S® Ideal Balance™ wellness pet foods are sold worldwide. Hill’s is a division of Colgate-Palmolive, a leading global consumer products company, tightly focused on Oral Care, Personal Care, Home Care and Pet Nutrition, with sales of products in more than 200 countries. To learn more about Hill's and Colgate, please visit http://www.hillspet.com and http://www.colgatepalmolive.com, or find us on LinkedIn, Facebook, Twitter and YouTube.",
 '',
 'Location: Topeka, Kansas, United States

In [83]:
elem

<selenium.webdriver.remote.webelement.WebElement (session="CAF9C4CA-E353-4524-B7A5-25816053FD3D", element="node-8041F5DB-3645-4D46-86FF-D351C352717E")>

In [4]:
# Import modules required to connect to the website with an API
from selenium import webdriver
#from bs4 import BeautifulSoup # For HTML parsing
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
from selenium.webdriver.common import action_chains, keys
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import sys


from helperP3 import load_obj, save_obj, init_glassdoor, searchJobs, text_cleaner, get_pause


ModuleNotFoundError: No module named 'helperP3'

In [2]:
# -*- coding: utf-8 -*-


def glassdoorScrape(get_short = False):
    
    """
    Created on Tue Aug 16 22:41:30 2016
    Scrape Glassdoor website using SELENIUM
    @author: Diego De Lazzari
    """

    # call the helper
    
    
        # 1- Load existing dictionary. Check for initial dictionary. 
        # If empty initialize
            
    try:               
        jobDict = load_obj('glassDoorDict')
        link =    load_obj('glassDoorlink')
    except:
        save_obj([], 'glassDoorlink')
        save_obj({}, 'glassDoorDict')
        
        jobDict = load_obj('glassDoorDict')
        link =    load_obj('glassDoorlink')    
    
    # 2- Choose what you want to do: 
#        get_shot => Scraping for links, 
#        get_long => Scraping for data,


    get_long = (not get_short)
    
    if get_short or get_long:
        
    # 3- initialize website, cities and jobs
        
        website = "https://www.glassdoor.com/index.htm"
        
        jobName_lst = ['Data Scientist', 'Data Analyst']
        jobName = np.random.choice(jobName_lst)
    
        city_lst = ['San Jose','New York','San Francisco','Detroit','Washington','Austin','Boston','Los Angeles',' ']
        city = np.random.choice(city_lst)        
        
        # Initialize the webdriver
        
        browser = init_glassdoor()  
    
    # 4- Scrape the short list or the links (when you ae done, both are false)
    
    
    if get_short:
    
        browser.get(website)
            
        # search for jobs (short description) 
        try:    
                    update_jobDict, update_link = searchJobs(jobName, city, jobDict, link)
#                    sleep(get_pause())
        except:
            sys.exit("Error message")
            
        # save dictionary and link     
    
        save_obj(update_jobDict, 'glassDoorDict')
        save_obj(update_link, 'glassDoorlink')
        
     # 5- Scrape the job description, for every link
                    
    if get_long:        
        
        while len(link) > 0:
            
             
            try:
                rnd_job = np.random.choice(range(len(link)))
                
                ids = link[rnd_job][0]
                page = link[rnd_job][1]
                
                browser.get(page)                 
                sleep(3)
                
                # Extract text   //*[@id="JobDescContainer"]/div[1]
                desc_list = browser.find_element_by_xpath('//*[@id="JobDescContainer"]/div[1]').text
                description = text_cleaner(desc_list)
                
                # Update dictionary and remove succe
                jobDict[ids].append(description)               
                dummy=link.pop(rnd_job)
                               
                # if everything is fine, save
                save_obj(jobDict, 'glassDoorDict')
                save_obj(link, 'glassDoorlink')
                                                
                print( 'Scraped successfully ' + ids)
                
                sleep(get_pause())
            except:   
                print( ids + ' is not working! Sleep for 10 seconds and retry')
                print( 'Still missing ' + str(len(link)) + ' links' )
                sleep(8)
                
        browser.close()
        
    return