In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time, random
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_company_info(driver):
    """Inputs webdriver and returns 
    
    Parameters
    ----------
    job_role : string
        The job role string to search
        
    num_jobs : int
        The number of jobs to fetch from glassdoor
        
    Returns
    -------
    list
        a pandas 
    """
    try:
        driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()

        try:
            headquarters = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
        except NoSuchElementException:
            headquarters = -1

        try:
            size = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
        except NoSuchElementException:
            size = -1

        try:
            founded = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
        except NoSuchElementException:
            founded = -1

        try:
            type_of_ownership = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
        except NoSuchElementException:
            type_of_ownership = -1

        try:
            industry = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
        except NoSuchElementException:
            industry = -1

        try:
            sector = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
        except NoSuchElementException:
            sector = -1

        try:
            revenue = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
        except NoSuchElementException:
            revenue = -1

        try:
            competitors = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
        except NoSuchElementException:
            competitors = -1

    except NoSuchElementException:  #Rarely, some job postings do not have the "Company" tab.
        headquarters = -1
        size = -1
        founded = -1
        type_of_ownership = -1
        industry = -1
        sector = -1
        revenue = -1
        competitors = -1
    return [headquarters, size, founded, type_of_ownership, industry, sector, revenue, competitors]

In [3]:
def get_all_ratings(driver):
    try:
        driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="rating"]').click()
        try:
            comp_and_benefits = driver.find_element_by_xpath('.//span[@class="ratingType" text()="Comp & Benefits"]//following-sibling::*').text
      
        except NoSuchElementException:
            comp_and_benefits = -1
    except NoSuchElementException:
        comp_and_benefits = -1
    return [comp_and_benefits]
                                                   

In [4]:
#code works for website layout of 30/5/2020

def get_jobs(job_role,  num_jobs, verbose):
    """Inputs job title and number of jobs to fetch returns jobs dataframe
    
    Parameters
    ----------
    job_role : string
        The job role string to search
        
    num_jobs : int
        The number of jobs to fetch from glassdoor
        
    Returns
    -------
    pandas dataframe
        a pandas dataframe containing number of fields
    """
    
    options = webdriver.ChromeOptions()
    #options.add_argument('headless')
    #chromedriver 83 required
    driver = webdriver.Chrome(executable_path="C:\\Users\\user\\Desktop\\glassdoor\\chromedriver.exe", options=options)
    driver.set_window_size(1120, 1000)
    url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword="' + job_role + '"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
    
    driver.get(url)
    jobs = []
    
    
    try:
        job_buttons = driver.find_elements_by_class_name("jl")  #jl for Job Listing. These are the buttons we're going to click.
    except ElementClickInterceptedException:
        pass
    
    count = 0
    for job_button in job_buttons:
        job_fields = []
        if count == num_jobs:
            break
        else:
            try:
                job_button.click()
            except ElementClickInterceptedException:
                time.sleep(random.uniform(0.5, 1.5))

            scrape_success = False
            while not scrape_success:
                
                time.sleep(random.uniform(0.5, 1))
                try:  
                    company_element = driver.find_element_by_xpath('.//div[@class="employerName"]')
                    if company_element.is_displayed():
                        company_name = company_element.text

                except NoSuchElementException:
                    company_name = -1 
                    
                time.sleep(random.uniform(0.5, 1))
                try:
                    rating_element = driver.find_element_by_xpath('.//span[@class="rating"]')
                    if rating_element.is_displayed():
                        rating = rating_element.text
                    
                    
                except NoSuchElementException:
                    rating = -1
                    
                time.sleep(random.uniform(0.5, 1))
                try:
                    location_element = driver.find_element_by_xpath('.//div[@class="location"]')
                    if location_element.is_displayed():
                        location = location_element.text
                    
                except NoSuchElementException:
                    location = -1

                time.sleep(random.uniform(0.5, 1))
                try:
                    job_title_element = driver.find_element_by_xpath('.//div[@class="title"]')
                    if job_title_element.is_displayed():
                        job_title = job_title_element.text
                            
                except NoSuchElementException:
                    job_title = -1
                scrape_success = True
                
                time.sleep(random.uniform(0.5, 1))
                try:
                    job_desc_element = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]')
                    if job_desc_element.is_displayed():
                        job_desc = job_desc_element.text
                        
                except NoSuchElementException:
                    job_desc = -1
                
                job_fields = [job_title, rating, company_name.split('\n')[0], location, job_desc]
           
            
            
            #Click on company tab and extend list with [headquarters, size, founded, type_of_ownership, industry, sector, revenue, competitors]
            job_fields.extend(get_company_info(driver))
            
            #Click rating and extend list with [comp_and_benefits, culture_and_values, career_opportunities, work_life_balance, senior_management]
            #job_fields.extend(get_all_ratings(driver))
            
            if verbose:
                print("Job Title: {}".format(job_title))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(company_name))
                #print("Location: {}".format(location))
                #print("Job Description: {}".format(job_desc))
                print()
            time.sleep(random.uniform(0.5, 1.5))
        count += 1
        jobs.append(job_fields)
    return jobs #pd.DataFrame(jobs) 

In [5]:
cols = ['Job Title', 'Overall Rating', 'Company Name' , 'Location', 'Job Description', 'Headquarters', 'Size', 'Founded', 'Ownership Type', 'Industry', 'Sector', 'Revenue', 'Competitors']

In [8]:
df_jobs = pd.DataFrame(get_jobs('data engineer', 1000, True), columns = cols)

Job Title: Senior Data Engineer
Rating: 4.6
Company Name: Turo
4.6

Job Title: Staff Data Engineer
Rating: 3.7
Company Name: Samsung Research America
3.7

Job Title: Data Engineer
Rating: 3.3
Company Name: Rocket Lawyer
3.3

Job Title: Big Data Engineer
Rating: 4.4
Company Name: SpringML
4.4

Job Title: Senior Data Engineer
Rating: 3.7
Company Name: WePay
3.7

Job Title: Senior Data Engineer
Rating: 3.0
Company Name: Cadent
3.0

Job Title: Senior Data Engineer- Airflow/ML
Rating: 4.9
Company Name: Cadent
3.0

Job Title: Senior Data Engineer
Rating: 3.7
Company Name: Upwork
3.7

Job Title: Data Engineer
Rating: 5.0
Company Name: Seen by Indeed
5.0

Job Title: Senior Data Engineer (San Francisco, CA)
Rating: 3.7
Company Name: Earnest
3.7

Job Title: Sr. BI Data Engineer III
Rating: 3.4
Company Name: Astreya
3.4

Job Title: Data Engineer
Rating: 3.9
Company Name: BlackLine
3.9

Job Title: Data Engineer
Rating: 4.5
Company Name: Demandbase
4.5

Job Title: Senior Data Engineer
Rating: 4.2
C

ElementNotInteractableException: Message: element not interactable: element has zero size
  (Session info: chrome=83.0.4103.61)


In [9]:
df_jobs

Unnamed: 0,Job Title,Overall Rating,Company Name,Location,Job Description,Headquarters,Size,Founded,Ownership Type,Industry,Sector,Revenue,Competitors
0,Staff Data Engineer,3.7,Samsung Research America,"Mountain View, CA",Title: Staff Data Engineer\n\nCompany: Samsung...,"Mountain View, CA",1001 to 5000 employees,1988,Subsidiary or Business Segment,Computer Hardware & Software,Information Technology,$100 to $500 million (SGD),"Sony, LG Electronics, Nokia"
1,Senior Data Engineer,4.6,Turo,"San Francisco, CA","At Turo, you will have the opportunity to use ...","San Francisco, CA",201 to 500 employees,2009,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,-1
