# In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.common.by import By

import pandas as pd
import time
import random

# In [8]:
def get_jobs(keyword, num_jobs, verbose=False):
    """Gather up to ``num_jobs`` Glassdoor postings for ``keyword`` as a DataFrame.

    A Chrome window is driven through the search results: every listing on
    the current page is clicked, its detail pane is read, and the "next page"
    button is followed until enough postings were collected or no further
    page exists.

    Args:
        keyword: Search term, spliced verbatim into the Glassdoor URL path.
        num_jobs: Number of postings to collect before stopping.
        verbose: When true, print every scraped field for debugging.

    Returns:
        A ``pandas.DataFrame`` with one row per posting and the columns
        "Job Title", "Salary Estimate", "Job Description", "Rating",
        "Company Name", "Location", "Size", "Founded", "Type of ownership",
        "Industry", "Sector" and "Revenue". A field that is not present on
        the page holds ``-1``. If scraping fails or is interrupted part-way,
        the rows gathered so far are returned instead of raising.
    """
    # Function-scope import: base class of every Selenium error, used to
    # replace the former bare ``except`` clauses.
    from selenium.common.exceptions import WebDriverException

    # A posting whose detail pane cannot be read is retried this many times
    # (with a pause in between) and then skipped, instead of looping forever.
    max_attempts = 5
    retry_delay = 5  # seconds

    salary_xpath = ('//div[@class="css-dufhjo e1tk4kwz0"]'
                    '//span[@class="css-1hbqxax e1wijj240"]')
    overview_xpath = ('//div[@id="EmpBasicInfo"]//span[text()="{}"]'
                      '//following-sibling::*')
    # (DataFrame column, label shown in the company overview section)
    overview_fields = (
        ("Size", "Size"),
        ("Founded", "Founded"),
        ("Type of ownership", "Type"),
        ("Industry", "Industry"),
        ("Sector", "Sector"),
        ("Revenue", "Revenue"),
    )

    # initializing the webdriver
    options = webdriver.ChromeOptions()

    # uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # change the path to where chromedriver is in your home folder.
    # NOTE(review): newer Selenium 4 releases dropped `executable_path` in
    # favour of a `Service` object -- confirm against the installed version.
    driver = webdriver.Chrome(executable_path="chromedriver", options=options)

    # NOTE(review): the "KO0,14" segment looks like it encodes the keyword's
    # character span (14 == len("data-scientist")) -- confirm before relying
    # on keywords of a different length. Earlier experiments appended
    # "sortBy=date_desc" or "fromAge=14" (3, 7, 14) to the query string.
    url = 'https://www.glassdoor.com/Job/'+keyword+'-jobs-SRCH_KO0,14.htm?'
    jobs = []

    def _text_or_default(xpath, default=-1):
        """Return the text of the first element matching xpath, else default."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return default

    def _collect_basic_info():
        """Read the mandatory fields of the open posting; None if unreadable."""
        for _ in range(max_attempts):
            try:
                driver.find_element(By.XPATH, '//div[@class="css-t3xrds e856ufb2"]').click()
                return {
                    "Company Name": driver.find_element(By.XPATH, '//div[@class="css-xuk5ye e1tk4kwz5"]').text,
                    "Location": driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz1"]').text,
                    "Job Title": driver.find_element(By.XPATH, '//div[@class="css-1j389vi e1tk4kwz2"]').text,
                    "Job Description": driver.find_elements(By.XPATH, '//div[@class="jobDescriptionContent desc"]')[0].text,
                }
            except (WebDriverException, IndexError):
                # The pane may still be loading: wait, then try again.
                time.sleep(retry_delay)
        return None

    try:
        driver.set_window_size(1120, 1000)
        driver.get(url)

        while len(jobs) < num_jobs:  # still looking for new jobs

            # let the page load, change this number based on your internet speed
            time.sleep(4)

            # li for job listing, these are the buttons we're going to click.
            job_buttons = driver.find_elements(By.XPATH, '//li[contains(@class, "react-job")]')
            for job_button in job_buttons:

                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
                if len(jobs) >= num_jobs:
                    break

                job_button.click()
                time.sleep(0.5)

                # click X if the sign-up notice pops up
                try:
                    driver.find_element(By.XPATH, '//span[@alt="Close"]').click()
                except NoSuchElementException:
                    pass

                basic = _collect_basic_info()
                if basic is None:
                    print("Skipping a posting: details unavailable after {} attempts.".format(max_attempts))
                    continue

                # find_elements returns an empty list (never raises) when
                # nothing matches, so emptiness is the "not found" signal.
                salary_spans = driver.find_elements(By.XPATH, salary_xpath)
                salary_estimate = salary_spans[0].text if salary_spans else -1

                rating = _text_or_default('//span[@data-test="detailRating"]')

                # printing for debugging
                if verbose:
                    print("Job Title: {}".format(basic["Job Title"]))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(basic["Job Description"][:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(basic["Company Name"]))
                    print("Location: {}".format(basic["Location"]))

                # scrape the company overview section, all -1 if there is none
                if driver.find_elements(By.XPATH, '//div[@id="CompanyContainer"]'):
                    overview = {column: _text_or_default(overview_xpath.format(label))
                                for column, label in overview_fields}
                else:
                    overview = {column: -1 for column, _ in overview_fields}

                # printing for debugging
                if verbose:
                    print("Size: {}".format(overview["Size"]))
                    print("Founded: {}".format(overview["Founded"]))
                    print("Type of Ownership: {}".format(overview["Type of ownership"]))
                    print("Industry: {}".format(overview["Industry"]))
                    print("Sector: {}".format(overview["Sector"]))
                    print("Revenue: {}".format(overview["Revenue"]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                # add job to jobs (key order defines the DataFrame columns)
                row = {"Job Title": basic["Job Title"],
                       "Salary Estimate": salary_estimate,
                       "Job Description": basic["Job Description"],
                       "Rating": rating,
                       "Company Name": basic["Company Name"],
                       "Location": basic["Location"]}
                row.update(overview)
                jobs.append(row)

            # Enough collected: no need to load another results page.
            if len(jobs) >= num_jobs:
                break

            # click on the "next page" button
            try:
                driver.find_element(By.XPATH, '//span[@alt="next-icon"]').click()
            except NoSuchElementException:
                print("Scraping terminated. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break

    except KeyboardInterrupt:
        # Keep the best-effort contract: hand back what was gathered so far.
        print("Scraping interrupted. Needed {}, got {}.".format(num_jobs, len(jobs)))
    except Exception as exc:
        # Best effort, but no longer silent about why scraping stopped.
        print("Scraping stopped early ({}: {}). Needed {}, got {}.".format(
            type(exc).__name__, exc, num_jobs, len(jobs)))
    finally:
        # Always release the browser, otherwise each call leaks a Chrome process.
        driver.quit()

    return pd.DataFrame(jobs)

# In [1]:
def fget_jobs(url, verbose=False):
    """Gather the postings listed on one Glassdoor results page as a DataFrame.

    A headless Chrome session opens ``url``, clicks every listing found on
    that page and reads its detail pane. No pagination is performed.

    Args:
        url: Address of the Glassdoor search-results page to scrape.
        verbose: When true, print every scraped field for debugging.

    Returns:
        A ``pandas.DataFrame`` with one row per posting and the columns
        "Job Title", "Salary Estimate", "Job Description", "Rating",
        "Company Name", "Location", "Size", "Founded", "Type of ownership",
        "Industry", "Sector" and "Revenue". A field that is not present on
        the page holds ``-1``.

    Raises:
        selenium.common.exceptions.WebDriverException: If a listing cannot be
            clicked or the browser session fails; the browser is closed
            before the error propagates.
    """
    # Function-scope import: base class of every Selenium error, used to
    # replace the former bare ``except`` clauses.
    from selenium.common.exceptions import WebDriverException

    # A posting whose detail pane cannot be read is retried this many times
    # (with a pause in between) and then skipped, instead of looping forever.
    max_attempts = 5
    retry_delay = 2  # seconds

    salary_xpath = '//span[@class="css-1hbqxax e1wijj240"]'
    overview_xpath = ('//div[@id="EmpBasicInfo"]//span[text()="{}"]'
                      '//following-sibling::*')
    # (DataFrame column, label shown in the company overview section)
    overview_fields = (
        ("Size", "Size"),
        ("Founded", "Founded"),
        ("Type of ownership", "Type"),
        ("Industry", "Industry"),
        ("Sector", "Sector"),
        ("Revenue", "Revenue"),
    )

    # initializing the webdriver
    options = webdriver.ChromeOptions()
    # headless: scrape without opening a Chrome window; remove the line below
    # if you'd like to watch the browser while it works.
    options.add_argument('headless')

    # change the path to where chromedriver is in your home folder.
    # NOTE(review): newer Selenium 4 releases dropped `executable_path` in
    # favour of a `Service` object -- confirm against the installed version.
    driver = webdriver.Chrome(executable_path="chromedriver", options=options)
    jobs = []

    def _dismiss_signup_popup():
        """Click X if the sign-up notice is showing; do nothing otherwise."""
        try:
            driver.find_element(By.XPATH, '//span[@alt="Close"]').click()
        except NoSuchElementException:
            pass

    def _text_or_default(xpath, default=-1):
        """Return the text of the first element matching xpath, else default."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return default

    def _collect_basic_info():
        """Read the mandatory fields of the open posting; None if unreadable."""
        for _ in range(max_attempts):
            try:
                driver.find_element(By.XPATH, '//div[@class="css-t3xrds e856ufb2"]').click()
                return {
                    "Company Name": driver.find_element(By.XPATH, '//div[@class="css-xuk5ye e1tk4kwz5"]').text,
                    "Location": driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz1"]').text,
                    "Job Title": driver.find_element(By.XPATH, '//div[@class="css-1j389vi e1tk4kwz2"]').text,
                    "Job Description": driver.find_elements(By.XPATH, '//div[@class="jobDescriptionContent desc"]')[0].text,
                }
            except (WebDriverException, IndexError):
                # The pane may still be loading: wait, then try again.
                time.sleep(retry_delay)
        return None

    try:
        driver.set_window_size(1120, 1000)
        driver.get(url)

        # let the page load, change this number based on your internet speed
        # or, wait until the webpage is loaded, instead of hardcoding it.
        time.sleep(4)

        # li for job listing, these are the buttons we're going to click.
        job_buttons = driver.find_elements(By.XPATH, '//li[contains(@class, "react-job")]')
        total = len(job_buttons)
        for position, job_button in enumerate(job_buttons, start=1):

            # Report against the real number of listings on the page.
            print("Progress: {}".format(str(position) + "/" + str(total)))

            # The notice can appear both before and after opening a posting.
            _dismiss_signup_popup()
            job_button.click()
            # Randomised pause between postings.
            time.sleep(random.uniform(1, 5))
            _dismiss_signup_popup()

            basic = _collect_basic_info()
            if basic is None:
                print("Skipping a posting: details unavailable after {} attempts.".format(max_attempts))
                continue

            # find_elements returns an empty list (never raises) when nothing
            # matches, so emptiness is the "not found" signal. The previous
            # IndexError handler re-queried with find_element and let its
            # NoSuchElementException escape, aborting the whole scrape.
            salary_spans = driver.find_elements(By.XPATH, salary_xpath)
            salary_estimate = salary_spans[0].text if salary_spans else -1

            rating = _text_or_default('//span[@data-test="detailRating"]')

            # printing for debugging
            if verbose:
                print("Job Title: {}".format(basic["Job Title"]))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(basic["Job Description"][:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(basic["Company Name"]))
                print("Location: {}".format(basic["Location"]))

            # scrape the company overview section, all -1 if there is none
            if driver.find_elements(By.XPATH, '//div[@id="CompanyContainer"]'):
                overview = {column: _text_or_default(overview_xpath.format(label))
                            for column, label in overview_fields}
            else:
                overview = {column: -1 for column, _ in overview_fields}

            # printing for debugging
            if verbose:
                print("Size: {}".format(overview["Size"]))
                print("Founded: {}".format(overview["Founded"]))
                print("Type of Ownership: {}".format(overview["Type of ownership"]))
                print("Industry: {}".format(overview["Industry"]))
                print("Sector: {}".format(overview["Sector"]))
                print("Revenue: {}".format(overview["Revenue"]))
                print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

            # add job to jobs (key order defines the DataFrame columns)
            row = {"Job Title": basic["Job Title"],
                   "Salary Estimate": salary_estimate,
                   "Job Description": basic["Job Description"],
                   "Rating": rating,
                   "Company Name": basic["Company Name"],
                   "Location": basic["Location"]}
            row.update(overview)
            jobs.append(row)
    finally:
        # Always release the browser, otherwise each call leaks a Chrome process.
        driver.quit()

    return pd.DataFrame(jobs)