In [2]:
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium import webdriver
from datetime import datetime
import time
import json

In [4]:
def _text_or_default(driver, xpath, default=-1):
    '''Return the text of the first element matching *xpath*, or *default* if none exists.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _company_field(driver, label):
    '''Return the value displayed next to *label* inside the "EmpBasicInfo" block, or -1.'''
    xpath = './/div[@id="EmpBasicInfo"]//span[text()="{}"]//following-sibling::*'.format(label)
    return _text_or_default(driver, xpath)


def _read_core_details(driver, job_button):
    '''Read the mandatory fields of the listing that is currently open.

    Returns a tuple (job_title, company_name, location, job_description, job_link).
    Any Selenium exception raised by a missing or stale element propagates to
    the caller, which decides whether to retry or skip the listing.
    '''
    parent = driver.find_element(By.XPATH, './/div[@data-test="employerName"]').text
    try:
        # The employer element may embed the rating as a child span; strip it
        # so that only the company name remains.
        child = driver.find_element(
            By.XPATH, './/div[@data-test="employerName"]/span[@data-test="detailRating"]'
        ).text
        company_name = parent.replace(child, "").strip()
    except NoSuchElementException:
        company_name = parent
    location = driver.find_element(By.XPATH, './/div[@data-test="location"]').text
    job_title = driver.find_element(By.XPATH, './/div[@data-test="jobTitle"]').text
    job_description = driver.find_element(
        By.XPATH, './/div[@class="jobDescriptionContent desc"]'
    ).get_attribute("innerText")
    job_link = job_button.get_attribute('href')
    return job_title, company_name, location, job_description, job_link


def get_jobs(keyword, num_jobs, verbose):
    '''Scrapes jobs from Glassdoor and writes them to a JSON file.

    Parameters:
        keyword: search phrase; it is URL-encoded before being placed in the
            query string and is also used verbatim in the output file name.
        num_jobs: stop once this many listings have been collected.
        verbose: when true, print every scraped field for debugging.

    Scraping ends early, with a message, when the "Next" button is missing,
    disabled, or cannot be clicked. Optional fields that are not found on a
    listing are stored as -1. Listings whose mandatory fields cannot be read
    (even after one retry) are skipped instead of being recorded with data
    left over from the previous listing.

    The collected listings are written to
    'glassdoor jobs <keyword> <mm dd YYYY>.json' in the working directory.
    Returns None.
    '''
    # Function-scope imports keep this cell self-contained.
    from urllib.parse import quote
    from selenium.common.exceptions import WebDriverException

    options = webdriver.ChromeOptions()

    # scraping without new chrome window popping up every time
    options.add_argument('headless')

    # No positional driver path: newer Selenium 4 releases no longer accept it,
    # and 'chromedriver' on PATH is the default lookup anyway.
    driver = webdriver.Chrome(options=options)
    jobs = []

    try:
        driver.set_window_size(1120, 1000)

        # Encode the keyword so spaces, '&', '#', ... cannot corrupt the query.
        url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + quote(keyword, safe='')
        driver.get(url)

        while len(jobs) < num_jobs:  # keep paging until enough jobs are collected

            # letting the page load
            time.sleep(4)

            # job-link for job listings
            job_buttons = driver.find_elements(By.XPATH, './/a[@data-test="job-link"]')

            for job_button in job_buttons:

                if len(jobs) >= num_jobs:
                    break

                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))

                try:
                    job_button.click()
                    time.sleep(1)
                    try:
                        # closing sign-in pop up if it appears
                        driver.find_element(By.CSS_SELECTOR, '[alt="Close"]').click()
                    except NoSuchElementException:
                        pass
                except WebDriverException:
                    # Listing could not be opened (stale, covered, ...): skip it.
                    continue

                try:
                    core = _read_core_details(driver, job_button)
                except WebDriverException:
                    # The detail pane may still be loading; wait and retry once.
                    time.sleep(5)
                    try:
                        core = _read_core_details(driver, job_button)
                    except WebDriverException:
                        # Still unreadable: skip rather than store stale values.
                        continue
                job_title, company_name, location, job_description, job_link = core

                # Optional fields: -1 marks a value that was not found.
                salary_estimate = _text_or_default(
                    driver,
                    './/div[@class="salaryTab tabSection p-std"]/div//div[contains(text(),"$")]',
                )
                rating = _text_or_default(driver, './/div[@data-test="rating-info"]//div[1]')
                size = _company_field(driver, "Size")
                founded = _company_field(driver, "Founded")
                type_of_ownership = _company_field(driver, "Type")
                industry = _company_field(driver, "Industry")
                sector = _company_field(driver, "Sector")
                revenue = _company_field(driver, "Revenue")

                # Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))
                    print("Job Link: {}".format(job_link))
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("============================================================")

                # adding job to jobs
                jobs.append({
                    "Job Title" : job_title,
                    "Salary Estimate" : salary_estimate,
                    "Job Description" : job_description,
                    "Rating" : rating,
                    "Company Name" : company_name,
                    "Location" : location,
                    "Size" : size,
                    "Founded" : founded,
                    "Type of ownership" : type_of_ownership,
                    "Industry" : industry,
                    "Sector" : sector,
                    "Revenue" : revenue,
                    "Job Link" : job_link
                })

            # Clicking on the "next page" button. A missing, disabled or
            # unclickable button ends the scrape so that the results gathered
            # so far are still written out below.
            try:
                next_button = driver.find_element(By.XPATH, './/button[@aria-label="Next"]')
                can_advance = next_button.is_enabled()
                if can_advance:
                    next_button.click()
            except WebDriverException:
                can_advance = False

            if not can_advance:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always shut the headless browser down, even when scraping fails.
        driver.quit()

    # Serialising the list of job dictionaries to a JSON file
    with open('glassdoor jobs '+keyword+' '+datetime.now().strftime('%m %d %Y')+'.json', 'w', encoding='utf-8') as file:
        json.dump(jobs, file, ensure_ascii=False, indent=4)


In [6]:
# Collect up to 3000 "data scientist" listings without per-job debug output.
get_jobs(keyword="data scientist", num_jobs=3000, verbose=False)

Progress: 0/3000
Progress: 1/3000
Progress: 2/3000
Progress: 3/3000
Progress: 4/3000
Progress: 5/3000
Progress: 6/3000
Progress: 7/3000
Progress: 8/3000
Progress: 9/3000
Progress: 10/3000
Progress: 11/3000
Progress: 12/3000
Progress: 13/3000
Progress: 14/3000
Progress: 15/3000
Progress: 16/3000
Progress: 17/3000
Progress: 18/3000
Progress: 19/3000
Progress: 20/3000
Progress: 21/3000
Progress: 22/3000
Progress: 23/3000
Progress: 24/3000
Progress: 25/3000
Progress: 26/3000
Progress: 27/3000
Progress: 28/3000
Progress: 29/3000
Progress: 30/3000
Progress: 31/3000
Progress: 32/3000
Progress: 33/3000
Progress: 34/3000
Progress: 35/3000
Progress: 36/3000
Progress: 37/3000
Progress: 38/3000
Progress: 39/3000
Progress: 40/3000
Progress: 41/3000
Progress: 42/3000
Progress: 43/3000
Progress: 44/3000
Progress: 45/3000
Progress: 46/3000
Progress: 47/3000
Progress: 48/3000
Progress: 49/3000
Progress: 50/3000
Progress: 51/3000
Progress: 52/3000
Progress: 53/3000
Progress: 54/3000
Progress: 55/3000
Pr

In [38]:
# Collect up to 1000 "machine learning" listings without per-job debug output.
get_jobs(keyword="machine learning", num_jobs=1000, verbose=False)

Progress: 0/1000
Progress: 1/1000
Progress: 2/1000
Progress: 3/1000
Progress: 4/1000
Progress: 5/1000
Progress: 6/1000
Progress: 7/1000
Progress: 8/1000
Progress: 9/1000
Progress: 10/1000
Progress: 11/1000
Progress: 12/1000
Progress: 13/1000
Progress: 14/1000
Progress: 15/1000
Progress: 16/1000
Progress: 17/1000
Progress: 18/1000
Progress: 19/1000
Progress: 20/1000
Progress: 21/1000
Progress: 22/1000
Progress: 23/1000
Progress: 24/1000
Progress: 25/1000
Progress: 26/1000
Progress: 27/1000
Progress: 28/1000
Progress: 29/1000
Progress: 30/1000
Progress: 31/1000
Progress: 32/1000
Progress: 33/1000
Progress: 34/1000
Progress: 35/1000
Progress: 36/1000
Progress: 37/1000
Progress: 38/1000
Progress: 39/1000
Progress: 40/1000
Progress: 41/1000
Progress: 42/1000
Progress: 43/1000
Progress: 44/1000
Progress: 45/1000
Progress: 46/1000
Progress: 47/1000
Progress: 48/1000
Progress: 49/1000
Progress: 50/1000
Progress: 51/1000
Progress: 52/1000
Progress: 53/1000
Progress: 54/1000
Progress: 55/1000
Pr

In [39]:
# Collect up to 1000 "artificial intelligence" listings without per-job debug output.
get_jobs(keyword="artificial intelligence", num_jobs=1000, verbose=False)

Progress: 0/1000
Progress: 1/1000
Progress: 2/1000
Progress: 3/1000
Progress: 4/1000
Progress: 5/1000
Progress: 6/1000
Progress: 7/1000
Progress: 8/1000
Progress: 9/1000
Progress: 10/1000
Progress: 11/1000
Progress: 12/1000
Progress: 13/1000
Progress: 14/1000
Progress: 15/1000
Progress: 16/1000
Progress: 17/1000
Progress: 18/1000
Progress: 19/1000
Progress: 20/1000
Progress: 21/1000
Progress: 22/1000
Progress: 23/1000
Progress: 24/1000
Progress: 25/1000
Progress: 26/1000
Progress: 27/1000
Progress: 28/1000
Progress: 29/1000
Progress: 30/1000
Progress: 31/1000
Progress: 32/1000
Progress: 33/1000
Progress: 34/1000
Progress: 35/1000
Progress: 36/1000
Progress: 37/1000
Progress: 38/1000
Progress: 39/1000
Progress: 40/1000
Progress: 41/1000
Progress: 42/1000
Progress: 43/1000
Progress: 44/1000
Progress: 45/1000
Progress: 46/1000
Progress: 47/1000
Progress: 48/1000
Progress: 49/1000
Progress: 50/1000
Progress: 51/1000
Progress: 52/1000
Progress: 53/1000
Progress: 54/1000
Progress: 55/1000
Pr

In [5]:
# Collect up to 1000 "deep learning" listings without per-job debug output.
get_jobs(keyword="deep learning", num_jobs=1000, verbose=False)

Progress: 0/1000
Progress: 1/1000
Progress: 2/1000
Progress: 3/1000
Progress: 4/1000
Progress: 5/1000
Progress: 6/1000
Progress: 7/1000
Progress: 8/1000
Progress: 9/1000
Progress: 10/1000
Progress: 11/1000
Progress: 12/1000
Progress: 13/1000
Progress: 14/1000
Progress: 15/1000
Progress: 16/1000
Progress: 17/1000
Progress: 18/1000
Progress: 19/1000
Progress: 20/1000
Progress: 21/1000
Progress: 22/1000
Progress: 23/1000
Progress: 24/1000
Progress: 25/1000
Progress: 26/1000
Progress: 27/1000
Progress: 28/1000
Progress: 29/1000
Progress: 30/1000
Progress: 31/1000
Progress: 32/1000
Progress: 33/1000
Progress: 34/1000
Progress: 35/1000
Progress: 36/1000
Progress: 37/1000
Progress: 38/1000
Progress: 39/1000
Progress: 40/1000
Progress: 41/1000
Progress: 42/1000
Progress: 43/1000
Progress: 44/1000
Progress: 45/1000
Progress: 46/1000
Progress: 47/1000
Progress: 48/1000
Progress: 49/1000
Progress: 50/1000
Progress: 51/1000
Progress: 52/1000
Progress: 53/1000
Progress: 54/1000
Progress: 55/1000
Pr