## Web Scraping project: Google's Job postings

This code shows how to scrape Google's job postings using Python and Selenium; the results are collected into a pandas DataFrame.

* 🏆 This can be a great free alternative to Google SERP APIs, which charge 🤑 once you go beyond 100 API calls.

* 🦾 Use the data to make your job search easier.
---

The XPaths might change over time. Use an XPath helper Chrome extension to find updated XPath selectors.

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from time import sleep
import urllib.parse
import itertools

def get_all_jobs(driver, role_name, company, max_listings=50):
    """Scrape Google job listings for a role/company search.

    Loads the Google Jobs search page for ``role_name`` and ``company``,
    scrolls the result list so more listings load, then clicks each listing
    and collects its details with ``_get_job``.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        role_name: Role search term; also stored on each result as
            ``role_name``.
        company: Company search term; also stored on each result as
            ``company`` (this overwrites the company text scraped from the
            page by ``_get_job``).
        max_listings: Maximum number of listings to open. Defaults to 50;
            ``None`` opens every listing found.

    Returns:
        A list with one dict of job details per opened listing.
    """
    # URL-encode the search terms so spaces, '&', '#', etc. in either term
    # cannot break or truncate the query string.
    search_terms = urllib.parse.quote_plus(f"{role_name} {company}")
    query = f"https://www.google.com/search?q={search_terms}&ibp=htl;jobs#htivrt=jobs"
    print(query)
    driver.get(query)
    sleep(3)  # Allow time for the results page to load

    scroll_page(driver)
    sleep(2)

    # Find all listings after scrolling # .//li[@data-ved]//div[@role='treeitem']/div/div
    listings = driver.find_elements(By.XPATH, "//div[@role='treeitem']")
    jobs_data = []
    for listing in itertools.islice(listings, max_listings):
        listing.click()
        sleep(1)  # Allow time for the job details to load
        job_data = _get_job(driver)
        job_data["role_name"] = role_name
        job_data["company"] = company
        jobs_data.append(job_data)
    return jobs_data

def scroll_page(driver, max_scrolls=50):
    """Scroll through the job list so that more listings get loaded.

    Maximizes the window, sets a 10-second implicit wait on the driver, then
    scrolls each listing card into view one at a time. Afterwards the first
    card is scrolled back into view.

    Args:
        driver: Selenium WebDriver showing the job search results.
        max_scrolls: Maximum number of listing cards to scroll to.
            Defaults to 50.
    """
    listing_xpath = "(//div[@class='PwjeAc'])[{}]"
    driver.maximize_window()
    driver.implicitly_wait(10)

    found_any = False
    for i in range(max_scrolls):
        try:
            element = driver.find_element(By.XPATH, listing_xpath.format(i + 1))
        except NoSuchElementException:
            print(f"No more listings to scroll. Total listings scrolled: {i}")
            break
        found_any = True
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        sleep(1)

    # Return to the top of the list. Skipped when no listing was found, where
    # looking up the first card would raise NoSuchElementException.
    if found_any:
        first_listing = driver.find_element(By.XPATH, listing_xpath.format(1))
        driver.execute_script("arguments[0].scrollIntoView(true);", first_listing)
    
def _get_job(driver):
    """Collect the details of the currently opened job posting.

    Args:
        driver: Selenium WebDriver with a job listing selected.

    Returns:
        A dict with the keys ``id``, ``company``, ``role``, ``location``,
        ``posted_date``, ``description`` and ``url``.
    """
    details_panel = driver.find_element(By.XPATH, '//div[@class="whazf bD1FPe"]')

    # Fields are gathered in a fixed order; the URL is read last, after the
    # description helper has had a chance to click the expand button.
    record = {"id": _get_job_id(driver)}
    record["company"] = _get_company(driver, details_panel)
    record["role"] = _get_job_role(driver, details_panel)
    record["location"] = _get_job_location(driver, details_panel)
    record["posted_date"] = _get_job_posted(driver, details_panel)
    record["description"] = _get_job_description(driver, details_panel)
    record["url"] = driver.current_url
    return record


def _get_job_id(driver):
    """Return the job id taken from the current URL's fragment.

    The id is the first ``htidocid`` value in the part of the URL after ``#``.

    Args:
        driver: Object exposing ``current_url`` (a Selenium WebDriver).

    Returns:
        The ``htidocid`` value as a string, or ``None`` when the fragment
        does not carry one (instead of raising ``KeyError``).
    """
    fragment = urllib.parse.urlparse(driver.current_url).fragment
    doc_ids = urllib.parse.parse_qs(fragment).get('htidocid')
    return doc_ids[0] if doc_ids else None

def _get_company(driver, job_container):
    """Return the visible text of the company-name element.

    Args:
        driver: Selenium WebDriver (unused here).
        job_container: Element the lookup is issued from.
    """
    # NOTE: this XPath starts with "//", so it is evaluated from the document
    # root rather than being limited to descendants of job_container.
    company_element = job_container.find_element(By.XPATH, '//div[@class="nJlQNd sMzDkb"]')
    return company_element.text

def _get_job_role(driver, job_container):
    """Return the ``innerText`` of the job title heading.

    Args:
        driver: Selenium WebDriver (unused here).
        job_container: Element the lookup is issued from.
    """
    # Positional path under the element with id "tl_ditc"; it is anchored on
    # that id, so it does not depend on job_container.
    title_xpath = '//*[@id="tl_ditc"]/div[1]/div[1]/div[1]/div[1]/div[1]/h2[1]'
    return job_container.find_element(By.XPATH, title_xpath).get_attribute('innerText')

def _get_job_location(driver, job_container):
    """Return the ``innerText`` of the job location element.

    Args:
        driver: Selenium WebDriver (unused here).
        job_container: Element the lookup is issued from.
    """
    # Positional path under the element with id "tl_ditc"; it is anchored on
    # that id, so it does not depend on job_container.
    location_xpath = '//*[@id="tl_ditc"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]'
    location_element = job_container.find_element(By.XPATH, location_xpath)
    return location_element.get_attribute('innerText')

def _get_job_posted(driver, job_container):
    """Return the ``innerText`` of the "posted" element, or ``None``.

    Args:
        driver: Selenium WebDriver (unused here).
        job_container: Element the lookup is issued from.

    Returns:
        The element's text, or ``None`` when no element matches the XPath.
    """
    # NOTE(review): the sample output contains values such as "Full-time" and
    # "206K–302K a year" in this column, which suggests this positional path
    # can land on a different field when no posted date is shown - confirm.
    posted_xpath = '//*[@id="tl_ditc"]/div[1]/div[1]/div[3]/div[1]/span[2]/span[1]'
    try:
        posted_element = job_container.find_element(By.XPATH, posted_xpath)
    except NoSuchElementException:
        return None
    return posted_element.get_attribute('innerText')

def _get_job_description(driver, job_container):
    """Return the job description text, expanding it first when possible.

    Args:
        driver: Selenium WebDriver (unused here).
        job_container: Element containing the job details.

    Returns:
        The description text, or ``None`` when the posting has no description
        element (so one incomplete posting does not abort the whole scrape).
    """
    try:
        expand_description_button = job_container.find_element(
            By.XPATH, "div/div/div/div/div/div/div[@class='CdXzFe j4kHIf']"
        )
    except NoSuchElementException:
        # No expand button: the description is used as displayed.
        pass
    else:
        expand_description_button.click()
        sleep(1)  # Allow time for the description to expand

    try:
        description_element = job_container.find_element(By.XPATH, ".//span[@class='HBvzbc']")
    except NoSuchElementException:
        return None
    return description_element.text


if __name__ == '__main__':
    # Every (role, company) combination is searched with one shared browser.
    role_names = ["AI Jobs"]
    companies = ["Google"]
    data = []
    driver = webdriver.Chrome()
    try:
        for role_name, company in itertools.product(role_names, companies):
            data.extend(get_all_jobs(driver, role_name, company))
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

    df = pd.DataFrame(data)

https://www.google.com/search?q=AI Jobs+Google&ibp=htl;jobs#htivrt=jobs
No more listings to scroll. Total listings scrolled: 46


In [4]:
df

Unnamed: 0,id,company,role,location,posted_date,description,url,role_name
0,MFCner5zB6svY4KNAAAAAA==,Google,Staff Software Engineer Google Cloud Generativ...,"Buffalo, NY",6 days ago,A company is looking for a Staff Software Engi...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
1,oK6V-OJMwuc8EG-ZAAAAAA==,Google,"Software Engineer III, Machine Learning, Googl...","New York, NY",7 days ago,Minimum qualifications:\n• Bachelor’s degree o...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
2,5VWZ55Q2KvNDlqu2AAAAAA==,Google,"Staff Software Engineer, Google Cloud Generati...","New York, NY",15 days ago,The application window will be open until at l...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
3,ZuEz7bylo9sxdTWoAAAAAA==,Google,"Technical Solutions Consultant, AI, Google Cloud","New York, NY",Full-time,Note: By applying to this position you will ha...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
4,v5NtqqnTarKvT-lWAAAAAA==,Google,"Partner Technology Manager, Data Analytics and AI","New York, NY",206K–302K a year,Minimum qualifications:\n• 10 years of experie...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
5,wCEPMkeCoz6HwczyAAAAAA==,Google,"AI Writer, Global Communications",United States,16 days ago,Note: By applying to this position you will ha...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
6,EslB-HzOHSUr7oH9AAAAAA==,Google,"Field Solutions Developer I, Generative AI, Go...","Washington, DC",Full-time,The application window will be open until at l...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
7,lqJoixWvd3zbesVsAAAAAA==,Google,"AI Consultant, Google Cloud Consulting","Reston, VA",5 days ago,The application window will be open until at l...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
8,TFLSbILZXCLt7CZ4AAAAAA==,Google,"Senior Staff Technical Solutions Consultant, G...","Pittsburgh, PA",6 days ago,The application window will be open until at l...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs
9,K158vpyUgcyNRMKQAAAAAA==,Google,"Field Solutions Developer IV, Generative AI, G...","New York, NY",13 days ago,X\n\nFor US Applcants Only\n\nThis role may al...,https://www.google.com/search?q=AI%20Jobs+Goog...,AI Jobs


### This code shows how to scrape Google's job postings using Python with a Google search API (searchapi.io)
 - You need to create an API key to run this code.

In [2]:
import os

import requests

# The API key is read from the environment so the secret is not stored in the
# notebook; set SEARCHAPI_API_KEY before running this cell.
api_key = os.environ.get("SEARCHAPI_API_KEY")
if not api_key:
    raise RuntimeError("Set the SEARCHAPI_API_KEY environment variable to your API key.")

url = "https://www.searchapi.io/api/v1/search"
params = {
  "engine": "google_jobs",
  "q": "Google Jobs, usa",
  "location": "United States",
  "api_key": api_key,
  "hl": "en",
  "gl": "us",
  "num": 10,
  "chips": "state:qaUj8fBLzExnlRjewcbP3Q==,employment_type:FULLTIME"
}

# Make up to 10 requests (start = 0, 10, ..., 90), each asking for num=10
# results, to retrieve up to 100 results
all_results = []
for i in range(0, 100, 10):
    params["start"] = i
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()  # Fail loudly on HTTP errors (bad key, quota, ...)
    results = response.json().get("jobs", [])
    if not results:
        # An empty page means there is nothing more to fetch; stop here
        # instead of spending further API calls.
        break
    all_results.extend(results)

# Print the number of results retrieved
print(len(all_results))

100


In [5]:
import pandas as pd

# Flatten the API results into one row per (job, "via" source) pair.
jobs = []

for job in all_results:
    sources = job.get('via')
    # NOTE(review): a job without a 'via' value produces no row at all; the
    # sample run shows 100 results but 99 rows - confirm this is intended.
    if not sources:
        continue
    if not isinstance(sources, list):
        sources = [sources]

    extensions = job.get('detected_extensions', {})
    jobs.extend(
        {
            'Title': job.get('title'),
            'Company Name': job.get('company_name'),
            'Salary': extensions.get('salary'),
            'Posted At': extensions.get('posted_at'),
            'Web URL Link': job.get('apply_link'),
            'Job Description': job.get('description'),
            'Company Web Results Link': job.get('company_web_results_link'),
            'Via': source,
            'Company Website': job.get('company_website'),
            'Location': job.get('location'),
        }
        for source in sources
    )

df102 = pd.DataFrame(jobs)
df102

Unnamed: 0,Title,Company Name,Salary,Posted At,Web URL Link,Job Description,Company Web Results Link,Via,Company Website,Location
0,"Software Engineer III, Privacy Sandbox",Google,,1 day ago,https://www.linkedin.com/jobs/view/software-en...,Note: By applying to this position you will ha...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"Kirkland, WA"
1,"Director, Global Communications Industry, Goog...",Google,,1 hour ago,https://www.linkedin.com/jobs/view/director-gl...,Note: By applying to this position you will ha...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"San Francisco, CA"
2,"Data Engineer, Google Customer Solutions",Google,,2 days ago,https://www.linkedin.com/jobs/view/data-engine...,This role may also be located in our Playa Vis...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"Chicago, IL"
3,"Software Development Manager II, Development P...",Google,,1 day ago,https://www.linkedin.com/jobs/view/software-de...,Note: By applying to this position you will ha...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"Seattle, WA"
4,"Strategy and Operations Lead, Go-To-Market",Google,,1 day ago,https://www.linkedin.com/jobs/view/strategy-an...,This role may also be located in our Playa Vis...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"Washington, DC"
...,...,...,...,...,...,...,...,...,...,...
95,"Group Product Manager, Machine Learning, Frame...",Google,,7 hours ago,https://www.linkedin.com/jobs/view/group-produ...,Note: By applying to this position you will ha...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"Sunnyvale, CA"
96,"Staff Software Engineer, Applied Science, Core...",Google,20–28 an hour,6 days ago,https://geebo.com/jobs-online/view/id/10217943...,Minimum\nQualifications:\nBachelor's degree or...,https://www.google.com/search?sca_esv=298519f3...,via Geebo,http://www.google.com/,"Kirkland, WA"
97,"Manager, Partnerships Strategy and Operations",Google,,2 days ago,https://www.linkedin.com/jobs/view/manager-par...,This role may also be located in our Playa Vis...,https://www.google.com/search?sca_esv=298519f3...,via LinkedIn,http://www.google.com/,"New York, NY"
98,"Small/Medium Business Program Manager, Google ...",Google LLC,142K–211K a year,5 days ago,https://www.jobserve.com/us/en/extjob/SMALL-ME...,XNote: By applying to this position you will h...,https://www.google.com/search?sca_esv=298519f3...,via JobServe,http://www.google.com/,"Chicago, IL"



**References:**

- Adapted from Scraping by AIcore 
- Adapted from Google Serp API