# Indeed Scraper


This notebook contains a function named scrape_jobs_indeed(), serving as a pipeline to automate the process of opening the browser, navigating to indeed.com, selecting the desired city and job title. Subsequently, it proceeds to scrape the initial page of job listings. Due to limitations imposed by the website's security measures, we are restricted to scraping only the first page of job offers. Attempting to access subsequent pages triggers the website's bot detection system, recognizing the user as non-human.

In [164]:
#!pip install selenium

In [165]:
import json
import pandas as pd
import numpy as np
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from concurrent.futures import ThreadPoolExecutor
import requests
import os
import warnings

# Silence pandas noise: ignore UserWarning and FutureWarning categories issued
# from modules whose name starts with "pandas".
# NOTE(review): the previous comment mentioned SettingWithCopyWarning - confirm
# that these two categories actually cover it.
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

# Download geckodriver from: https://github.com/mozilla/geckodriver/releases

### Initialization functions:

In [166]:

def ffx_preferences(dfolder, download=False):
    '''
    Build Firefox options whose profile carries the download preferences.

    Args:
        dfolder: Value stored in the "browser.download.dir" preference.
        download: When true, "browser.helperApps.neverAsk.saveToDisk" is
            overwritten with the PDF MIME types only and "pdfjs.disabled"
            is set to True.

    Returns:
        An Options instance whose ``profile`` is the configured profile.
    '''
    ffx_profile = webdriver.FirefoxProfile()

    # Preferences that are always applied, in the order they are written.
    base_preferences = (
        ("browser.download.dir", dfolder),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk",
         "application/msword,application/rtf, application/csv,text/csv,image/png ,image/jpeg, application/pdf, text/html,text/plain,application/octet-stream"),
    )
    for pref_name, pref_value in base_preferences:
        ffx_profile.set_preference(pref_name, pref_value)

    # Intended to let PDFs be downloaded automatically; note that this replaces
    # the longer MIME list written just above.
    if download:
        ffx_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf,application/x-pdf")
        ffx_profile.set_preference("pdfjs.disabled", True)

    ffx_options = Options()
    ffx_options.profile = ffx_profile
    return ffx_options


def start_up(link, geko_path, profile_path=None, browser=None):
    """
    Open ``link`` in Firefox, launching a new browser when none is supplied.

    Args:
        link (str): The URL to open.
        geko_path (str): Path to the Gecko driver executable.
        profile_path (str): Path of an existing Firefox profile; when falsy a
            fresh FirefoxProfile is used instead.
        browser: Optional existing webdriver instance to reuse.

    Returns:
        browser: The webdriver instance, after loading ``link`` and pausing
        for two seconds.
    """
    if not browser:
        if profile_path:
            # Reuse the on-disk profile through Firefox's --profile argument.
            launch_options = webdriver.FirefoxOptions()
            launch_options.add_argument(f'--profile={profile_path}')
        else:
            launch_options = Options()
            launch_options.profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(service=Service(geko_path), options=launch_options)

    browser.get(link)
    time.sleep(2)
    return browser
        
def check_and_click(browser, xpath, type):
    '''
    Locate an element and click it, retrying while the element is missing.

    The lookup is attempted roughly once per second and given up after three
    seconds.

    Args:
        browser: Selenium webdriver (any object exposing ``find_element``).
        xpath (str): Locator value, interpreted according to ``type``.
        type (str): Locator strategy: "xpath", "id", "css", "class" or "link"
            (the same names check_obscures accepts). Any other value falls
            back to XPath, which is how every call behaved while this
            parameter was still ignored.

    Returns:
        "Clicked!" when the element was found and clicked, False when an
        unexpected error occurred (it is printed), or None when the element
        was not found before the timeout.
    '''
    # Map the short strategy names onto Selenium's locator strings.
    strategies = {
        "xpath": 'xpath',
        "id": 'id',
        "css": 'css selector',
        "class": 'class name',
        "link": 'link text',
    }
    by = strategies.get(type, 'xpath')

    timeout_seconds = 3
    start_time = time.time()  # Record the start time
    while True:
        try:
            browser.find_element(by, xpath).click()
            return "Clicked!"  # Element found and clicked successfully
        except NoSuchElementException:
            pass  # Not in the page yet: wait and retry below
        except Exception as e:
            print(f"An error occurred: {e}")
            return False  # Other unexpected errors

        time.sleep(1)
        if time.time() - start_time >= timeout_seconds:
            print("** The element was not found in the page. **")
            return None  # Element not found within the timeout
        
def check_obscures(browser, xpath, type):
    '''
    Try to click an element and report whether the click was blocked.

    Important: the element really is clicked when it can be.

    Args:
        browser: Selenium webdriver (any object exposing ``find_element``).
        xpath (str): Locator value, interpreted according to ``type``.
        type (str): One of "xpath", "id", "css", "class" or "link".

    Returns:
        False when the click was intercepted or the element went stale (the
        exception is printed); True otherwise - including when the element
        was not found or ``type`` is not recognised, in which case nothing
        was clicked.
    '''
    locator_table = (
        ("xpath", 'xpath'),
        ("id", 'id'),
        ("css", 'css selector'),
        ("class", 'class name'),
        ("link", 'link text'),
    )
    try:
        for kind, strategy in locator_table:
            if type == kind:
                browser.find_element(strategy, xpath).click()
                break
    except (ElementClickInterceptedException, StaleElementReferenceException) as blocked:
        print(blocked)
        return False
    except NoSuchElementException:
        # A missing element is deliberately not treated as an error.
        pass
    return True

def element_exists(browser, path):
    '''
    Return True when an element matching the XPath ``path`` can be found in
    ``browser``, and False when the lookup raises NoSuchElementException.
    '''
    try:
        browser.find_element('xpath', path)
    except NoSuchElementException:
        return False
    return True

### Paths:

In [167]:
# Set the path to the geckodriver executable:
geko_path = 'C:/Users/School/Downloads/geckodriver-v0.34.0-win64/geckodriver.exe'
link = 'https://www.indeed.com/jobs?q=python&l=LA'

# If a Firefox profile is needed, set the path to the profile
# (an empty string makes start_up launch with a fresh profile):
profile_path = ''

# Pass profile_path along so that setting it above actually takes effect.
browser = start_up(link=link, geko_path=geko_path, profile_path=profile_path)


# NOTE(review): the use of this list is not visible in this section -
# presumably terms to filter out in a later analysis step; confirm.
# 'San Fransisco' keeps the same spelling as the search locations list.
common_words = [
    'university', 'Los Angeles', 'New York', 'Chicago', 'San Fransisco',
    'Austin', 'Seattle', 'Boston', 'Washington', 'Houston', 'Atlanta',
]

## Scrape jobs:

In [169]:
# Search inputs for the scraper: the locations and the job titles to query.
locations = [
    'Los Angeles',
    'New York',
    'Chicago',
    'San Fransisco',
]
# Currently left out: 'Austin', 'Seattle', 'Boston', 'Washington', 'Houston', 'Atlanta', Miami
jobs = [
    'Data Science',
    'Mechanical Engineer',
    'Sales',
    'Java Developer',
    'Business Analyst',
    'Operations Manager',
    'Python Developer',
    'DevOps Engineer',
    'Network Security Engineer',
    'Database',
    'Blockchain',
    'ETL Developer',
]

def _fill_search_field(browser, field_xpath, text):
    """Click the input at ``field_xpath``, then look it up again, clear it and type ``text``."""
    browser.find_element(by='xpath', value=field_xpath).click()
    field = browser.find_element(by='xpath', value=field_xpath)
    field.clear()
    field.send_keys(text)


def _text_or_nan(finder, xpath, line=None):
    """Return the text of the element at ``xpath`` (only line ``line`` of it when multi-line), or NaN on failure."""
    try:
        text = finder.find_element('xpath', xpath).text
        if line is not None and '\n' in text:
            text = text.split('\n')[line]
        return text
    except Exception:
        # Best effort: a missing or stale field becomes NaN instead of aborting the run.
        return np.nan


def scrape_jobs_indeed(job_list, location_list, geko_path, link):
    """
    Scrape the first results page of Indeed for every job/location pair.

    A new browser is started (and always closed again) for each combination.
    Each result item is clicked and its title, company, location and
    description are read; fields that cannot be read are stored as NaN.

    Args:
        job_list: Iterable of job titles to search for.
        location_list: Iterable of locations to search in.
        geko_path (str): Path to the Gecko driver executable.
        link (str): URL opened before each search is typed in.

    Returns:
        pandas.DataFrame: One row per result item, with the columns
        'Job title', 'Company', 'Description', 'Job_Title_Searched',
        'Location_Searched' and 'Location'.
    """
    columns = ['Job title', 'Company', 'Description', 'Job_Title_Searched', 'Location_Searched', 'Location']
    title_xpath = '/html/body/main/div/div[2]/div/div[5]/div/div[2]/div/div/div[2]/div[2]/div[1]/div/div[1]/div[1]/h2/span'
    company_xpath = '/html/body/main/div/div[2]/div/div[5]/div/div[2]/div/div/div[2]/div[2]/div[1]/div/div[1]/div[2]/div/div/div/div[1]/div[1]/span/a'

    # Rows are collected in a list and turned into a DataFrame once at the end,
    # instead of growing a DataFrame with pd.concat on every listing.
    rows = []
    for j in job_list:
        for i in location_list:
            # Start browser
            browser = start_up(link=link, geko_path=geko_path)
            try:
                # Type the job title and the location into the two search fields
                _fill_search_field(browser, '//input[@id="text-input-what"]', j)
                _fill_search_field(browser, '//input[@id="text-input-where"]', i)

                # Click on the search button
                browser.find_element(by='xpath', value='//button[@class="yosegi-InlineWhatWhere-primaryButton"]').click()
                time.sleep(2)

                print("Location: ", i)
                # One container per item of the results list
                containers = browser.find_elements(By.XPATH, '//li[@class="css-5lfssm eu4oa1w0"]')
                for job in containers:
                    try:
                        job.click()
                        time.sleep(1)
                    except Exception:
                        # Items that cannot be clicked are still recorded with
                        # whatever fields can be read.
                        pass
                    rows.append({
                        'Job title': _text_or_nan(job, title_xpath, line=0),
                        'Company': _text_or_nan(job, company_xpath),
                        'Description': _text_or_nan(browser, '//*[@id="jobDescriptionText"]'),
                        'Location': _text_or_nan(job, '//*[@id="jobLocationSectionWrapper"]', line=1),
                        'Job_Title_Searched': j,
                        'Location_Searched': i,
                    })
                time.sleep(1)
            finally:
                # Close the browser even when a search step fails, so no
                # Firefox process is left behind.
                browser.quit()
    return pd.DataFrame(rows, columns=columns)

# Run the scraper over the configured job titles and locations.
data2 = scrape_jobs_indeed(
    job_list=jobs,
    location_list=locations,
    geko_path=geko_path,
    link=link,
)

Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York
Location:  Chicago
Location:  San Fransisco
Location:  Los Angeles
Location:  New York


### Save the scraped data:

In [170]:
# Save the scraped listings to jobs.csv (relative path), without writing the
# DataFrame index as a column.
data2.to_csv('jobs.csv', index=False)

In [171]:
# Preview the first rows of the scraped data.
data2.head()

Unnamed: 0,Job title,Company,Description,Job_Title_Searched,Location_Searched,Location
0,Data Analyst,UCLA,About the California Policy Lab\nThe Californi...,Data Science,Los Angeles,"Los Angeles, CA"
1,Research Data Scientist,CEDARS-SINAI,Job Description\nThe Research Data Scientist p...,Data Science,Los Angeles,"Los Angeles, CA"
2,Data Scientist,,Expatiate Communications is a boutique managem...,Data Science,Los Angeles,"Pasadena, CA"
3,DATA SCIENTIST,Los Angeles County Department of Human Resources,EXAM NUMBER:\nPH1763B\n\nTYPE OF RECRUITMENT:\...,Data Science,Los Angeles,"Los Angeles County, CA"
4,Predictive Data Analyst - CalAIM,Heluna Health,Salary: $42.59 - $57.40 Per Hour\nSUMMARY\nCom...,Data Science,Los Angeles,"Los Angeles, CA 90014"
