In [2]:
import os
import csv
import requests
from bs4 import BeautifulSoup

def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f'Directory "{directory}" created successfully.')

def write_to_csv(file_path, header, data):
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)
        writer.writerows(data)

def scrape_indeed_jobs(skill, place, no_of_pages):
    main_dir = os.getcwd() + '/'
    create_directory(main_dir)

    file_name = f'{skill.title()}_{place.title()}_Jobs.csv'
    file_path = main_dir + file_name

    header = ['JOB_NAME', 'COMPANY', 'LOCATION', 'POSTED', 'APPLY_LINK']
    job_data = []

    base_url = 'https://in.indeed.com/viewjob?jk='

    headers = {
        "User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

    print(f'\nScraping in progress...\n')

    for page in range(no_of_pages):
        url = f'https://www.indeed.co.in/jobs?q={skill}&l={place}&start={page * 10}'
        response = requests.get(url, headers=headers)
        html = response.text

        soup = BeautifulSoup(html, "html.parser")
        if soup.find("html.parser") is None and soup.find("lxml") is not None:
            soup = BeautifulSoup(html, "lxml")

        job_cards = soup.find_all('div', class_='jobsearch-SerpJobCard')

        for job in job_cards:
            job_title_elem = job.find('span', id=lambda x: x and x.startswith('jobTitle-'))
            job_title = job_title_elem.text.strip() if job_title_elem else 'N/A'

            company_elem = job.find('span', class_='css-1x7z1ps')
            company = company_elem.text.strip() if company_elem else 'N/A'

            location_elem = job.find('div', class_='css-t4u72d')
            location = location_elem.text.strip() if location_elem else 'N/A'

            posted = job.find('span', class_='date').text.strip()
            job_link = base_url + job.get('data-jk')
            job_data.append([job_title, company, location, posted, job_link])

    write_to_csv(file_path, header, job_data)
    print(f'Jobs data written to "{file_name}" successfully.')

if __name__ == "__main__":
    skill = input('Enter your Skill: ').strip()
    place = input('Enter the location: ').strip()
    no_of_pages = int(input('Enter the #pages to scrape: '))

    scrape_indeed_jobs(skill, place, no_of_pages)


Enter your Skill: PYTHON
Enter the location: HYDERABAD
Enter the #pages to scrape: 3

Scraping in progress...

Jobs data written to "Python_Hyderabad_Jobs.csv" successfully.


In [2]:
pip install lxml

Collecting lxml
  Downloading lxml-5.0.1-cp311-cp311-win_amd64.whl (3.9 MB)
                                              0.0/3.9 MB ? eta -:--:--
     -                                        0.1/3.9 MB 8.3 MB/s eta 0:00:01
     ----------------                         1.6/3.9 MB 20.2 MB/s eta 0:00:01
     ------------------------------           3.0/3.9 MB 23.7 MB/s eta 0:00:01
     ---------------------------------------  3.9/3.9 MB 22.7 MB/s eta 0:00:01
     ---------------------------------------- 3.9/3.9 MB 20.9 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import httpx
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
}

response = httpx.get("https://www.indeed.com/jobs?q=python&l=Texas", headers=HEADERS)
print(response)

<Response [403 Forbidden]>


In [5]:
pip install scrapfly-sdk

Collecting scrapfly-sdk
  Downloading scrapfly_sdk-0.8.10-py3-none-any.whl (28 kB)
Collecting loguru>=0.5 (from scrapfly-sdk)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
                                              0.0/62.5 kB ? eta -:--:--
     ---------------------------------------- 62.5/62.5 kB ? eta 0:00:00
Collecting backoff>=1.10.0 (from scrapfly-sdk)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting win32-setctime>=1.0.0 (from loguru>=0.5->scrapfly-sdk)
  Downloading win32_setctime-1.1.0-py3-none-any.whl (3.6 kB)
Installing collected packages: win32-setctime, backoff, loguru, scrapfly-sdk
Successfully installed backoff-2.2.1 loguru-0.7.2 scrapfly-sdk-0.8.10 win32-setctime-1.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import requests
url = 'https://www.indeed.com/jobs?q=python&l=Texas'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
print(result.content.decode())




In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

url = 'https://www.indeed.com/jobs?q=python&l=Texas'

# Set up the Chrome browser with Selenium
chrome_path = r"C:\ProgramData\Microsoft\Windows\Start Menu\Programs\Google Chrome.lnk"
chrome_service = ChromeService(chrome_path)
driver = webdriver.Chrome(service=chrome_service)

# Load the URL using the browser
driver.get(url)

# Wait for a few seconds to ensure the JavaScript executes
driver.implicitly_wait(5)

# Get the fully-rendered HTML content
html_content = driver.page_source

# Close the browser
driver.quit()

# Now you can use BeautifulSoup to parse the HTML as before


OSError: [WinError 193] %1 is not a valid Win32 application

In [8]:
import asyncio
import json
import re
from urllib.parse import urlencode

import httpx


def parse_search_page(html: str):
    data = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', html)
    data = json.loads(data[0])
    return {
        "results": data["metaData"]["mosaicProviderJobCardsModel"]["results"],
        "meta": data["metaData"]["mosaicProviderJobCardsModel"]["tierSummaries"],
    }


async def scrape_search(client: httpx.AsyncClient, query: str, location: str, max_results: int = 50):
    def make_page_url(offset):
        parameters = {"q": query, "l": location, "filter": 0, "start": offset}
        return "https://www.indeed.com/jobs?" + urlencode(parameters)

    print(f"scraping first page of search: {query=}, {location=}")
    response_first_page = await client.get(make_page_url(0))
    data_first_page = parse_search_page(response_first_page.text)

    results = data_first_page["results"]
    total_results = sum(category["jobCount"] for category in data_first_page["meta"])
    # there's a page limit on indeed.com of 1000 results per search
    if total_results > max_results:
        total_results = max_results
    print(f"scraping remaining {total_results - 10 / 10} pages")
    other_pages = [make_page_url(offset) for offset in range(10, total_results + 10, 10)]
    for response in await asyncio.gather(*[client.get(url=url) for url in other_pages]):
        results.extend(parse_search_page(response.text))
    return results