In [2]:
import re
import json
import scrapy
from urllib.parse import urlencode

class IndeedJobSpider(scrapy.Spider):
    """Skeleton Indeed spider: schedules the first search page per query.

    Result parsing is not implemented in this version of the class;
    ``parse_search_results`` is a placeholder callback.
    """

    name = "indeed_jobs"

    def get_indeed_search_url(self, keyword, location, offset=0):
        """Return the Indeed search URL for ``keyword`` in ``location``.

        ``offset`` is sent as the ``start`` query parameter and defaults to 0.
        """
        query_string = urlencode(
            {"q": keyword, "l": location, "filter": 0, "start": offset}
        )
        return f"https://www.indeed.com/jobs?{query_string}"

    def start_requests(self):
        """Yield one search request for every (keyword, location) pair."""
        keyword_list = ['python']
        location_list = ['texas']
        search_pairs = (
            (term, place) for term in keyword_list for place in location_list
        )
        for keyword, location in search_pairs:
            # The search context rides along in ``meta`` so the callback can
            # tell which query and offset produced the response.
            yield scrapy.Request(
                url=self.get_indeed_search_url(keyword, location),
                callback=self.parse_search_results,
                meta={'keyword': keyword, 'location': location, 'offset': 0},
            )

    def parse_search_results(self, response):
        """Placeholder callback; does nothing with the response yet."""
        return None
    


In [1]:
import re
import json
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlencode
from scrapy.utils.project import get_project_settings

class IndeedJobSpider(scrapy.Spider):
    """Crawl Indeed search results and yield one item per job posting.

    Flow: ``start_requests`` schedules the first search page for every
    (keyword, location) pair; ``parse_search_results`` schedules the
    remaining result pages plus one request per job card; ``parse_job``
    turns each job page into an output item.
    """

    name = "indeed_jobs"

    # Function to create Indeed search URL
    def get_indeed_search_url(self, keyword, location, offset=0):
        """Return the Indeed search URL for ``keyword`` in ``location``.

        ``offset`` is sent as the ``start`` query parameter (0 by default).
        """
        parameters = {"q": keyword, "l": location, "filter": 0, "start": offset}
        return "https://www.indeed.com/jobs?" + urlencode(parameters)

    # Starting point for the spider
    def start_requests(self):
        """Yield the first search-page request for every keyword/location pair."""
        keyword_list = ['python']  # Change as per your requirements
        location_list = ['texas']   # Change as per your requirements
        for keyword in keyword_list:
            for location in location_list:
                indeed_jobs_url = self.get_indeed_search_url(keyword, location)
                yield scrapy.Request(
                    url=indeed_jobs_url,
                    callback=self.parse_search_results,
                    meta={'keyword': keyword, 'location': location, 'offset': 0}
                )

    def _load_embedded_json(self, pattern, response):
        """Return the JSON object captured by ``pattern`` in the page, or None.

        ``pattern`` must contain one capturing group around the JSON text.
        A warning is logged when the pattern does not match so that a
        changed or blocked page is visible in the crawl log instead of
        silently producing no output.
        """
        match = re.search(pattern, response.text)
        if match is None:
            self.logger.warning(
                "No embedded JSON matching %r found in %s", pattern, response.url
            )
            return None
        return json.loads(match.group(1))

    # Parse search results and handle pagination
    def parse_search_results(self, response):
        """Schedule further result pages (first page only) and all job pages.

        Reads ``keyword``, ``location`` and ``offset`` from ``response.meta``.
        Yields ``scrapy.Request`` objects; yields nothing when the embedded
        job-card JSON cannot be found in the page.
        """
        location = response.meta['location']
        keyword = response.meta['keyword']
        offset = response.meta['offset']

        json_blob = self._load_embedded_json(
            r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});',
            response,
        )
        if json_blob is None:
            return

        job_cards_model = json_blob["metaData"]["mosaicProviderJobCardsModel"]

        # Paginate Through Jobs Pages (only the first page fans out, so each
        # result page is scheduled exactly once).
        if offset == 0:
            num_results = sum(
                category["jobCount"] for category in job_cards_model["tierSummaries"]
            )
            if num_results > 1000:
                num_results = 50  # Limiting the number for demonstration

            # A distinct loop variable is used on purpose: reusing ``offset``
            # here would overwrite this page's own offset and corrupt the
            # page number attached to the job requests below.
            for next_offset in range(10, num_results + 10, 10):
                yield scrapy.Request(
                    url=self.get_indeed_search_url(keyword, location, next_offset),
                    callback=self.parse_search_results,
                    meta={'keyword': keyword, 'location': location, 'offset': next_offset}
                )

        # Extract Jobs From Search Page
        # Offsets advance in steps of 10, so offset 0 -> page 1, 10 -> page 2, ...
        page = offset // 10 + 1
        for index, job in enumerate(job_cards_model['results']):
            job_key = job.get('jobkey')
            if not job_key:
                continue
            yield scrapy.Request(
                url='https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk=' + job_key,
                callback=self.parse_job,
                meta={
                    'keyword': keyword,
                    'location': location,
                    'page': page,
                    'position': index,  # zero-based index within the page
                    'jobKey': job_key,
                }
            )

    # Parse individual job pages
    def parse_job(self, response):
        """Yield one item dict for the job page in ``response``.

        Search context (keyword, location, page, position, jobKey) is read
        from ``response.meta``. Yields nothing when the ``_initialData``
        JSON cannot be found in the page.
        """
        json_blob = self._load_embedded_json(r"_initialData=(\{.+?\});", response)
        if json_blob is None:
            return

        job_info = json_blob["jobInfoWrapperModel"]["jobInfoModel"]
        header = job_info['jobInfoHeaderModel']
        yield {
            'keyword': response.meta['keyword'],
            'location': response.meta['location'],
            'page': response.meta['page'],
            'position': response.meta['position'],
            'company': header.get('companyName'),
            'jobkey': response.meta['jobKey'],
            'jobTitle': header.get('jobTitle'),
            'jobDescription': job_info['sanitizedJobDescription'],
        }

# Set up Scrapy process with configurations
# NOTE: process.start() runs the Twisted reactor, which cannot be restarted in
# the same Python process; restart the kernel before re-running this cell.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',  # Default User-Agent
    'DOWNLOAD_DELAY': 2,  # Add delay between requests
    'FEEDS': {
        'jobs_output.json': {
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'fields': None,
            'indent': 4,
        }
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,  # Randomize User-Agent
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,  # Optional: configure proxy if needed
    },
    # Proxy configuration: Scrapy has no 'HTTP_PROXY' setting, so a value
    # placed here is silently ignored. HttpProxyMiddleware takes the proxy
    # from the http_proxy / https_proxy environment variables or from
    # request.meta['proxy'] on individual requests; set one of those instead.
})

process.crawl(IndeedJobSpider)
process.start()


2024-11-19 20:32:20 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-11-19 20:32:20 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.10.0, Python 3.8.19 (default, Mar 20 2024, 19:55:45) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Windows-10-10.0.22631-SP0
2024-11-19 20:32:20 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-11-19 20:32:20 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-11-19 20:32:20 [scrapy.extensions.telnet] INFO: Telnet Password: 6be0c1d1a9d260a5
2024-11-19 20:32:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedE