In [6]:
!pip install scrapy
!pip install twisted
!pip install service_identity
!pip install w3lib




In [8]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd


class ResumeSpider(scrapy.Spider):
    """Collect resume names and links from the hireitpeople.com resume database.

    The crawl starts at the database index, follows every category link found
    there, and on each category page emits one item per listed link. Links in
    the pagination bar are followed with the same callback so that further
    pages of a category are scraped as well. Items are exported to
    ``resumes.json`` through the ``FEEDS`` setting.
    """

    name = 'resumes'
    allowed_domains = ['hireitpeople.com']
    start_urls = ['https://www.hireitpeople.com/resume-database/']

    custom_settings = {
        'FEEDS': {
            # Overwrite instead of append: appending a second JSON array to an
            # existing file produces invalid JSON, which breaks any later
            # attempt to load the feed after the spider has been run twice.
            'resumes.json': {'format': 'json', 'overwrite': True},
        }
    }

    def parse(self, response):
        """Yield one request per category listed on the index page.

        The category name and its absolute URL travel with the request in
        ``meta`` so that ``parse_category`` can attach them to every item.
        """
        # Extract the name and link of each top-level category
        for row in response.css('table.hit-table tr'):
            name = row.css('h4 a::text').get()
            link = row.css('h4 a::attr(href)').get()

            # Rows without a heading link (e.g. header rows) are skipped
            if not (name and link):
                continue

            # Follow the category link; pagination and sub-links are handled
            # by parse_category
            yield response.follow(link, self.parse_category, meta={
                'category_name': name.strip(),
                'category_link': response.urljoin(link),
            })

    def parse_category(self, response):
        """Yield one item per link on a category page, then follow pagination.

        Expects ``category_name`` and ``category_link`` in ``response.meta``.
        Missing link text or href is reported as ``'N/A'`` in the item.
        """
        category_name = response.meta['category_name']
        category_link = response.meta['category_link']

        # Collect every sub-link listed on the current page
        for sublink in response.css('table.hit-table h4 a'):
            sub_name = sublink.css('::text').get()
            sub_link = sublink.css('::attr(href)').get()

            yield {
                'Category': category_name,
                'Category Link': category_link,
                'Resume Name': sub_name.strip() if sub_name else 'N/A',
                'Resume Link': response.urljoin(sub_link) if sub_link else 'N/A'
            }

        # Category details are forwarded unchanged to every further page
        page_meta = {
            'category_name': category_name,
            'category_link': category_link,
        }

        # Follow the pagination bar. Every pagination link is requested, not
        # only a "next" link; this relies on Scrapy's duplicate request
        # filtering to avoid fetching the same page more than once.
        for page_link in response.css('ul.pagination-custom li a::attr(href)').getall():
            if "page" not in page_link:  # only pagination links qualify
                continue
            next_page_url = response.urljoin(page_link)
            self.logger.info("Following next page: %s", next_page_url)
            yield response.follow(next_page_url, self.parse_category, meta=page_meta)


# Launch Scrapy
if __name__ == '__main__':
    # Run the spider; the feed file is read only after start() has returned
    crawler = CrawlerProcess()
    crawler.crawl(ResumeSpider)
    crawler.start()

    # Load the exported results into a DataFrame and save them as CSV
    resumes = pd.read_json('resumes.json')
    resumes.to_csv('resumes.csv', index=False)
    print(resumes.head())


ModuleNotFoundError: No module named 'scrapy'