In [1]:
!pip install scrapy
!pip install twisted
!pip install service_identity
!pip install w3lib
!pip install scrapy scrapy-user-agents


Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_6

In [None]:
# NOTE(review): this whole cell is a commented-out earlier draft of the spider
# (listing-only, no detail pages, no throttling). The next cell defines the
# version that is actually run; consider deleting this cell.
# import scrapy
# from scrapy.crawler import CrawlerProcess
# import pandas as pd


# class ResumeSpider(scrapy.Spider):
#     name = 'resumes'
#     allowed_domains = ['hireitpeople.com']
#     start_urls = ['https://www.hireitpeople.com/resume-database/']

#     custom_settings = {
#         'FEEDS': {
#             'resumes.json': {'format': 'json'},
#         }
#     }

#     def parse(self, response):
#         # Extract the name and link of each top-level category
#         rows = response.css('table.hit-table tr')
#         for row in rows:
#             name = row.css('h4 a::text').get()
#             link = row.css('h4 a::attr(href)').get()

#             if name and link:
#                 # Follow the category link to handle pagination and sub-links
#                 yield response.follow(link, self.parse_category, meta={
#                     'category_name': name.strip(),
#                     'category_link': response.urljoin(link),
#                 })

#     def parse_category(self, response):
#         # Collect every sub-link on each listing page
#         category_name = response.meta['category_name']
#         category_link = response.meta['category_link']

#         sublinks = response.css('table.hit-table h4 a')
#         for sublink in sublinks:
#             sub_name = sublink.css('::text').get()
#             sub_link = sublink.css('::attr(href)').get()

#             yield {
#                 'Category': category_name,
#                 'Category Link': category_link,
#                 'Resume Name': sub_name.strip() if sub_name else 'N/A',
#                 'Resume Link': response.urljoin(sub_link) if sub_link else 'N/A'
#             }

#         # Handle the pagination logic
#         next_page = response.css('ul.pagination-custom li a::attr(href)').getall()
#         for page_link in next_page:
#             if "page" in page_link:  # make sure it is a pagination link
#                 next_page_url = response.urljoin(page_link)
#                 self.logger.info(f"Following next page: {next_page_url}")  # log the next page
#                 yield response.follow(next_page_url, self.parse_category, meta={
#                     'category_name': category_name,
#                     'category_link': category_link,
#                 })


# # Start Scrapy
# if __name__ == '__main__':
#     process = CrawlerProcess()
#     process.crawl(ResumeSpider)
#     process.start()

#     # Convert the results to a DataFrame and save them as CSV
#     df = pd.read_json('resumes.json')
#     df.to_csv('resumes.csv', index=False)
#     print(df.head())


In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd


class ResumeSpider(scrapy.Spider):
    """Crawl the hireitpeople.com resume database and export resume details.

    Request flow:

    1. ``parse`` reads the start page and follows every category link.
    2. ``parse_category`` follows each resume link on a category listing and
       advances through at most two listing pages per category.
    3. ``parse_resume_detail`` yields one item per resume page.

    Yielded items are exported to ``resumes_details.json`` through ``FEEDS``.
    """

    name = 'resumes'
    allowed_domains = ['hireitpeople.com']
    start_urls = ['https://www.hireitpeople.com/resume-database/']

    custom_settings = {
        'FEEDS': {
            'resumes_details.json': {
                'format': 'json',
                # Local-file feeds append by default; appending a second JSON
                # array on a re-run leaves a file that can no longer be parsed.
                'overwrite': True,
            },
        },
        'DOWNLOAD_DELAY': 2,  # wait 2 seconds between requests
        'CONCURRENT_REQUESTS': 1,  # keep concurrency low
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,  # initial delay of 1 second
        'AUTOTHROTTLE_MAX_DELAY': 5,  # maximum delay of 5 seconds
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,  # target concurrency of 1
        # NOTE(review): this replaces Scrapy's default list of retryable
        # status codes, so only HTTP 429 responses are retried -- confirm
        # that dropping the 5xx retries is intended.
        'RETRY_HTTP_CODES': [429],
        'RETRY_TIMES': 5,  # maximum number of retries
        # Middleware overrides are only honoured inside DOWNLOADER_MIDDLEWARES;
        # as top-level settings keys they are ignored and the random
        # User-Agent middleware never runs.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,  # random User-Agent
        },
    }

    def parse(self, response):
        """Yield one request per category link found on the start page.

        A ``table.hit-table`` row is used only when its ``h4 a`` element has
        both link text and an ``href``; other rows are skipped.
        """
        for row in response.css('table.hit-table tr'):
            name = row.css('h4 a::text').get()
            link = row.css('h4 a::attr(href)').get()
            if not (name and link):
                continue

            # Follow the category link; the category data travels in ``meta``
            # so that later callbacks can attach it to every item.
            yield response.follow(link, self.parse_category, meta={
                'category_name': name.strip(),
                'category_link': response.urljoin(link),
                'page_number': 1,  # listing pages are counted from 1
            })

    def parse_category(self, response):
        """Follow every resume link on a listing page, then at most one more page.

        Reads ``category_name``, ``category_link`` and ``page_number`` from
        ``response.meta`` and yields requests handled by
        ``parse_resume_detail`` or, for pagination, by this method again.
        """
        category_name = response.meta['category_name']
        category_link = response.meta['category_link']
        page_number = response.meta['page_number']

        # Follow each resume link on the current page to fetch its details.
        for sublink in response.css('table.hit-table h4 a'):
            sub_name = sublink.css('::text').get()
            sub_link = sublink.css('::attr(href)').get()
            if not sub_link:
                continue

            yield response.follow(sub_link, self.parse_resume_detail, meta={
                'category_name': category_name,
                'category_link': category_link,
                'resume_name': sub_name.strip() if sub_name else 'N/A',
                'resume_link': response.urljoin(sub_link),
            })

        # Only the first two listing pages of a category are crawled.
        if page_number >= 2:
            return

        # NOTE(review): the first pagination href containing "page" is assumed
        # to point at the next page -- confirm against the site's markup.
        hrefs = response.css('ul.pagination-custom li a::attr(href)').getall()
        page_link = next((href for href in hrefs if "page" in href), None)
        if page_link is not None:
            yield response.follow(page_link, self.parse_category, meta={
                'category_name': category_name,
                'category_link': category_link,
                'page_number': page_number + 1,
            })

    def parse_resume_detail(self, response):
        """Yield one item holding the category data and the resume page text.

        The text is every non-blank text node under ``div.cell-sm-9``,
        stripped and joined with single spaces; ``'N/A'`` is used when no
        text is found.
        """
        text_nodes = response.css('div.cell-sm-9 *::text').getall()
        stripped_nodes = (node.strip() for node in text_nodes)
        detailed_info_cleaned = " ".join(node for node in stripped_nodes if node)

        yield {
            'Category': response.meta['category_name'],
            'Category Link': response.meta['category_link'],
            'Resume Name': response.meta['resume_name'],
            'Resume Link': response.meta['resume_link'],
            'Detailed Info': detailed_info_cleaned if detailed_info_cleaned else 'N/A',
        }


# Start Scrapy: run ResumeSpider, then convert the exported feed to CSV.
if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(ResumeSpider)
    # NOTE(review): start() is expected to block until the crawl finishes, and
    # the underlying Twisted reactor presumably cannot be started twice in one
    # kernel -- re-running this cell likely needs a kernel restart; confirm.
    process.start()

    # Load the exported results into a DataFrame and save them as CSV.
    # 'resumes_details.json' must match the feed path in ResumeSpider.custom_settings.
    df = pd.read_json('resumes_details.json')
    df.to_csv('resumes_details.csv', index=False)
    print(df.head())
