In [None]:
!pip install scrapy
!pip install twisted
!pip install service_identity
!pip install w3lib


Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd


class ResumeSpider(scrapy.Spider):
    """Collect resume links from the hireitpeople.com resume database.

    ``parse`` reads the table on the start page and follows every category
    link it finds. ``parse_category`` yields one item per link found on a
    category page and then follows that page's pagination links back into
    itself. Scraped items are exported to ``resumes.json`` through the
    ``FEEDS`` setting below.
    """

    name = 'resumes'
    allowed_domains = ['hireitpeople.com']
    start_urls = ['https://www.hireitpeople.com/resume-database/']

    custom_settings = {
        'FEEDS': {
            'resumes.json': {
                'format': 'json',
                # Local-file feeds append by default. Appending a second JSON
                # array to an existing file makes it unparseable, so replace
                # the file on every run instead.
                'overwrite': True,
            },
        }
    }

    def parse(self, response):
        """Follow each category listed on the start page.

        Yields one request per table row that has both a link text and an
        href. The stripped category name and its absolute URL travel with the
        request in ``meta`` so ``parse_category`` can attach them to items.
        """
        # Extract the name and link of each top-level category.
        rows = response.css('table.hit-table tr')
        for row in rows:
            name = row.css('h4 a::text').get()
            link = row.css('h4 a::attr(href)').get()

            # Skip rows that lack either the link text or the href.
            if name and link:
                # Follow the category link; pagination and the per-page
                # links are handled in parse_category.
                yield response.follow(link, self.parse_category, meta={
                    'category_name': name.strip(),
                    'category_link': response.urljoin(link),
                })

    def parse_category(self, response):
        """Yield the links found on a category page, then follow its pagination.

        Expects ``category_name`` and ``category_link`` in ``response.meta``.
        Each yielded item is a dict with the keys ``Category``,
        ``Category Link``, ``Resume Name`` and ``Resume Link``; a missing link
        text or href is reported as ``'N/A'``.
        """
        category_name = response.meta['category_name']
        category_link = response.meta['category_link']

        # Collect every link in the table on this page.
        sublinks = response.css('table.hit-table h4 a')
        for sublink in sublinks:
            sub_name = sublink.css('::text').get()
            sub_link = sublink.css('::attr(href)').get()

            yield {
                'Category': category_name,
                'Category Link': category_link,
                'Resume Name': sub_name.strip() if sub_name else 'N/A',
                'Resume Link': response.urljoin(sub_link) if sub_link else 'N/A'
            }

        # Follow the pagination links, passing the category context along so
        # items on later pages stay attributed to the same category.
        next_page = response.css('ul.pagination-custom li a::attr(href)').getall()
        for page_link in next_page:
            if "page" in page_link:  # only follow hrefs that look like page links
                next_page_url = response.urljoin(page_link)
                self.logger.info(f"Following next page: {next_page_url}")  # log the next page
                yield response.follow(next_page_url, self.parse_category, meta={
                    'category_name': category_name,
                    'category_link': category_link,
                })


# Launch the Scrapy crawl, then export the scraped items to CSV.
if __name__ == '__main__':
    # NOTE(review): CrawlerProcess runs on the Twisted reactor, which generally
    # cannot be restarted within one process, so re-running this cell without
    # restarting the kernel is expected to fail -- confirm.
    process = CrawlerProcess()
    process.crawl(ResumeSpider)
    process.start()

    # Load the exported feed into a DataFrame and save it as CSV.
    # 'resumes.json' must match the feed path configured in
    # ResumeSpider.custom_settings['FEEDS'].
    df = pd.read_json('resumes.json')
    df.to_csv('resumes.csv', index=False)
    print(df.head())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:scrapy.core.scraper:Scraped from <200 https://www.hireitpeople.com/resume-database/72-web-developer-resumes/page/3605>
{'Category': 'Web Developer Resumes', 'Category Link': 'https://www.hireitpeople.com/resume-database/72-web-developer-resumes', 'Resume Name': 'Network Administrator Resume Profile', 'Resume Link': 'https://www.hireitpeople.com/resume-database/72-web-developer-resumes/31483--network-administrator-resume-profile'}
2025-01-03 15:05:47 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.hireitpeople.com/resume-database/72-web-developer-resumes/page/3605>
{'Category': 'Web Developer Resumes', 'Category Link': 'https://www.hireitpeople.com/resume-database/72-web-developer-resumes', 'Resume Name': 'Network Administrator Resume Profile', 'Resume Link': 'https://www.hireitpeople.com/resume-database/72-web-developer-resumes/31483--network-administrator-resume-profile'}
DEBUG:scrapy.core.scraper:Scrape

                Category                                      Category Link  \
0  Network Admin Resumes  https://www.hireitpeople.com/resume-database/7...   
1  Network Admin Resumes  https://www.hireitpeople.com/resume-database/7...   
2  Network Admin Resumes  https://www.hireitpeople.com/resume-database/7...   
3  Network Admin Resumes  https://www.hireitpeople.com/resume-database/7...   
4  Network Admin Resumes  https://www.hireitpeople.com/resume-database/7...   

                                         Resume Name  \
0  Senior Associate -VMWARE/STORAGE Resume Tampa, FL   
1               Embedded Software Engineer Resume CA   
2                            Network Engineer Resume   
3                            Network Engineer Resume   
4  Windows System Administrator Resume Bloomingto...   

                                         Resume Link  
0  https://www.hireitpeople.com/resume-database/7...  
1  https://www.hireitpeople.com/resume-database/7...  
2  https://www.hireitpe