In [13]:
import requests
import json
import threading
from lxml import etree
from queue import Queue

In [14]:
class CrawlThread(threading.Thread):
    '''
    爬虫线程类
    '''
    def __init__(self, thread_id, page_queue):
        super().__init__()
        self.thread_id = thread_id
        self.page_queue = page_queue
    
    def run(self):
        '''
        重写run()
        '''
        print(f'启动线程： {self.thread_id}')
        self.scheduler()
        print(f'结束线程： {self.thread_id}')
    
    #任务调度：
    def scheduler(self):
        while True:
            #队列为空不处理
            if self.page_queue.empty():
                break
            else:
                page = self.page_queue.get()
                print(f'下载线程{self.thread_id}, 下载页码{page}')
                url = f'https://book.douban.com/top250?start={page*25}'
                headers = {
                    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
                }
                try:
                    # downloader 下载器
                    response = requests.get(url, headers=headers)
                    dataQueue.put(response.text)
                except Exception as e:
                    print('出现异常：', e)
                    
    

In [15]:
class ParserThread(threading.Thread):
    '''
    页面内容分析线程类
    '''
    def __init__(self, thread_id, data_queue, file):
        super().__init__()
        self.thread_id = thread_id
        self.data_queue = data_queue
        self.file = file
    
    def run(self):
        print(f'启动线程{self.thread_id}')
        while not flag:
            try:
                item = self.data_queue.get(False)
                if not item:
                    pass
                self.parse_data(item)
                self.data_queue.task_done()  # get之后检测是否会阻塞
            except Exception as e:
                pass
        print(f'结束线程{self.thread_id}')
    
    #页面内容分析：
    def parse_data(self, item):
        '''
        解析网页内容的函数
        :param item:
        :return:
        '''
        try:
            html = etree.HTML(item)
            books = html.xpath('//div[@class="pl2"]')
            try:
                for book in books:
                    title = book.xpath('./a/text()')
                    link = book.xpath('./a/@href')
                    response={
                        'title': title,
                        'link': link
                    }
                    #解析方法和scrapy相同，再构造一个json
                    json.dump(response, fp=self.file, ensure_ascii=False)
            except Exception as e:
                print('Book Error.', e)
        except Exception as e:
            print('Page Error.', e)
    

In [16]:
#存放解析数据的queue
dataQueue = Queue()
#控制ParserThread标签：
flag = False

In [17]:
if __name__ == '__main__':
    # 将结果保存到一个json文件中
    output = open('books.json', 'a', encoding='utf-8')
    
    # 任务队列，存放网页的队列
    page_queue = Queue(20)
    for i in range(11):
        page_queue.put(i)
    
    # 爬虫线程
    crawl_threads = []
    crawl_name_list = ['crawl_1', 'crawl_2', 'crawl_3']
    for thread_id in crawl_name_list:
        thread = CrawlThread(thread_id, page_queue)
        thread.start()
        crawl_threads.append(thread)
    
    # 解析线程
    parse_threads = []
    parse_name_list = ['parse_1', 'parse_2', 'parse_3']
    for thread_id in parse_name_list:
        thread = ParserThread(thread_id, dataQueue, output)
        thread.start()
        parse_threads.append(thread)
        
    # 结束crawl线程
    for thread in crawl_threads:
        thread.join()
        
    # 结束parse线程
    flag = True
    for thread in parse_threads:
        thread.join()
        
    output.close()
    print('退出主线程。')

启动线程： crawl_1
下载线程crawl_1, 下载页码0
启动线程： crawl_2
下载线程crawl_2, 下载页码1
启动线程： crawl_3
下载线程crawl_3, 下载页码2
启动线程parse_1
启动线程parse_2
启动线程parse_3
下载线程crawl_1, 下载页码3
下载线程crawl_3, 下载页码4
下载线程crawl_2, 下载页码5
下载线程crawl_1, 下载页码6
下载线程crawl_3, 下载页码7
下载线程crawl_2, 下载页码8
下载线程crawl_3, 下载页码9
下载线程crawl_1, 下载页码10结束线程： crawl_2

结束线程： crawl_1
结束线程： crawl_3
结束线程parse_1结束线程parse_2

结束线程parse_3
退出主线程。
