# 盗墓笔记小说爬虫

## 1. 爬取页面

In [4]:
import requests
import books_bs_parser as bp

In [5]:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

In [6]:
user_agent

'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

In [7]:
headers = {'User-Agent': user_agent}
r = requests.get('http://seputu.com/', headers=headers)

In [8]:
r.encoding

'UTF-8'

## 2. 解析并存储页面

### 使用BeautifulSoup解析页面以及用JSON存储数据

In [6]:
!cat books_bs_parser.py

'''
基于BeautifulSoup和JSON的HTML静态页面解析和数据存储模块
'''
from bs4 import BeautifulSoup
import json

def parse_books(html_text):
    '''从HTML文本中解析出所有图书列表'''
    soup = BeautifulSoup(html_text, 'lxml')
    books = []
    book_divs = soup.find_all('div', class_="mulu")
    
    for book_div in book_divs:
        h2 = book_div.find('h2')
        if h2 != None:
            book_title = h2.text # 获取标题
            links = book_div.find_all('a')
            chapters = parse_chapters(links)
            books.append({'book_title': book_title, 'chapters': chapters})
        
        def parse_chapters(links):
            chapters = []
            for link in links:       
                href = link.get('href')
                title = link.get('title')
                chapters.append({'href':href, 'title':title})
            return chapters
    
    return books

def save_books_json(filename, books):
    '''将图书列表保存到JSON文件中'''
    with open(filename, 'w') as f:
            j

In [10]:
# 解析图书
books = bp.parse_books(r.text)
help(bp)

Help on module books_bs_parser:

NAME
    books_bs_parser - 基于BeautifulSoup和JSON的HTML静态页面解析和数据存储模块

FUNCTIONS
    parse_books(html_text)
        从HTML文本中解析出所有图书列表
    
    save_books(filename, books)
        将图书列表保存到JSON文件中

FILE
    /Users/xiaobai/Workspace/labs/python-samples/web_spider/books_bs_parser.py




In [11]:
# 保存图书为JSON文件
bp.save_books('books.json',books)

### 使用LXML解析页面以及用CSV存储数据

In [12]:
import books_xp_parser as bxp

In [16]:
!cat books_xp_parser.py

'''
基于lxml和CSV的HTML静态页面解析和数据存储模块
'''
from lxml import etree
import re, csv

def parse_books(html_text):
    '''
    从HTML文本中解析出所有图书列表
    '''
    tree = etree.HTML(html_text)
    book_divs = tree.xpath('.//div[@class="mulu"]')
    pattern = re.compile(r'\s*\[(.*)\]\s+(.*)')
    rows = []
    
    def parse_row(book_title, links):
        href = link.xpath('./@href')[0]
        title = link.xpath('./@title')[0]
        match = re.search(pattern, title)
        if match != None:
            date = match.group(1)
            title = match.group(2)
            row = (book_title, title, href, date)
            return row
        else:
            return None
        
    for book_div in book_divs:
        div_h2 = book_div.xpath('./div/center/h2/text()')
        if (len(div_h2) > 0):
            book_title = div_h2[0]
            # print(book_title)
            links = book_div.xpath('./div/ul/li/a')
            for link in links:
                row = pars

In [13]:
# 解析图书
books = bxp.parse_books(r.text) 

In [14]:
# 保存图书
bxp.save_books('books.csv', books)

In [15]:
# 图书数据分析
import pandas as pd
df = pd.read_csv('books.csv')
df

Unnamed: 0,book_title,title,href,date
0,盗墓笔记1七星鲁王宫,七星鲁王 第一章 血尸,http://seputu.com/biji1/1.html,2012-5-19 3:3:52
1,盗墓笔记1七星鲁王宫,七星鲁王 第二章 五十年后,http://seputu.com/biji1/2.html,2012-5-19 3:5:22
2,盗墓笔记1七星鲁王宫,七星鲁王 第三章 瓜子庙,http://seputu.com/biji1/3.html,2012-5-19 3:6:15
3,盗墓笔记1七星鲁王宫,七星鲁王 第四章 尸洞,http://seputu.com/biji1/4.html,2012-5-19 3:30:57
4,盗墓笔记1七星鲁王宫,七星鲁王 第五章 水影,http://seputu.com/biji1/5.html,2012-5-19 4:52:12
5,盗墓笔记1七星鲁王宫,七星鲁王 第六章 积尸地,http://seputu.com/biji1/6.html,2012-5-19 4:53:14
6,盗墓笔记1七星鲁王宫,七星鲁王 第七章 一百多个人头,http://seputu.com/biji1/7.html,2012-5-22 14:42:52
7,盗墓笔记1七星鲁王宫,七星鲁王 第八章 山谷,http://seputu.com/biji1/8.html,2012-5-22 14:43:24
8,盗墓笔记1七星鲁王宫,七星鲁王 第九章 古墓,http://seputu.com/biji1/9.html,2012-5-22 14:44:0
9,盗墓笔记1七星鲁王宫,七星鲁王 第十章 影子,http://seputu.com/biji1/10.html,2012-5-22 14:44:35
