-
Notifications
You must be signed in to change notification settings - Fork 1
/
douban_books.py
110 lines (87 loc) · 3.27 KB
/
douban_books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import http.cookiejar
import re
import sys
import time
import urllib
from urllib import request
def get_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    On every call a fresh cookie-aware opener carrying a browser-like
    ``User-Agent`` and a fixed ``Cookie`` header is built and installed as
    urllib's global opener.

    If the server answers with HTTP 403, or serves a page whose text contains
    ``403 Forbidden``, the request is treated as blocked by anti-crawling
    measures: the function sleeps for ten minutes and tries again, for as
    long as it takes.

    Any other failure (network error, non-403 HTTP error, body that is not
    valid UTF-8) is printed and ends the program through ``sys.exit()``.
    """
    # Function-scope import: the file header only imports urllib / urllib.request.
    from urllib.error import HTTPError

    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
                         ('Cookie', '4564564564564564565646540')]
    # Installed globally so that the urlopen() call below goes through it.
    urllib.request.install_opener(opener)
    try:
        while True:
            try:
                # The context manager closes the response even when reading
                # or decoding raises.
                with urllib.request.urlopen(url) as page:
                    html = page.read().decode("utf-8")
            except HTTPError as err:
                # urlopen() raises for 4xx/5xx statuses instead of returning
                # the body, so a real 403 has to be caught here to reach the
                # back-off below.
                err.close()
                if err.code != 403:
                    raise
                blocked = True
            else:
                blocked = "403 Forbidden" in html
            if not blocked:
                return html
            print("反爬虫了,休息10分钟...")
            time.sleep(600)
    except Exception as e:
        # Script-level boundary: report the problem and stop the crawl.
        print(e)
        sys.exit()
def get_books_info(html):
    """Collect name, author, score and URL for every book linked on a list page.

    Each book link found in *html* is fetched with ``get_html`` and parsed
    with ``get_book_info``. Processing stops at the first book for which
    neither an author nor a score could be extracted. A three-second pause
    follows every recorded book to keep the request rate low.

    Returns a list of dicts with the keys ``name``, ``author``, ``score``
    and ``url``; the name is wrapped in 《》.
    """
    link_pattern = r'href="(https://book\.douban\.com/subject/\d+/)" title="(.*?)"'
    collected = []
    for book_url, raw_title in re.findall(link_pattern, html):
        author, score = get_book_info(get_html(book_url))
        if not (author or score):
            # Nothing usable on this book page: abandon the rest of the list.
            break
        title = "《" + raw_title + "》"
        print('{"name": %s, "author": %s, "score": %s, "url": %s}' % (title, author, score, book_url))
        collected.append({"name": title, "author": author, "score": score, "url": book_url})
        # Throttle between book pages.
        time.sleep(3)
    return collected
def get_book_info(html):
    """Extract the author and the rating from one book's detail page.

    The two fields are looked up independently, so a page that lacks only
    one of them still yields the other. A field that cannot be found is
    returned as an empty string, and a short notice naming the missing
    field(s) is printed.

    Returns:
        An ``(author, score)`` tuple of strings. Input that is not text
        (for example ``None``) yields ``("", "")``.
    """
    score_pattern = r'property="v:average"> (.*?) </strong>'
    author_pattern = r'<span class="pl"> 作者</span>[\w\W]*?<a class="" href=".*?">(.*?)</a>'
    try:
        score_match = re.search(score_pattern, html)
        author_match = re.search(author_pattern, html)
    except TypeError as e:
        # Non-text input cannot be searched; report it and return blanks.
        print("评分和作者出了点小问题:")
        print(e)
        return "", ""

    score = score_match.group(1) if score_match else ""
    author = author_match.group(1) if author_match else ""

    missing = [label for label, found in (("author", author_match), ("score", score_match)) if not found]
    if missing:
        print("评分和作者出了点小问题:")
        print(", ".join(missing))
    return author, score
def _score_value(book):
    """Return the book's score as a float for sorting, or 0.0 if it is not numeric."""
    try:
        return float(book["score"])
    except (TypeError, ValueError):
        return 0.0


def main():
    """Crawl the Douban "小说" tag listing and save all books to ``douban.csv``.

    List pages are requested 20 entries at a time until a page yields no
    books. Missing authors and scores are replaced with placeholders, the
    books are sorted by score in descending order, and the result is written
    as a GBK-encoded CSV file with a header row.
    """
    # Function-scope import: csv is not imported in the file header.
    import csv

    page = 0
    books = []
    while True:
        # %E5%B0%8F%E8%AF%B4 is the percent-encoded tag "小说"; the doubled
        # percent signs survive the %-formatting below as single ones.
        url = "https://book.douban.com/tag/%%E5%%B0%%8F%%E8%%AF%%B4?start=%s&type=T" % str(page)
        html = get_html(url)
        one_page_books = get_books_info(html)
        if not one_page_books:
            print("好啦只有这些书啦~")
            break
        books.extend(one_page_books)
        page += 20

    # Some books have too few ratings to show a score; fill the gaps so every
    # row is complete.
    for b in books:
        b["author"] = b["author"] or "(佚名)"
        b["score"] = b["score"] or "0.0"

    # Highest score first; a score that is not a number sorts as 0.0 instead
    # of aborting the run after the whole crawl has finished.
    books.sort(key=_score_value, reverse=True)

    # csv.writer quotes fields containing commas or quotes, which plain string
    # formatting would turn into broken rows. lineterminator="\n" keeps the
    # line endings the same as a plain text-mode write.
    with open("douban.csv", "w", encoding="gbk", errors="ignore") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["书名", "作者", "豆瓣评分", "豆瓣链接"])
        for book in books:
            writer.writerow([book["name"], book["author"], book["score"], book["url"]])
# Start the crawl only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()