-
Notifications
You must be signed in to change notification settings - Fork 201
/
movie_subject.py
41 lines (34 loc) · 1.33 KB
/
movie_subject.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import random
import string
from douban.items import Subject
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Request, Rule
class MovieSubjectSpider(CrawlSpider):
    """Crawl Douban mobile movie pages and emit one ``Subject`` per movie.

    Starting from a seed movie page, the spider follows every link that
    matches ``movie/subject/<digits>?from=rec`` and produces a ``Subject``
    item carrying the movie's Douban id and the type ``"movie"``.

    Every outgoing request gets a freshly generated random ``bid`` cookie.
    NOTE(review): presumably this is to avoid being tied to a single
    visitor identity by the site -- confirm.
    """

    name = "movie_subject"
    allowed_domains = ["m.douban.com"]
    start_urls = ["https://m.douban.com/movie/subject/1292052/"]
    rules = (
        Rule(
            LinkExtractor(allow=(r"movie/subject/(\d)+\?from=rec$")),
            callback="parse_item",
            follow=True,
            process_request="cookie",
        ),
    )

    # Characters and length used when generating a ``bid`` cookie value.
    _BID_ALPHABET = string.ascii_letters + string.digits
    _BID_LENGTH = 11
    # Path segment that immediately precedes the numeric id in a movie URL.
    _SUBJECT_PATH = "/movie/subject/"

    def _new_bid(self):
        """Return a new random alphanumeric ``bid`` cookie value."""
        return "".join(
            random.choice(self._BID_ALPHABET) for _ in range(self._BID_LENGTH)
        )

    def cookie(self, request, response):
        """Prepare a request extracted by the crawl rule before scheduling.

        Sets a fresh random ``bid`` cookie on the request and rewrites its
        URL so that a ``/`` is inserted in front of the ``?`` (for example
        ``.../subject/123?from=rec`` becomes ``.../subject/123/?from=rec``).

        Args:
            request: The request built from an extracted link.
            response: The response the link was extracted from (unused).

        Returns:
            The modified copy of ``request`` with the rewritten URL.
        """
        request.cookies["bid"] = self._new_bid()
        return request.replace(url=request.url.replace("?", "/?"))

    def start_requests(self):
        """Yield one request per start URL, each with its own ``bid`` cookie."""
        for url in self.start_urls:
            yield Request(url, cookies={"bid": self._new_bid()})

    def set_douban_id(self, subject, response):
        """Store the movie id parsed from ``response.url`` on ``subject``.

        The id is taken as the path segment that follows
        ``/movie/subject/``, so the result does not depend on the URL
        scheme or on the exact trailing query string.

        Args:
            subject: Item to update; its ``"douban_id"`` key is assigned.
            response: Response whose URL identifies the movie.

        Returns:
            The same ``subject`` object, for convenience.
        """
        url = response.url
        _, marker, tail = url.partition(self._SUBJECT_PATH)
        # Keep only the segment directly after the marker: cut at the next
        # "/" and at any "?" so both ".../123/?from=rec" and ".../123?x" work.
        douban_id = tail.split("/", 1)[0].split("?", 1)[0]

        if not (marker and douban_id.isdigit()):
            # Unexpected URL shape: keep the historical fixed-offset slice
            # (35 == len("https://m.douban.com/movie/subject/"),
            # 10 == len("/?from=rec")) so existing behaviour is unchanged,
            # but make the situation visible in the logs.
            self.logger.warning(
                "Could not locate a numeric movie id in %s; "
                "falling back to fixed-offset slicing",
                url,
            )
            douban_id = url[35:-10]

        subject["douban_id"] = douban_id
        return subject

    def parse_item(self, response):
        """Build the ``Subject`` item for a movie page response.

        Args:
            response: Response for a URL matched by the crawl rule.

        Returns:
            A ``Subject`` with ``"douban_id"`` set from the response URL and
            ``"type"`` set to ``"movie"``.
        """
        subject = Subject()
        self.set_douban_id(subject, response)
        subject["type"] = "movie"
        return subject