meituan.py
# -*- coding: utf-8 -*-
import json

import scrapy


class MeituanSpider(scrapy.Spider):
    """Crawls job postings from Meituan's recruitment site (zhaopin.meituan.com)."""
    name = 'meituan'
    allowed_domains = ['zhaopin.meituan.com']
    start_urls = ['http://zhaopin.meituan.com/']

    # JSON API endpoints: one for paginated search results, one for a single job's detail.
    positionUrl = 'http://zhaopin.meituan.com/search'
    detailUrl = 'http://zhaopin.meituan.com/%d/jobDetail'

    # Headers sent with every API request; the search endpoint expects a JSON body.
    headers = {
        'origin': "http://zhaopin.meituan.com",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
        'content-type': "application/json",
        'accept': "*/*",
        'dnt': "1",
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.8,en;q=0.6",
    }

    # Pagination state: pageIndex/pageSize drive the search payload;
    # itemCount is filled in from the first response and bounds the crawl.
    pageIndex = 1
    pageSize = 8
    itemCount = 0
    def start_requests(self):
        # Kick off the crawl with the first POST to the search endpoint.
        return [self.next_request()]

    def parse(self, response):
        # Parse one page of search results.
        jdict = json.loads(response.body)
        if jdict['code'] != 200:
            return
        pageInfo = jdict['pageInfo']
        self.itemCount = pageInfo['itemCount']
        pageList = jdict['pageList']

        # Request the detail endpoint for every job in this batch.
        for page in pageList:
            yield scrapy.http.FormRequest(
                url=self.detailUrl % page['id'],
                method='POST',
                headers=self.headers,
                callback=self.parse_detail)

        # Keep paging until all items reported by the server have been requested.
        if self.pageIndex * self.pageSize < self.itemCount:
            self.pageIndex += 1
            yield self.next_request()
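
    # For reference, the search response is assumed (field names inferred from the
    # parsing above; values are purely illustrative) to look roughly like:
    #   {"code": 200,
    #    "pageInfo": {"pageIndex": 1, "pageSize": 8, "itemCount": 120},
    #    "pageList": [{"id": 12345, ...}, ...]}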
    def parse_detail(self, response):
        # Parse a single job-detail response and yield its data dict as the item.
        jdict = json.loads(response.body)
        if jdict['code'] == 200:
            jdata = jdict['data']
            jdata['pid'] = jdata['id']  # keep the posting id under 'pid' as well
            yield jdata
    def next_request(self):
        # Build the JSON payload for the current page and POST it to the search API.
        payload = json.dumps({
            "pageInfo": {
                "pageIndex": self.pageIndex,
                "pageSize": self.pageSize,
                "itemCount": self.itemCount,
            },
            "workLocations": [],
            "jobTypes": [],
            "jobSubTypes": [],
            "keyword": ""
        })
        self.logger.debug(payload)
        return scrapy.http.FormRequest(
            url=self.positionUrl,
            method='POST',
            headers=self.headers,
            body=payload,
            callback=self.parse)
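

# A minimal sketch for running this spider as a standalone script, assuming a
# reasonably recent Scrapy (the FEEDS setting needs Scrapy 2.1+) and no project
# settings; the output file name is hypothetical. From inside a Scrapy project
# you would instead run: scrapy crawl meituan -o jobs.json
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'FEEDS': {'meituan_jobs.json': {'format': 'json'}},
    })
    process.crawl(MeituanSpider)
    process.start()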