-
Notifications
You must be signed in to change notification settings - Fork 4
/
selenium_taobao.py
81 lines (61 loc) · 2.25 KB
/
selenium_taobao.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
import pymongo
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
KEYWORD = 'iPhone'
def index_page(page):
print('正在爬取第', page, '页......')
try:
url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
browser.get(url)
if page > 1:
input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > .input.J_Input')))
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
input.clear()
input.send_keys(page)
submit.click()
wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
get_products()
except TimeoutException:
index_page(page)
def get_products():
html = browser.page_source
doc = pq(html)
items = doc('.m-itemlist .items .item').items()
for item in items:
product = {
'image': item.find('.pic .img ').attr('data-src'),
'price': item.find('.price').text(),
'deal': item.find('.deal-cnt').text(),
'title': item.find('.title').text(),
'shop': item.find('.shop').text(),
'location': item.find('.location').text(),
}
print(product)
save_to_mongodb(product)
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
def save_to_mongodb(result):
try:
if db[MONGO_COLLECTION].insert(result):
print('存储到MongoDB成功')
except Exception:
print('存储到MongoDB失败')
MAX_PAGE = 100
def main():
for i in range(1, MAX_PAGE + 1):
index_page(i)
if __name__ == '__main__':
main()
print('爬取结束')