-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
131 lines (105 loc) · 3.3 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Steps:
# 1. Make a request to ebay.com and get a result page
# 2. Collect data from each detail page
# 3. Collect all links to detail pages of each product
# 4. Write scraped data to a csv file
# reference: https://www.youtube.com/watch?v=m4hEAhHHykI
import requests
from bs4 import BeautifulSoup
import csv
import os
def get_page(url):
    """Fetch *url* and parse the response HTML.

    Parameters:
        url -- absolute URL to request.

    Returns:
        BeautifulSoup of the response body on success, or None when the
        server answers with a non-2xx status. Callers must handle None.
    """
    # A timeout keeps the scraper from hanging forever on a dead host.
    response = requests.get(url, timeout=30)
    if not response.ok:
        print('Server responded: ', response.status_code)
        # Explicit: the original fell off the end and returned None implicitly.
        return None
    return BeautifulSoup(response.text, 'lxml')
def get_detail_data(soup):
    """Scrape title, price, condition, and hardware specs from an eBay
    item detail page.

    Parameters:
        soup -- BeautifulSoup of the item page, or None (every field then
                falls back to its empty default).

    Returns:
        dict with 'title', 'currency', 'price', 'condition' plus one key
        per entry in spec_keys (None when that spec is missing).
    """
    try:
        title = soup.find('h1', id='itemTitle').text.strip().replace(
            'Details about \xa0', '')
    except AttributeError:
        # soup is None or the title element is missing
        title = ''

    try:
        # Some pages use the cvipPrice span, others id='prcIsum'; the
        # original's try/except/else re-ran the identical lookup in the
        # else branch -- collapsed here to a single fallback.
        price_tag = soup.find('span', class_='notranslate vi-VR-cvipPrice')
        if price_tag is None:
            price_tag = soup.find('span', class_='notranslate', id='prcIsum')
        # Price text looks like "US $499.99" -> ("US", "$499.99").
        currency, price = price_tag.text.strip().split(' ')
    except (AttributeError, ValueError):
        currency = ''
        price = ''

    try:
        condition = soup.find('div', class_='u-flL condText').text.strip()
    except AttributeError:
        condition = ''

    features = {
        'title': title,
        'currency': currency,
        'price': price,
        'condition': condition
    }

    # 'test' looks like a leftover debug entry, but it is preserved so the
    # output schema (and CSV column count) does not change.
    spec_keys = ['Processor',
                 'Screen Size',
                 'Color',
                 'RAM Size',
                 'SSD Capacity',
                 'GPU',
                 'Processor Speed',
                 'Brand',
                 'Series',
                 'Type',
                 'Maximum Resolution',
                 'Model',
                 'Operating System',
                 'Hard Drive Capacity',
                 'Storage Type',
                 'test']

    if soup is None:
        labels, values = [], []
    else:
        # Hoisted out of the loop: the original re-ran both find_all calls
        # on every iteration (accidental O(n^2)).
        labels = soup.find_all('div', class_='ux-labels-values__labels-content')
        values = soup.find_all('div', class_='ux-labels-values__values-content')

    # Index 0 is skipped, matching the original loop; presumably the first
    # label/value pair is a section header -- TODO confirm on a live page.
    for label_tag, value_tag in zip(labels[1:], values[1:]):
        label = label_tag.text.strip().replace(':', '')
        if label in spec_keys:
            features[label] = value_tag.text.strip()

    # Guarantee a stable schema: every spec key present, None when absent.
    for key in spec_keys:
        features.setdefault(key, None)

    return features
def get_index_data(soup):
    """Collect the product-detail URLs from a search results page.

    Parameters:
        soup -- BeautifulSoup of a search results page, or None (e.g. when
                get_page failed on an HTTP error).

    Returns:
        list of href strings; empty when soup is None or no links match.
    """
    if soup is None:
        # Explicit guard replaces the original bare except:, whose only
        # real job was swallowing the AttributeError from a None soup.
        return []
    links = soup.find_all('a', class_='s-item__link')
    return [link.get('href') for link in links]
def write_csv(data, url):
    """Append one scraped record as a row of output.csv.

    Parameters:
        data -- dict of scraped fields; values are written in dict order.
        url  -- source listing URL, written as the last column.
    """
    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows); utf-8 keeps non-ASCII titles from raising on write.
    with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(list(data.values()) + [url])
def main():
    """Walk the first 100 eBay sold-laptop search pages and scrape each listing."""
    base_url = ('https://www.ebay.com/sch/i.html?_from=R40&_nkw=laptop'
                '&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1&_pgn=')
    for page_number in range(1, 101):
        listing_urls = get_index_data(get_page(base_url + str(page_number)))
        # The first result is skipped -- presumably a non-product banner
        # row in the search results; TODO confirm against a live page.
        for product_url in listing_urls[1:]:
            write_csv(get_detail_data(get_page(product_url)), product_url)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()