Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
baradwaaj committed Apr 22, 2015
0 parents commit a411042
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
39 changes: 39 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import scraperwiki
import json
import re
import urlparse
import lxml.html

def scrape_laptop(url):
    """Scrape one Flipkart laptop product page and upsert it into SQLite.

    Fetches *url*, extracts the title, the price and every row of the spec
    table, then saves a single record keyed on the product URL (so re-runs
    update rather than duplicate).
    """
    html = scraperwiki.scrape(url)
    tree = lxml.html.fromstring(html)
    title = tree.find('.//h1')
    price = tree.find('.//span[@id="fk-mprod-our-id"]')
    data = {
        'title': title.text if title is not None else '',
        'url': url,
        'price': price.text_content() if price is not None else ''
    }
    for row in tree.findall('.//table[@class="fk-specs-type2"]//tr'):
        label = row.find('th')
        value = row.find('td')
        if label is not None and value is not None and label.text is not None:
            # Ensure key is simple text: these become SQLite column names.
            key = re.sub(r'[^a-zA-Z0-9_\- ]+', '-', label.text)
            # NOTE: .text is None whenever the cell wraps its text in child
            # elements; text_content() gathers all descendant text instead,
            # so nested spec values are no longer silently stored as NULL.
            data[key] = value.text_content().strip()
    scraperwiki.sqlite.save(unique_keys=["url"], data=data)

start = 0
while True:
data = scraperwiki.scrape('http://www.flipkart.com/computers/laptops/all?response-type=json&inf-start=%d' % start)
if data['count'] <= 0:
break
tree = lxml.html.fromstring(data['html'])
for link in tree.findall('.//a[@class="prd-img"]'):
url = link.get('href', '')
if not url:
continue
parsed_url = urlparse.urlparse(url)
print parsed_url.path
scrape_laptop('http://www.flipkart.com' + parsed_url.path)
start += 20

0 comments on commit a411042

Please sign in to comment.