Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Added a setup script, sample config data, and pipelines to modify and upload data
  • Loading branch information...
commit 51c45172d7f251740cb3fe70202a13b46dcdfaec 1 parent 31e66e0
@aleximplode authored
View
2  README.markdown
@@ -2,6 +2,8 @@ d3scraper
---------
### Use
+1. Run `src/d3/d3/setup.py`
+1. Go into the `src/d3` directory
1. Run `scrapy crawl d3`
1. Output of the scrape will be in the MySQL server and database specified in the config.
View
8 src/d3/d3/config.py
@@ -1,5 +1,5 @@
class Config:
- mysqlserver = ''
- mysqldatabase = ''
- mysqlusername = ''
- mysqlpassword = ''
+ mysqlserver = 'localhost'
+ mysqldatabase = 'd3scrape'
+ mysqlusername = 'd3'
+ mysqlpassword = 'd3'
View
20 src/d3/d3/pipelines.py
@@ -1,9 +1,11 @@
from d3.items import TypeItem
from d3.spiders.typespider import TypeSpider
+import MySQLdb
+from d3.config import Config
class TypeCleanerPipeline(object):
def process_item(self, item, spider):
- if item is TypeItem and spider is TypeSpider:
+ if isinstance(item, TypeItem) and isinstance(spider, TypeSpider):
item['category'] = item['category'].strip()
item['subcategory'] = item['subcategory'].strip()
item['name'] = item['name'].strip()
@@ -11,8 +13,22 @@ def process_item(self, item, spider):
return item
class MySQLPipeline(object):
+ db = None
+
def __init__(self):
- pass
+ self.db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername,
+ passwd = Config.mysqlpassword, db = Config.mysqldatabase)
def process_item(self, item, spider):
+ if isinstance(spider, TypeSpider) and isinstance(item, TypeItem):
+ cursor = self.db.cursor()
+ cursor.execute('''
+ INSERT INTO foundtypes
+ (category, subcategory, name, url)
+ VALUES(%s, %s, %s, %s)
+ ''',
+ (item['category'], item['subcategory'], item['name'], item['url'])
+ )
+ self.db.commit()
+ cursor.close()
return item
View
32 src/d3/d3/setup.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+
+sys.path.append('.')
+
+import MySQLdb
+from d3.config import Config
+
+def setup():
+ db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername,
+ passwd = Config.mysqlpassword, db = Config.mysqldatabase)
+ cursor = db.cursor()
+
+ cursor.execute('''
+ CREATE TABLE IF NOT EXISTS foundtypes (
+ id INT AUTO_INCREMENT,
+ category NVARCHAR(100),
+ subcategory NVARCHAR(100),
+ name NVARCHAR(100),
+ url NVARCHAR(200),
+ PRIMARY KEY(id)
+ )
+ ENGINE = InnoDB,
+ AUTO_INCREMENT = 0
+ ''')
+ db.commit()
+
+ cursor.close()
+ db.close()
+
+if __name__ == '__main__':
+ setup()
View
22 src/d3/d3/spiders/typespider.py
@@ -15,20 +15,40 @@ def parse(self, response):
items = []
for category in categories:
- catname = category.select('h3/text()').extract()
+ catname = category.select('.//h3/text()').extract()
+ if len(catname) >= 1:
+ catname = catname[0]
+ else:
+ catname = ''
+
self.log('Category %s' % catname)
subcategories = category.select('.//div[@class="box"]')
self.log('Found %d subcategories' % len(subcategories))
for subcategory in subcategories:
subcatname = subcategory.select('.//h4/text()').extract()
+ if len(subcatname) >= 1:
+ subcatname = subcatname[0]
+ else:
+ subcatname = ''
+
self.log('Subcategory %s' % subcatname)
links = subcategory.select('.//a')
self.log('Found %d links' % len(links))
for link in links:
name = link.select('.//text()').extract()
+ if len(name) >= 1:
+ name = name[0]
+ else:
+ name = ''
+
url = link.select('.//@href').extract()
+ if len(url) >= 1:
+ url = url[0]
+ else:
+ url = ''
+
self.log('Link [%s]: %s' % (name, url))
item = TypeItem()
Please sign in to comment.
Something went wrong with that request. Please try again.