Permalink
Browse files
Added a setup script, sample config data, pipelines to modify and upload
- Loading branch information...
|
@@ -2,6 +2,8 @@ d3scraper |
|
|
---------
|
|
|
|
|
|
### Use
|
|
|
+1. Run `src/d3/d3/setup.py`
|
|
|
+1. Go into the `src/d3` directory
|
|
|
1. Run `scrapy crawl d3`
|
|
|
1. Output of the scrape will be in the MySQL server and database specified in the config.
|
|
|
|
|
|
|
|
@@ -1,5 +1,5 @@ |
|
|
class Config:
|
|
|
- mysqlserver = ''
|
|
|
- mysqldatabase = ''
|
|
|
- mysqlusername = ''
|
|
|
- mysqlpassword = ''
|
|
|
+ mysqlserver = 'localhost'
|
|
|
+ mysqldatabase = 'd3scrape'
|
|
|
+ mysqlusername = 'd3'
|
|
|
+ mysqlpassword = 'd3'
|
|
|
@@ -1,18 +1,34 @@ |
|
|
from d3.items import TypeItem
|
|
|
from d3.spiders.typespider import TypeSpider
|
|
|
+import MySQLdb
|
|
|
+from d3.config import Config
|
|
|
|
|
|
class TypeCleanerPipeline(object):
|
|
|
def process_item(self, item, spider):
|
|
|
- if item is TypeItem and spider is TypeSpider:
|
|
|
+ if isinstance(item, TypeItem) and isinstance(spider, TypeSpider):
|
|
|
item['category'] = item['category'].strip()
|
|
|
item['subcategory'] = item['subcategory'].strip()
|
|
|
item['name'] = item['name'].strip()
|
|
|
|
|
|
return item
|
|
|
|
|
|
class MySQLPipeline(object):
|
|
|
+ db = None
|
|
|
+
|
|
|
def __init__(self):
|
|
|
- pass
|
|
|
+ self.db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername,
|
|
|
+ passwd = Config.mysqlpassword, db = Config.mysqldatabase)
|
|
|
|
|
|
def process_item(self, item, spider):
|
|
|
+ if isinstance(spider, TypeSpider) and isinstance(item, TypeItem):
|
|
|
+ cursor = self.db.cursor()
|
|
|
+ cursor.execute('''
|
|
|
+ INSERT INTO foundtypes
|
|
|
+ (category, subcategory, name, url)
|
|
|
+ VALUES(%s, %s, %s, %s)
|
|
|
+ ''',
|
|
|
+ (item['category'], item['subcategory'], item['name'], item['url'])
|
|
|
+ )
|
|
|
+ self.db.commit()
|
|
|
+ cursor.close()
|
|
|
return item
|
|
|
@@ -0,0 +1,32 @@ |
|
|
+#!/usr/bin/python
|
|
|
+import sys
|
|
|
+
|
|
|
+sys.path.append('.')
|
|
|
+
|
|
|
+import MySQLdb
|
|
|
+from d3.config import Config
|
|
|
+
|
|
|
+def setup():
|
|
|
+ db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername,
|
|
|
+ passwd = Config.mysqlpassword, db = Config.mysqldatabase)
|
|
|
+ cursor = db.cursor()
|
|
|
+
|
|
|
+ cursor.execute('''
|
|
|
+ CREATE TABLE IF NOT EXISTS foundtypes (
|
|
|
+ id INT AUTO_INCREMENT,
|
|
|
+ category NVARCHAR(100),
|
|
|
+ subcategory NVARCHAR(100),
|
|
|
+ name NVARCHAR(100),
|
|
|
+ url NVARCHAR(200),
|
|
|
+ PRIMARY KEY(id)
|
|
|
+ )
|
|
|
+ ENGINE = InnoDB,
|
|
|
+ AUTO_INCREMENT = 0
|
|
|
+ ''')
|
|
|
+ db.commit()
|
|
|
+
|
|
|
+ cursor.close()
|
|
|
+ db.close()
|
|
|
+
|
|
|
+if __name__ == '__main__':
|
|
|
+ setup()
|
|
@@ -15,20 +15,40 @@ def parse(self, response): |
|
|
items = []
|
|
|
|
|
|
for category in categories:
|
|
|
- catname = category.select('h3/text()').extract()
|
|
|
+ catname = category.select('.//h3/text()').extract()
|
|
|
+ if len(catname) >= 1:
|
|
|
+ catname = catname[0]
|
|
|
+ else:
|
|
|
+ catname = ''
|
|
|
+
|
|
|
self.log('Category %s' % catname)
|
|
|
subcategories = category.select('.//div[@class="box"]')
|
|
|
self.log('Found %d subcategories' % len(subcategories))
|
|
|
|
|
|
for subcategory in subcategories:
|
|
|
subcatname = subcategory.select('.//h4/text()').extract()
|
|
|
+ if len(subcatname) >= 1:
|
|
|
+ subcatname = subcatname[0]
|
|
|
+ else:
|
|
|
+ subcatname = ''
|
|
|
+
|
|
|
self.log('Subcategory %s' % subcatname)
|
|
|
links = subcategory.select('.//a')
|
|
|
self.log('Found %d links' % len(links))
|
|
|
|
|
|
for link in links:
|
|
|
name = link.select('.//text()').extract()
|
|
|
+ if len(name) >= 1:
|
|
|
+ name = name[0]
|
|
|
+ else:
|
|
|
+ name = ''
|
|
|
+
|
|
|
url = link.select('.//@href').extract()
|
|
|
+ if len(url) >= 1:
|
|
|
+ url = url[0]
|
|
|
+ else:
|
|
|
+ url = ''
|
|
|
+
|
|
|
self.log('Link [%s]: %s' % (name, url))
|
|
|
|
|
|
item = TypeItem()
|
|
|
0 comments on commit
51c4517