Permalink
Browse files

Keep the URL of the item

Add the `items` and `details` tables to the setup script in order to
prepare the DB to store the items
  • Loading branch information...
1 parent 286f44d commit d5c5451f77da894e0cf7521f4ab11ac1bf396200 @aleximplode committed Apr 27, 2012
Showing with 40 additions and 1 deletion.
  1. +2 −1 src/d3/d3/items.py
  2. +4 −0 src/d3/d3/pipelines.py
  3. +33 −0 src/d3/d3/setup.py
  4. +1 −0 src/d3/d3/spiders/itemspider.py
View
3 src/d3/d3/items.py
@@ -19,4 +19,5 @@ class ItemItem(Item):
imgwizard = Field()
stats = Field()
effects = Field()
- extras = Field()
+ extras = Field()
+ url = Field()
View
4 src/d3/d3/pipelines.py
@@ -78,6 +78,7 @@ def __init__(self):
def process_item(self, item, spider):
if isinstance(spider, TypeSpider) and isinstance(item, TypeItem):
+ # Store types to scrape in the ItemSpider
cursor = self.db.cursor()
cursor.execute('''
INSERT INTO foundtypes
@@ -88,4 +89,7 @@ def process_item(self, item, spider):
)
self.db.commit()
cursor.close()
+ elif isinstance(spider, ItemSpider) and isinstance(item, ItemItem):
+ # Store an item
+ pass
return item
View
33 src/d3/d3/setup.py
@@ -23,6 +23,39 @@ def setup():
ENGINE = InnoDB,
AUTO_INCREMENT = 0
''')
+
+ cursor.execute('''
+ CREATE TABLE IF NOT EXISTS items (
+ id INT AUTO_INCREMENT,
+ category NVARCHAR(100),
+ subcategory NVARCHAR(100),
+ name NVARCHAR(100),
+ itemtype NVARCHAR(100),
+ level INT UNSIGNED,
+ imgbarb NVARCHAR(250),
+ imgdh NVARCHAR(250),
+ imgmonk NVARCHAR(250),
+ imgwd NVARCHAR(250),
+ imgwizard NVARCHAR(250),
+ url NVARCHAR(250),
+ PRIMARY KEY(id)
+ )
+ ENGINE = InnoDB,
+ AUTO_INCREMENT = 0
+ ''')
+
+ cursor.execute('''
+ CREATE TABLE IF NOT EXISTS details (
+ id INT AUTO_INCREMENT,
+ detail NVARCHAR(2000),
+ itemid INT,
+ type ENUM('stat', 'effect', 'extra'),
+ PRIMARY KEY(id),
+ FOREIGN KEY(itemid) REFERENCES items(id)
+ )
+ ENGINE = InnoDB,
+ AUTO_INCREMENT = 0
+ ''')
db.commit()
cursor.close()
View
1 src/d3/d3/spiders/itemspider.py
@@ -44,6 +44,7 @@ def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ItemItem()
+ item['url'] = response.url
content = hxs.select('//div[@class="body-bot"]')
item['category'] = content.select('.//h2[@class="header "]/a/text()').extract()

0 comments on commit d5c5451

Please sign in to comment.