Skip to content


Browse files Browse the repository at this point in the history
  • Loading branch information
byukid committed Nov 24, 2015
1 parent 07444c7 commit 7f7f092
Showing 1 changed file with 224 additions and 24 deletions.
248 changes: 224 additions & 24 deletions
@@ -1,24 +1,224 @@
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# import scraperwiki
# import lxml.html
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want:
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
#SQLite quirks, system takes query and wraps it in such a way that special things cannot happen
#scraperwiki does not allow indexes or require setting up cursors or committing
#also automatically opens your database

#Important Links,

#HobbyKing link mapping,
#Categories, __488__484__Cars_Parts-1_10th_Scale.html -> pc=484&idCategory=488

#Use dictionaries for core storage because you can store initial values and then,
#when updating records, you can update the dictionary and then commit it

#Dicts can update Dicts...
#Converting dicts to lists the dict gets sorted

#Create unique index does not work

import scraperwiki as scraper
import scraperwiki.sqlite as lite
import lxml.html as parser
from datetime import datetime
from urlparse import parse_qsl, urlparse
from datetime import date
import re
import Queue
import threading

# Module-level FIFO queue instance.
# NOTE(review): nothing visible in this file puts to or gets from this object;
# ThreadUrl receives its queue as a constructor argument instead.
queue = Queue.Queue()

######################### GLOBAL PARAMETERS #################################

# Base URLs used when building request URLs.
# NOTE(review): both are empty strings in this file, so any URL built from
# them has no host -- fill in the real addresses before running.
scrapesite = ""
scrapeprod = ""

# Regular-expression sources, compiled further down into CatRegex / PageRegex
# and used to classify the links found on a page.
catpagesrx = "curPage"
prodpagesrx = "(?<![0-9])__([0-9]*)__(?![0-9])"

# Seed rows for the category table.
# NOTE(review): each tuple has three values, which lines up with the three
# CategoryParser columns (ukidCategory, ukcurPage, LstDate) -- confirm.
scrapecat = [
    ('86', 1, "date"),
]

# Table schemas: each entry is [column name, SQLite type]. core_store treats
# every column whose name contains "uk" as part of the table's UNIQUE key.
# NOTE(review): the closing brackets were missing from all four lists; they
# have been restored, but entries may have been lost along with them.
CoreKeys = [
    ['cod8', 'INT'],    # Product ID
    ['Value', 'REAL'],  # Colorway and product code
]

PageParser = [
    ['ukLink', 'TEXT'],   # Product links yet to be explored
    ['LstDate', 'TEXT'],  # Last date parsed, parse once per month
]

CategoryParser = [
    ['ukidCategory', 'INT'],  # url parameter
    ['ukcurPage', 'INT'],     # Page that has been parsed
    ['LstDate', 'TEXT'],      # Last date parsed, parse once per day
]

######################## CLASSES ################################

class core_store(object):
    """Small helper around one SQLite table reached through ``lite``.

    ``table`` is the table name and ``keys`` is a list of
    ``[column name, SQLite type]`` pairs. Every column whose name contains
    ``"uk"`` becomes part of the table's UNIQUE constraint.

    ``print(...)`` is always called with a single argument so the statements
    behave the same under the Python 2 print statement and the Python 3
    print function.
    """

    def __init__(self, table, keys):
        self._table = table
        self._keys = keys

    def _create_statement(self):
        """Return the CREATE TABLE statement for this store (no database access)."""
        columns = ["%s %s" % (key[0], key[1]) for key in self._keys]
        unique = [key[0] for key in self._keys if "uk" in key[0]]
        # Only emit the constraint when there is something to constrain;
        # an empty "UNIQUE()" is not valid SQL.
        if unique:
            columns.append("UNIQUE(%s) ON CONFLICT IGNORE" % ", ".join(unique))
        return "CREATE TABLE IF NOT EXISTS %s (%s)" % (
            self._table, ", ".join(columns))

    def _row(self, data):
        """Return ``data`` as a tuple padded with '0' to one value per column."""
        row = tuple(data)
        return row + ('0',) * (len(self._keys) - len(row))

    def _insert_statement(self, verb):
        """Return ``<verb> into <table> values (?, ?, ...)`` for every column."""
        placeholders = ", ".join("?" * len(self._keys))
        return "%s into %s values (%s)" % (verb, self._table, placeholders)

    def setup(self):
        """Create the table when it does not exist yet.

        Returns 0 when the table is already present, otherwise None after
        the CREATE TABLE statement has been executed.
        """
        if lite.table_info(self._table):
            return 0

        query = self._create_statement()
        unique = ", ".join(key[0] for key in self._keys if "uk" in key[0])
        print("Unique Keys in %s: %s" % (self._table, unique))
        print(query)
        # The statement used to be built and printed only; run it so the
        # table really exists afterwards.
        lite.execute(query)

    def core_reset(self):
        """Drop the table if it exists."""
        lite.execute("DROP TABLE if exists " + self._table)

    def additem(self, data):
        """Insert one row; missing trailing columns are filled with '0'."""
        lite.execute(self._insert_statement("INSERT"), self._row(data))

    def updateitem(self, data):
        """Insert or replace one row; missing trailing columns become '0'."""
        print("Updating: %s" % (data,))
        lite.execute(self._insert_statement("INSERT or REPLACE"),
                     self._row(data))

    def getdata(self):
        """Return whatever ``lite.execute`` yields for ``select *`` on the table."""
        return lite.execute("select * from " + self._table)

    def commit(self):
        """Commit pending statements.

        NOTE(review): the original body was missing from this file; this
        assumes the scraperwiki module behind ``lite`` exposes ``commit()``
        -- confirm against the installed version.
        """
        lite.commit()

    def __close__(self):
        """Flush pending work.

        NOTE(review): the original body was missing and Python never calls a
        method of this name implicitly; committing is an assumption.
        """
        self.commit()

class ThreadUrl(threading.Thread):
    """Worker thread that keeps taking URLs from ``queue`` and opening them."""

    def __init__(self, queue):
        # The base initialiser has to run, otherwise the thread object is
        # unusable (start(), daemon, name all raise RuntimeError).
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Loop forever: take one URL from the queue, open it, mark it done."""
        # Imported here because the file's import block does not provide
        # urllib2; fall back to the Python 3 location of urlopen.
        try:
            from urllib2 import urlopen
        except ImportError:
            from urllib.request import urlopen

        while True:
            host = self.queue.get()
            try:
                response = urlopen(host)
                # Nothing visible consumes the response; close it so the
                # connection is not leaked.
                response.close()
            finally:
                # Always balance get() so that queue.join() cannot hang on
                # an item whose download failed.
                self.queue.task_done()


######################## FUNCTIONS ################################

def urltodict(url):  # convert url parameters to a dict
    """Return the query-string parameters of ``url`` as a dict.

    Keys and values are the strings produced by ``parse_qsl``; when a
    parameter occurs more than once the last occurrence wins. A URL without
    a query string yields an empty dict.

    NOTE(review): the loop body was missing from this file; this rebuilds
    the mapping the leading comment describes.
    """
    return dict(parse_qsl(urlparse(url).query))

def dicttourl(url, urldict):  # Convert a query in a dict to a useable url
    """Return ``url`` followed by ``?key=value&key=value...``.

    Every entry of ``urldict`` except ``'LstDate'`` becomes one ``key=value``
    pair, in the dict's iteration order. Values are formatted with ``str``
    semantics and are not URL-escaped.

    ``urldict`` is left untouched (it used to lose its ``'LstDate'`` entry),
    and a dict without ``'LstDate'`` no longer raises KeyError.
    """
    pairs = ["%s=%s" % (key, urldict[key])
             for key in urldict if key != 'LstDate']
    return url + "?" + "&".join(pairs)

def querytodictlist(data):
    """Turn a ``{'keys': [...], 'data': [rows]}`` result into a list of dicts.

    Each row in ``data['data']`` becomes one dict that maps the names in
    ``data['keys']`` to the values at the same positions in the row.

    The per-row dicts used to be built but never added to the result, so the
    function always returned an empty list.
    """
    keys = data['keys']
    # zip pairs names with values positionally, replacing the per-key
    # list.index() lookup.
    return [dict(zip(keys, row)) for row in data['data']]

def ParseCat(link):
    """Add the category page addressed by ``link`` to ``CatStore``.

    The ``idCategory`` and ``curPage`` query parameters of ``link`` are
    stored together with 0 as the third value.
    """
    params = urltodict(link)
    row = (params['idCategory'], params['curPage'], 0)
    CatStore.additem(row)

def ParseLink(link):
    """Add ``link`` to ``LinkStore`` with 0 as its second value."""
    row = (link, 0)
    LinkStore.additem(row)

def ParsePage(page):
    """Classify every link on ``page`` and hand it to the matching parser.

    ``page`` must provide ``iterlinks()``; index 2 of each yielded item is
    used as the URL. URLs matching ``CatRegex`` go to ``ParseCat``, URLs
    matching ``PageRegex`` go to ``ParseLink``, and anything else is skipped.
    """
    print("Parseing Page")
    for link in page.iterlinks():
        url = link[2]
        # A regex search returns None when nothing matches. The old test
        # compared type(...) with the string 'SRE_Match', which can never be
        # equal, so no link was ever dispatched.
        if CatRegex.search(url) is not None:
            ParseCat(url)
        elif PageRegex.search(url) is not None:
            ParseLink(url)

######################## CODE ################################

#1) Load scrapecat into db
#2) Load category page
#3) Look at the bottom of a page for links to more pages in category
#4) Store a list of the categories and the pages contained
#5) Page parser reads the list of known categories
#6) Page parser gets the product links from each category page
#7) Page parser gets desired information from product page
#8) Loops till complete

#a) Setup Environment
ProductStore = core_store('ProductLinks', CoreKeys)
LinkStore = core_store('ScrapedLinks', PageParser)
CatStore = core_store('CatPages', CategoryParser)

# NOTE(review): several statements in this section were truncated in this
# file (the seeding loop had no body and two lines stopped at "str(").
# The missing pieces are reconstructed from the step comments above and the
# imported names (date) -- confirm against the original script.

# Make sure the tables exist before anything is inserted.
for store in (ProductStore, LinkStore, CatStore):
    store.setup()

CatRegex = re.compile(catpagesrx)
PageRegex = re.compile(prodpagesrx)

#1) Load scrapecat into db
for item in scrapecat:
    CatStore.additem(item)
CatStore.commit()

#2) Load category pages
CategoryPages = querytodictlist(CatStore.getdata())
today = str(date.today())

for page in CategoryPages:
    # Each category page is parsed at most once per day.
    if page['LstDate'] == today:
        continue

    # Strip the "uk" marker from the column names only. The old code removed
    # every "uk" from the finished URL, which would also damage a host or
    # path containing those letters. A fresh dict is passed so that `page`
    # stays intact for the update below.
    params = dict((key[2:] if key.startswith('uk') else key, value)
                  for key, value in page.items())
    url = dicttourl(scrapesite, params)
    print("URL: %s" % url)

    html = scraper.scrape(url)
    rawpage = parser.fromstring(html)
    ParsePage(rawpage)

    CatStore.updateitem((page['ukidCategory'], page['ukcurPage'], today))
    CatStore.commit()


0 comments on commit 7f7f092

Please sign in to comment.