Commit

Update scraper.py
byukid committed Nov 24, 2015
1 parent 07444c7 commit 7f7f092
Showing 1 changed file with 224 additions and 24 deletions.
248 changes: 224 additions & 24 deletions scraper.py
@@ -1,24 +1,224 @@
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
#SQLite quirks: the system wraps each query in such a way that special statements cannot run
#scraperwiki does not allow creating indexes, and it does not require setting up cursors or committing
#it also opens your database automatically

#Important links:
#http://www.hobbyking.com/hobbyking/store/uh_viewItem.asp?idProduct=$productId

#HobbyKing link mapping:
#Categories: __488__484__Cars_Parts-1_10th_Scale.html -> pc=484&idCategory=488
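#A minimal sketch of that mapping (hypothetical snippet, not used by the scraper below):
# import re
# m = re.match(r"__([0-9]+)__([0-9]+)__", "__488__484__Cars_Parts-1_10th_Scale.html")
# if m:
#     params = {'idCategory': m.group(1), 'pc': m.group(2)}  # -> {'idCategory': '488', 'pc': '484'}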

#Use dictionaries for core storage: load a record's initial values into a dict,
#update the dict as the scrape progresses, then commit the whole record back

#One dict can be merged into another with dict.update();
#converting a dict to a list yields only its keys, and their order is not guaranteed
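
#A minimal sketch of that record-update pattern (hypothetical values; CatStore is created further down):
# record = {'ukidCategory': 488, 'ukcurPage': 1, 'LstDate': '0'}     # initial values
# record.update({'LstDate': str(date.today())})                      # one dict updating another
# CatStore.updateitem((record['ukidCategory'], record['ukcurPage'], record['LstDate']))
# CatStore.commit()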

#CREATE UNIQUE INDEX does not work; see the constraint-based workaround sketched below
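#  e.g. instead of:   CREATE UNIQUE INDEX cat_idx ON CatPages (ukidCategory, ukcurPage)
#  core_store.setup() below declares uniqueness inside CREATE TABLE:
#                      UNIQUE(ukidCategory, ukcurPage) ON CONFLICT IGNORE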


import scraperwiki as scraper
import scraperwiki.sqlite as lite
import lxml.html as parser
from datetime import datetime, date
from urlparse import parse_qsl, urlparse
import urllib2
import re
import Queue
import threading

queue = Queue.Queue()


######################### GLOBAL PARAMETERS #################################

scrapesite = "http://secure.social.yoox.it/api/Items.API/ADIDASBY_US/ItemsWithAvailability/get.json"
scrapeprod = "http://secure.social.yoox.it/api/Items.API/ADIDASBY_US/ItemsWithAvailability/get.json?cod8="

catpagesrx = "curPage"
prodpagesrx = "(?<![0-9])__([0-9]*)__(?![0-9])"
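#catpagesrx flags any link whose URL contains "curPage" (a further page of the same category);
#prodpagesrx captures a lone __<digits>__ token with no digit on either side, e.g. a product
#page named like __9171__Some_Product.html (hypothetical example of the expected format)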

scrapecat = [ #ukidCategory, ukcurPage, LstDate ("date" placeholder forces a first scrape)
    ('86', 1, "date")
]

CoreKeys = [
['cod8' , 'INT' ], #Product ID
['Value' , 'REAL'], #Colorway and product code
]

PageParser = [
['ukLink' , 'TEXT'], #Product links yet to be explored
['LstDate' , 'TEXT'] #Last date parsed, parse once per month
]

CategoryParser = [
['ukidCategory', 'INT' ], #url parameter
['ukcurPage' , 'INT' ], #Page that has been parsed
['LstDate' , 'TEXT'] #Last date parsed, parse once per day
]
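
#For reference, core_store.setup() below turns each key list above into a CREATE TABLE statement,
#folding every column whose name contains "uk" into a combined unique key. For CategoryParser the
#generated statement comes out roughly as:
#  CREATE TABLE IF NOT EXISTS CatPages ( ukidCategory INT, ukcurPage INT, LstDate TEXT,
#    UNIQUE( ukidCategory, ukcurPage) ON CONFLICT IGNORE)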

######################## CLASSES ################################


class core_store(object):
def __init__(self,table,keys):
self._table = table
self._keys = keys
#self.core_reset()
self.setup()


def setup(self):
if len(lite.table_info(self._table)) > 0: return 0

query = "CREATE TABLE IF NOT EXISTS "+self._table+" ("

for key in self._keys:
query = query+" "+key[0]+" "+key[1]

unique = ""
for key in self._keys:
if "uk" in key[0]:
unique = unique+" "+key[0]+","
print "Unique Keys in",self._table+":",unique

query = query+" UNIQUE("+unique+") ON CONFLICT IGNORE)"
query = query.replace("INT ","INT, ")
query = query.replace("REAL ","REAL, ")
query = query.replace("TEXT ","TEXT, ")
query = query.replace(",)", ")")

print query
lite.execute(query)

def core_reset(self):
lite.execute("DROP TABLE if exists "+self._table)

def additem(self,data):
lite.execute("INSERT into "+self._table+" values (?"+(len(self._keys)-1)*", ?"+")", data+('0',)*(len(self._keys)-len(data)))

def updateitem(self,data):
print "Updating:",data
lite.execute("INSERT or REPLACE into "+self._table+" values (?"+(len(self._keys)-1)*", ?"+")", data+('0',)*(len(self._keys)-len(data)))

def getdata(self):
return lite.execute("select * from "+self._table)

def commit(self):
lite.commit()

def __close__(self):
lite.commit()



class ThreadUrl(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue

    def run(self):
        while True:
            host = self.queue.get()
            url = urllib2.urlopen(host) #fetch the page (these workers are never started in the flow below)

            self.queue.task_done()

######################## FUNCTIONS ################################

def urltodict(url): #convert url query parameters to a dict
    return dict(parse_qsl(urlparse(url).query))

def dicttourl(url,urldict): #Convert a query dict back to a usable url; removes the LstDate bookkeeping key first
query = ""
del urldict['LstDate']
for key in urldict:
query = query+"&"+key+"="+str(urldict[key])
out = url+"?"+query
out = out.replace("?&","?")
return out
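#For example (hypothetical row), {'ukidCategory': 86, 'ukcurPage': 1, 'LstDate': 'date'} becomes
#scrapesite + "?ukidCategory=86&ukcurPage=1" (parameter order may vary); the main loop below then
#strips the "uk" prefixes with .replace('uk','') before scraping.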

def querytodictlist(data): #Convert a scraperwiki select result ({'keys': [...], 'data': [...]}) into a list of row dicts
    out = list()
    for value in data['data']:
        out.append(dict(zip(data['keys'], value)))
    return out

#http://hobbyking.com/hobbyking/store/uh_listCategoriesAndProducts.asp?whl=XX&pc=&idCategory=86&curPage=2&v=&sortlist=&sortMotor=&LiPoConfig=&CatSortOrder=desc
def ParseCat(link):
CatData = urltodict(link)
CatStore.additem((CatData['idCategory'], CatData['curPage'], 0))

def ParseLink(link):
LinkStore.additem((link, 0))

def ParsePage(page):
    print "Parsing Page"
    for link in page.iterlinks():
        url = link[2]
        if CatRegex.search(url): ParseCat(url)
        elif PageRegex.search(url): ParseLink(url)

######################## CODE ################################

#1) Load scrapecat into db
#2) Load category page
#3) Look at the bottom of a page for links to more pages in category
#4) Store a list of the categories and the pages contained
#5) Page parser reads the list of known categories
#6) Page parser gets the product links from each category page
#7) Page parser gets desired information from product page
#8) Loops till complete
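#(The code below implements steps 1-4; product links are only collected into the ScrapedLinks
# table for steps 5-8, which are not run in this pass.)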


#a) Setup Environment
ProductStore = core_store('ProductLinks', CoreKeys)
LinkStore = core_store('ScrapedLinks', PageParser)
CatStore = core_store('CatPages', CategoryParser)

CatRegex = re.compile(catpagesrx)
PageRegex = re.compile(prodpagesrx)


#1) Load scrapecat into db
for item in scrapecat:
CatStore.additem(item)


#2) Load category pages
CategoryPages = querytodictlist(CatStore.getdata())


for page in CategoryPages:
if not page['LstDate']==str(date.today()):
url = dicttourl(scrapesite,page).replace('uk','')
print "URL:",url

html = scraper.scrape(url)
rawpage = parser.fromstring(html)

ParsePage(rawpage)

LinkStore.commit()
CatStore.updateitem((page['ukidCategory'], page['ukcurPage'], str(date.today())))




ProductStore.__close__()
LinkStore.__close__()
CatStore.__close__()
