Commit

Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
ahvallejo committed Feb 19, 2015
Root commit (no parents): 2eb01f2
Showing 2 changed files with 66 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
64 changes: 64 additions & 0 deletions scraper.py
@@ -0,0 +1,64 @@
###############################################################################
# START HERE: Tutorial for scraping pages behind a login form, using the
# very powerful Mechanize library. Documentation is here:
# http://wwwsearch.sourceforge.net/mechanize/
###############################################################################
import mechanize
import scraperwiki
import lxml.html as lh  # only used by the commented-out experiments below
import urllib
import urllib2
import cookielib
from BeautifulSoup import BeautifulSoup

# set up a cookie-aware opener so the session cookie returned by the
# login POST below is sent automatically on every later request
jar = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)



# for letra in range(ord("n"),ord("y")+1):  # leftover from an earlier letter-based scraper

# answer pages follow the pattern http://www.pmstudy.com/2011PMP/sim1/1ans.asp
pmiurl = "http://www.pmstudy.com/memberlogin.asp"
br = mechanize.Browser()
response = br.open(pmiurl)  # fetch the login page (note: mechanize keeps its own cookie jar)
#print "All forms:", [ form.name for form in br.forms() ]
#page = br.post("http://www.pmstudy.com/verifylogin1.asp",{"payer_email" => "ahvallejohotmail.com","pass" => "hola1hola1"})
url = 'http://www.pmstudy.com/verifylogin1.asp'
values = {'payer_email': 'ahvallejo@hotmail.com',
          'pass': 'hola1hola1'}
data = urllib.urlencode(values)
req = urllib2.Request(url, data)  # a Request with a body is sent as a POST
response = urllib2.urlopen(req)   # the installed opener stores the session cookie
the_page = response.read()
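# Sketch of a sanity check (an assumption, not verified against the site:
# a failed login presumably serves the login form again):
#if "memberlogin" in the_page:
#    raise Exception("login appears to have failed")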
#print "the login page is",the_page
# grab the answer pages for questions 101-200 of simulation 4,
# reusing the session cookie held by the installed opener
inicio = 100
for i in range(100):
    ansurl = "http://www.pmstudy.com/2011PMP/sim4/" + str(i + 1 + inicio) + "ans.asp"
    #print ansurl
    req = urllib2.Request(ansurl)
    response = urllib2.urlopen(req)
    the_page = response.read()
    soup = BeautifulSoup(the_page)
    html = soup.prettify()  # prettify() returns the markup as a string; keep it
    #print "the page", i + 1 + inicio, "is", the_page
    print soup
    data = {'ans': str(i + 1 + inicio), 'html': html}
    scraperwiki.sqlite.save(unique_keys=['ans'], data=data)
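# Note: scraperwiki.sqlite.save upserts on unique_keys, so re-running the
# scraper replaces each question's row instead of inserting duplicates.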

# earlier mechanize-based login attempt, kept for reference:
#print response.read()
#br.select_form(name="myform")
#print br.form
#print response.read()
#br["payer_email"] = "ahvallejo@hotmail.com"
#br["pass"] = "hola1hola1"
#response = br.submit()
#print page
#doc = lh.fromstring(response.read())
#print doc
# scraperwiki.sqlite.save(unique_keys=['Indice'], data=data)
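# A minimal working version of that mechanize approach might look like the
# sketch below (assumptions, not verified against the site: the login form
# is the first form on the page and its fields really are named payer_email
# and pass -- inspect br.forms() to confirm before relying on it):
#
#br = mechanize.Browser()
#br.open("http://www.pmstudy.com/memberlogin.asp")
#br.select_form(nr=0)        # select the first form on the page
#br["payer_email"] = "ahvallejo@hotmail.com"
#br["pass"] = "hola1hola1"
#response = br.submit()      # mechanize tracks the session cookie itself
#print response.read()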
