forked from miura/Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfscraper.py
28 lines (25 loc) · 862 Bytes
/
pdfscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/env python
#
# Python script to download all linked PDF in a web page
import urllib, re
from BeautifulSoup import BeautifulSoup
# Page whose anchor tags are listed.
location = "http://www1.doshisha.ac.jp/~mjin/R/"
#location = "http://x68000.q-e-d.net/~68user/unix/genre.html"

# Matches anchors written exactly as `<a href=...>text</a>`.  The captured
# group is everything between `href=` and the first `>`, so it keeps any
# surrounding quotes and any attributes that follow the href inside the tag.
LINK_PATTERN = r'<a href=(.*?)>.*?</a>'


def fetch_html(url, max_bytes=20000):
    """Return at most ``max_bytes`` of the body served at ``url``.

    The page is requested once and the handle is always closed, even when
    reading fails.  Anything past ``max_bytes`` is not read, so links that
    appear later in a large page are not seen.

    Errors raised by ``urllib.urlopen`` or by ``read`` propagate to the
    caller.
    """
    response = urllib.urlopen(url)
    try:
        return response.read(max_bytes)
    finally:
        response.close()


def extract_links(html):
    """Return the raw href values of the anchors found in ``html``.

    Values are returned in document order exactly as captured by
    ``LINK_PATTERN``; an input without matching anchors yields ``[]``.
    """
    return re.findall(LINK_PATTERN, html)


def main():
    """Print the number of links found at ``location``, then each link.

    NOTE: nothing is downloaded.  Filtering the links (for example on a
    '.pdf' suffix) and saving them with ``urllib.urlretrieve`` was only ever
    sketched in commented-out code and is still to be implemented.
    """
    links = extract_links(fetch_html(location))
    # Single-argument print(...) behaves the same as the print statement.
    print(len(links))
    for link in links:
        print(link)


if __name__ == "__main__":
    main()