#! /usr/bin/env python
# Author: Alex Ksikes
# Using code from the pycurl examples
# TODO / known issues:
# - implement rotate ips
# - resolve the min conn min url choice
# - from file so no need to reread each time
# - with repeat, all urls with problems are put at the end; not sure it's the right approach
# - don't save empty files (the ones that errored)
# - breaks when there are duplicate urls
# - in compress mode we had to close the directory
#   >> alternatively we could catch ctrl-break and then close the repository
import hashlib
import os
import pycurl
import random
import repository
class Retriever(object):
    """Fetch a list of urls concurrently with a fixed pool of pycurl handles.

    One ``pycurl.Curl`` handle is created per connection and all of them are
    driven through a single ``pycurl.CurlMulti`` object.  For every finished
    transfer one tab-separated line is printed to stdout:

        url, filename, SUCCESS|FAILED, errno, errmsg[, retry_count]
    """

    def __init__(self, conn, cookie_path=''):
        """Create ``conn`` reusable curl handles.

        conn: number of concurrent connections (one curl handle each).
        cookie_path: optional cookie file, used both to read and to save
            cookies; ignored when empty.
        """
        self.m = pycurl.CurlMulti()
        self.m.handles = []
        for _ in range(conn):
            c = pycurl.Curl()
            c.fp = None
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            c.setopt(pycurl.NOSIGNAL, 1)
            c.setopt(pycurl.USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            if cookie_path:
                c.setopt(pycurl.COOKIEFILE, cookie_path)
                c.setopt(pycurl.COOKIEJAR, cookie_path)
            # Without this the pool stays empty and nothing is ever downloaded.
            self.m.handles.append(c)

    @staticmethod
    def __hashed_name(url):
        """Return the md5 hex digest of ``url``, used as its output file name."""
        # hashlib needs bytes; text urls are encoded, byte strings pass through.
        data = url if isinstance(url, bytes) else url.encode('utf-8')
        return hashlib.md5(data).hexdigest()

    def __init_urls(self, urls, shuffle=False, resume=None, no_duplicates=False, no_rename=False):
        """Build ``self.queue``, a list of ``(url, filename)`` pairs to fetch.

        urls: iterable of urls.
        shuffle: randomize the download order.
        resume: iterable of urls to skip (already processed).
        no_duplicates: drop repeated urls, keeping the first occurrence.
        no_rename: name each file after the last path segment of its url
            instead of the md5 of the url.
        """
        urls = list(urls)
        if no_duplicates:
            # De-duplicate while keeping the input order deterministic.
            seen = set()
            unique = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique.append(url)
            urls = unique
        if shuffle:
            random.shuffle(urls)
        done = set(resume) if resume else set()
        self.queue = []
        for url in urls:
            if url in done:
                continue
            filename = url.split('/')[-1] if no_rename else ''
            if not filename:
                # Either renaming is on, or the url ends with '/' and has no
                # usable basename: fall back to the hash of the url.
                filename = self.__hashed_name(url)
            self.queue.append((url, filename))
        self.num_urls = len(self.queue)

    def __read(self, f):
        """Read urls from the file at path ``f``.

        Only the first tab-separated field of each line is used; blank lines
        and lines starting with '#' are skipped.
        """
        # TODO: handle the repeat part as well
        urls = []
        with open(f) as lines:
            for line in lines:
                url = line.split('\t')[0].strip()
                if not url or url[0] == "#":
                    continue
                urls.append(url)
        return urls

    def __run(self, out_folder='.', min_size=0, repeat=False, store=False, compress=False):
        """Download every queued url, printing one result line per transfer.

        out_folder: directory the files are written to.
        min_size: accepted for interface compatibility; not used yet.
        repeat: maximum number of retries for a failed url (False: none).
        store: number of directory levels passed to ``repository.Repository``;
            when truthy each finished file is moved into the repository.
        compress: forwarded to ``repository.Repository``.
        """
        self.store = store
        if store:
            self.repository = repository.Repository(root=out_folder, levels=store, compress=compress)
        if repeat:
            self.repeat = repeat
            self.repeat_list = {}
        freelist = self.m.handles[:]
        num_processed = 0
        while num_processed < self.num_urls:
            # If there is an url to process and a free curl object, add to multi stack
            while self.queue and freelist:
                url, filename = self.queue.pop(0)
                c = freelist.pop()
                c.fp = open(os.path.join(out_folder, filename), "wb")
                c.setopt(pycurl.URL, url)
                c.setopt(pycurl.WRITEDATA, c.fp)
                self.m.add_handle(c)
                # store some info
                c.filename = filename
                c.url = url
            # Run the internal curl state machine for the multi stack
            while True:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # Check for curl objects which have terminated, and add them to the freelist
            while True:
                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    self.__finish(c, "SUCCESS", '', '', False, out_folder, store, repeat)
                    freelist.append(c)
                for c, errno, errmsg in err_list:
                    self.__finish(c, "FAILED", str(errno), errmsg, True, out_folder, store, repeat)
                    freelist.append(c)
                num_processed = num_processed + len(ok_list) + len(err_list)
                if num_q == 0:
                    break
            # Currently no more I/O is pending, could do something in the meantime
            # (display a progress bar, etc.).
            # We just call select() to sleep until some more data is available.
            self.m.select(1.0)

    def __finish(self, c, status, errno, errmsg, failed, out_folder, store, repeat):
        """Close a finished transfer, optionally store its file, and report it."""
        c.fp.close()
        c.fp = None
        self.m.remove_handle(c)
        if store:
            self.repository.add(os.path.join(out_folder, c.filename), remove=True)
            # NOTE(review): the original had a comment here about an extra,
            # slower step needed to keep the zip files consistent; that step
            # is not visible in the source, so the repository is only closed
            # during clean-up - confirm against repository.py.
        s = "\t".join([c.url, c.filename, status, errno, errmsg])
        if repeat:
            s += '\t' + str(self.__handle_repeat(c.url, c.filename, failed))
        print(s)

    def __handle_repeat(self, url, filename, failed=True):
        """Return how often ``url`` was retried so far; requeue it if it failed.

        A failed url is appended to the end of the queue again as long as it
        has been retried fewer than ``self.repeat`` times.
        """
        count = self.repeat_list.get(url, 0)
        if failed:
            if count < self.repeat:
                self.queue.append((url, filename))
                # The main loop counts finished transfers, so a retry adds one.
                self.num_urls += 1
                self.repeat_list[url] = count + 1
        return count

    def __clean_up(self):
        """Close any open output files, all curl handles and the multi object."""
        for c in self.m.handles:
            if c.fp is not None:
                c.fp.close()
                c.fp = None
            c.close()
        self.m.close()
        # make sure we close the repository in compress mode
        # NOTE(review): the Repository API is not visible here; close() is
        # assumed and only called when it exists - confirm against repository.py.
        repo = getattr(self, 'repository', None)
        close_repo = getattr(repo, 'close', None)
        if close_repo is not None:
            close_repo()

    def dnl(self, urls, out_folder, shuffle, min_size, resume, repeat, store, compress, no_duplicates, no_rename):
        """Download ``urls``; see ``__init_urls`` and ``__run`` for the options.

        ``urls`` and ``resume`` may each be an iterable of urls or the path of
        a file listing them.
        """
        if isinstance(urls, str):
            urls = self.__read(urls)
        if isinstance(resume, str):
            resume = self.__read(resume)
        self.__init_urls(urls, shuffle, resume, no_duplicates, no_rename)
        try:
            self.__run(out_folder, min_size, repeat, store, compress)
        finally:
            # Release files and handles even when interrupted (e.g. ctrl-c).
            self.__clean_up()
def dnl(urls, conn=10, out_folder='.',
        shuffle=False, min_size=0, resume=[],
        repeat=False, store=False, compress=False,
        no_duplicates=False, no_rename=False, cookie_path=''):
    """Convenience wrapper: build a Retriever and download ``urls`` with it.

    ``conn`` and ``cookie_path`` configure the Retriever; every other
    argument is forwarded unchanged to ``Retriever.dnl``.
    """
    retriever = Retriever(conn, cookie_path=cookie_path)
    retriever.dnl(
        urls, out_folder, shuffle, min_size, resume,
        repeat, store, compress, no_duplicates, no_rename)
def usage():
    """Print the command-line help text to stdout."""
    # The script name is taken from the actual invocation.
    script = os.path.basename(sys.argv[0])
    lines = [
        "Usage:",
        " python %s [options] <list_of_urls>" % script,
        "Description:",
        " Mass download a list of urls using various options.",
        " The list of urls is either a file (one line per url)",
        " or comma separated from the command line",
        " or taken from stdin if set to '-'.",
        "Options:",
        " -c, --conn <num_conn> : number of concurrent connections",
        " -o, --out-folder <folder> : folder to store the retrieved files",
        " -s, --shuffle : shuffle the list of urls first",
        " -m, --min-file-size <bytes> : min file size before considered as an error",
        " -l, --sleep <num_failed> <sec> : sleep for x seconds after num_failed failures",
        " -r, --resume <resume_file> : resume download where it was left",
        " -p, --repeat <num_times> : attempt to re-download the urls which failed",
        " -t, --store [num_levels] : spread retrieved results in multiple directories",
        " -z, --compress : compress the result set",
        # Spelled with hyphens to match the long options registered with getopt.
        " -d, --no-duplicates : remove duplicate urls",
        " -n, --no-rename : take the end of the url path as filename",
        " -i, --rotate <ip_1,...,ip_n> <sec> : rotate outgoing ip every x sec (not implemented)",
        " -k, --use_cookie <path> : use existing cookie file",
        " -h, --help : this help message",
        "Email bugs/suggestions to Alex Ksikes",
    ]
    # A single-argument print() call behaves the same on Python 2 and 3.
    print("\n".join(lines))
import sys, getopt, cStringIO
def main(argv=None):
    """Parse the command-line options and start the mass download.

    argv: list of arguments to parse; defaults to ``sys.argv[1:]``.

    Prints the usage text and exits with status 2 on unknown options,
    non-numeric values for numeric options, or a missing <list_of_urls>
    argument.  ``-h``/``--help`` prints the usage text and exits normally.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        # "shuffle" takes no argument, like its short form "-s".  The
        # underscore spellings of no-duplicates / no-rename are accepted as
        # aliases because the help text has advertised them.
        opts, args = getopt.getopt(argv, "c:o:sm:l:r:p:t:zdnk:h",
                        ["conn=", "out-folder=", "shuffle", "min-file-size=",
                         "sleep=", "resume=", "repeat=", "store=", "compress",
                         "no-duplicates", "no_duplicates",
                         "no-rename", "no_rename", "use_cookie=", "help"])
    except getopt.GetoptError:
        usage(); sys.exit(2)
    conn, out_folder, min_size, resume = 10, '.', 0, []
    shuffle = sleep = repeat = store = compress = no_duplicates = no_rename = False
    cookie_path = ''
    try:
        for o, a in opts:
            if o in ("-c", "--conn"):
                conn = int(a)
            elif o in ("-o", "--out-folder"):
                out_folder = a
            elif o in ("-s", "--shuffle"):
                shuffle = True
            elif o in ("-m", "--min-file-size"):
                min_size = int(a)
            elif o in ("-l", "--sleep"):
                # "<num_failed> <sec>" arrives as one quoted argument.
                # Parsed for validation only: sleeping is not implemented.
                sleep = [int(part) for part in a.split()]
            elif o in ("-r", "--resume"):
                resume = a
            elif o in ("-p", "--repeat"):
                repeat = int(a)
            elif o in ("-t", "--store"):
                store = int(a)
            elif o in ("-z", "--compress"):
                compress = True
            elif o in ("-d", "--no-duplicates", "--no_duplicates"):
                no_duplicates = True
            elif o in ("-n", "--no-rename", "--no_rename"):
                no_rename = True
            elif o in ("-k", "--use_cookie"):
                cookie_path = a
            elif o in ("-h", "--help"):
                usage()
                sys.exit()
    except ValueError:
        # A numeric option received a non-numeric value.
        usage(); sys.exit(2)
    if len(args) < 1:
        usage(); sys.exit(2)
    urls = args[-1]
    if urls == "-":
        # One url per line on stdin; surrounding whitespace and blank lines
        # are dropped so newlines never end up in urls or file names.
        urls = [line.strip() for line in sys.stdin if line.strip()]
    elif ',' in urls or urls.startswith(('http://', 'https://')):
        urls = urls.split(',')
    # Otherwise urls is the path of a file listing the urls; dnl() reads it.
    dnl(urls, conn=conn, out_folder=out_folder,
        shuffle=shuffle, min_size=min_size, resume=resume,
        repeat=repeat, store=store, compress=compress,
        no_duplicates=no_duplicates, no_rename=no_rename,
        cookie_path=cookie_path)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()