#! /usr/bin/env python
# Author: Alex Ksikes (alex.ksikes@gmail.com)
# Using code from pycurl retriever-multi.py example
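# Mass download a list of urls concurrently, built on pycurl's CurlMulti interface.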
# TODO:
# - implement ip rotation
# - resolve the min conn / min url choice
# - read urls from the file lazily so there is no need to reread it each time
# - with repeat, all the urls with problems are put at the end of the queue;
#   not sure it's the right approach
# - don't save empty files (the ones that errored)
# - repository.py will break when there are duplicate urls
# - in compress mode we have to close the repository after each addition
#   >> alternatively we could catch ctrl-break and then close the repository
import hashlib
import os
import pycurl
import random
import repository
class Retriever:
    def __init__(self, conn, cookie_path=''):
        self.m = pycurl.CurlMulti()
        self.m.handles = []
        for i in range(conn):
            c = pycurl.Curl()
            c.fp = None
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            c.setopt(pycurl.NOSIGNAL, 1)
            #c.setopt(pycurl.USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)')
            c.setopt(pycurl.USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            if cookie_path:
                c.setopt(pycurl.COOKIEFILE, cookie_path)
                c.setopt(pycurl.COOKIEJAR, cookie_path)
            self.m.handles.append(c)
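    # Note: the handles created above form a fixed pool; __run checks them out of
    # and back into `freelist`, so each pycurl handle is reused across downloads.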
    def __init_urls(self, urls, shuffle=False, resume=[], no_duplicates=False, no_rename=False):
        if no_duplicates:
            urls = set(urls)
        urls = list(urls)
        if shuffle:
            random.shuffle(urls)
        if resume:
            resume = set(resume)
        self.queue = []
        for url in urls:
            if url in resume:
                continue
            if no_rename:
                filename = url.split('/')[-1]
            else:
                filename = hashlib.md5(url).hexdigest()
            self.queue.append((url, filename))
        self.num_urls = len(self.queue)
    def __read(self, f):
        # f may be a path or an open file object; lines may carry extra
        # tab-separated fields (e.g. the output of a previous run), in which
        # case only the first column (the url) is kept
        if isinstance(f, str):
            f = open(f)
        urls = []
        for url in f:
            url = url.split('\t')[0].strip()
            if not url or url[0] == "#":
                continue
            urls.append(url)
        return urls
    def __run(self, out_folder='.', min_size=0, repeat=False, store=False, compress=False):
        # note: min_size is accepted but not enforced yet (see TODO above)
        self.store = store
        if store:
            self.repository = repository.Repository(root=out_folder, levels=store, compress=compress)
        if repeat:
            self.repeat = repeat
            self.repeat_list = {}
        freelist = self.m.handles[:]
        num_processed = 0
        while num_processed < self.num_urls:
            # If there is a url to process and a free curl object, add it to the multi stack
            while self.queue and freelist:
                url, filename = self.queue.pop(0)
                c = freelist.pop()
                #c.fp = repository.RepositoryFile(os.path.join(out_folder, filename), self.repository)
                c.fp = open(os.path.join(out_folder, filename), "wb")
                c.setopt(pycurl.URL, url)
                c.setopt(pycurl.WRITEDATA, c.fp)
                self.m.add_handle(c)
                # store some info on the handle itself
                c.filename = filename
                c.url = url
            # Run the internal curl state machine for the multi stack
            while 1:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # Check for curl objects which have terminated, and add them back to the freelist
            while 1:
                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    c.fp.close()
                    c.fp = None
                    if store:
                        self.repository.add(os.path.join(out_folder, c.filename), remove=True)
                        # this makes the process slower but is needed to keep the integrity of the zip files
                        self.repository.close()
                    self.m.remove_handle(c)
                    s = "\t".join([c.url, c.filename, "SUCCESS", '', ''])
                    if repeat:
                        s += '\t' + str(self.__handle_repeat(c.url, c.filename, False))
                    print s
                    freelist.append(c)
                for c, errno, errmsg in err_list:
                    c.fp.close()
                    c.fp = None
                    if store:
                        self.repository.add(os.path.join(out_folder, c.filename), remove=True)
                        # this makes the process slower but is needed to keep the integrity of the zip files
                        self.repository.close()
                    self.m.remove_handle(c)
                    s = "\t".join([c.url, c.filename, "FAILED", str(errno), errmsg])
                    if repeat:
                        s += '\t' + str(self.__handle_repeat(c.url, c.filename))
                    print s
                    freelist.append(c)
                num_processed = num_processed + len(ok_list) + len(err_list)
                if num_q == 0:
                    break
            # Currently no more I/O is pending; we could do something in the meantime
            # (display a progress bar, etc.). We just call select() to sleep until
            # some more data is available.
            self.m.select(1.0)
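    # __run reports each completed url on stdout as one tab-separated line:
    #   url <TAB> filename <TAB> SUCCESS|FAILED <TAB> errno <TAB> errmsg [<TAB> repeat count]
    # (errno and errmsg are left empty on success)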
    def __handle_repeat(self, url, filename, failed=True):
        count = self.repeat_list.get(url, 0)
        if failed:
            if count < self.repeat:
                self.queue.append((url, filename))
                self.num_urls += 1
            self.repeat_list[url] = count + 1
        return count
    def __clean_up(self):
        for c in self.m.handles:
            if c.fp is not None:
                c.fp.close()
                c.fp = None
            c.close()
        self.m.close()
        # make sure we close the repository in compress mode
        if self.store:
            self.repository.close()
    def dnl(self, urls, out_folder, shuffle, min_size, resume, repeat, store, compress, no_duplicates, no_rename):
        # urls may be a list, a file path, or a file-like object (e.g. stdin)
        if isinstance(urls, str) or hasattr(urls, 'read'):
            urls = self.__read(urls)
        if isinstance(resume, str):
            resume = self.__read(resume)
        self.__init_urls(urls, shuffle, resume, no_duplicates, no_rename)
        self.__run(out_folder, min_size, repeat, store, compress)
        self.__clean_up()
def dnl(urls, conn=10, out_folder='.',
        shuffle=False, min_size=0, resume=[],
        repeat=False, store=False, compress=False,
        no_duplicates=False, no_rename=False, cookie_path=''):
    Retriever(conn, cookie_path=cookie_path).dnl(urls, out_folder, shuffle, min_size, resume, repeat, store, compress, no_duplicates, no_rename)
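# Example (illustrative; the file and folder names are placeholders): download
# the urls listed in urls.txt over 20 concurrent connections into ./pages,
# retrying each failed url twice:
#
#   dnl('urls.txt', conn=20, out_folder='pages', repeat=2)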
def usage():
    print "Usage:"
    print "  python retrieve.py [options] <list_of_urls>"
    print
    print "Description:"
    print "  Mass download a list of urls using various options."
    print
    print "  The list of urls is either a file (one line per url),"
    print "  comma separated urls given on the command line,"
    print "  or taken from stdin if set to '-'."
    print
    print "Options:"
    print "  -c, --conn <num_conn>               : number of concurrent connections"
    print "  -o, --out-folder <folder>           : folder to store the retrieved files"
    print "  -s, --shuffle                       : shuffle the list of urls first"
    print "  -m, --min-file-size <bytes>         : min file size before considered as an error"
    print "  -l, --sleep <num_failed> <sec>      : sleep for x seconds after num_failed failures (not implemented)"
    print "  -r, --resume <resume_file>          : resume the download where it was left off"
    print "  -p, --repeat <num_times>            : attempt to re-download the urls which failed"
    print "  -t, --store <num_levels>            : spread retrieved results over multiple directories"
    print "  -z, --compress                      : compress the result set"
    print "  -d, --no-duplicates                 : remove duplicate urls"
    print "  -n, --no-rename                     : take the end of the url path as the filename"
    print "  -i, --rotate <ip_1,...,ip_n> <sec>  : rotate the outgoing ip every x sec (not implemented)"
    print "  -k, --use_cookie <path>             : use an existing cookie file"
    print "  -h, --help                          : this help message"
    print
    print "Email bugs/suggestions to Alex Ksikes (alex.ksikes@gmail.com)"
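# Example invocations (illustrative; the urls and file names are placeholders):
#   python retrieve.py -c 20 -o pages urls.txt
#   python retrieve.py -s -p 2 http://example.com/a,http://example.com/b
#   cat urls.txt | python retrieve.py -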
import sys, getopt, cStringIO
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:o:sm:l:r:p:t:zdnk:h",
            ["conn=", "out-folder=", "shuffle", "min-file-size=",
             "sleep=", "resume=", "repeat=", "store=", "compress",
             "no-duplicates", "no-rename", "use_cookie=", "help"])
    except getopt.GetoptError:
        usage(); sys.exit(2)
    conn, out_folder, min_size, resume = 10, '.', 0, []
    shuffle = sleep = repeat = store = compress = no_duplicates = no_rename = False
    cookie_path = ''
    for o, a in opts:
        if o in ("-c", "--conn"):
            conn = int(a)
        elif o in ("-o", "--out-folder"):
            out_folder = a
        elif o in ("-s", "--shuffle"):
            shuffle = True
        elif o in ("-m", "--min-file-size"):
            min_size = int(a)
        elif o in ("-l", "--sleep"):
            sleep = a.split()   # (num_failed, sec) -- parsed but not implemented yet
        elif o in ("-r", "--resume"):
            resume = a
        elif o in ("-p", "--repeat"):
            repeat = int(a)
        elif o in ("-t", "--store"):
            store = int(a)
        elif o in ("-z", "--compress"):
            compress = True
        elif o in ("-d", "--no-duplicates"):
            no_duplicates = True
        elif o in ("-n", "--no-rename"):
            no_rename = True
        elif o in ("-k", "--use_cookie"):
            cookie_path = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
    if len(args) < 1:
        usage()
    else:
        urls = args[0]
        if urls == "-":
            urls = cStringIO.StringIO(sys.stdin.read())
        elif ',' in urls or urls.startswith('http://') or urls.startswith('https://'):
            urls = urls.split(',')
        dnl(urls, conn=conn, out_folder=out_folder,
            shuffle=shuffle, min_size=min_size, resume=resume,
            repeat=repeat, store=store, compress=compress,
            no_duplicates=no_duplicates, no_rename=no_rename,
            cookie_path=cookie_path)
if __name__ == '__main__':
    main()