archiver.py
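"""Submit URLs to the Internet Archive's Wayback Machine for archiving.

URLs to save can be passed on the command line, read from a file, or
extracted from local or remote sitemaps. Requests are retried with backoff,
optionally rate limited, and can be spread across multiple worker processes.
"""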
from functools import partial
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import argparse
import logging
import multiprocessing as mp
import re
import requests
import time
import xml.etree.ElementTree as ET

# Library version
__version__ = "1.7.2"

# String used to prefix local sitemaps
LOCAL_PREFIX = "file://"

def format_archive_url(url):
    """Given a URL, constructs an Archive URL to submit the archive request."""
    logging.debug("Creating archive URL for %s", url)
    SAVE_URL = "https://web.archive.org/save/"
    request_url = SAVE_URL + url

    return request_url


def call_archiver(request_url, rate_limit_wait, session):
    """Submit a url to the Internet Archive to archive."""
    if rate_limit_wait > 0:
        logging.debug("Sleeping for %s", rate_limit_wait)
        time.sleep(rate_limit_wait)
    logging.info("Calling archive url %s", request_url)
    r = session.head(request_url, allow_redirects=True)

    try:
        # Raise `requests.exceptions.HTTPError` if 4XX or 5XX status
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.exception(e)
        raise

def get_namespace(element):
    """Extract the namespace using a regular expression."""
    match = re.match(r"\{.*\}", element.tag)
    return match.group(0) if match else ""


def download_remote_sitemap(sitemap_url, session):
    """Download the sitemap of the target website."""
    logging.debug("Downloading: %s", sitemap_url)
    r = session.get(sitemap_url)
    return r.text.encode("utf-8")


def load_local_sitemap(sitemap_filepath):
    """Load a local sitemap and return it as a string."""
    logging.debug("Loading local sitemap: %s", sitemap_filepath)

    if sitemap_filepath.startswith(LOCAL_PREFIX):
        sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX):]

    # Try to open the file, error on failure
    try:
        logging.debug("Opening local file '%s'", sitemap_filepath)
        with open(sitemap_filepath, "r") as fp:
            contents = fp.read()
    except IOError as e:
        logging.exception(e)
        raise

    return contents

def sitemap_is_local(sitemap_url):
    """Returns True if we believe a URI to be local, False otherwise."""
    return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/")


def extract_pages_from_sitemap(site_map_text):
    """Extract the various pages from the sitemap text."""
    root = ET.fromstring(site_map_text)

    # Sitemaps use a namespace in the XML, which we need to read
    namespace = get_namespace(root)

    urls = []
    for loc_node in root.findall(".//{}loc".format(namespace)):
        urls.append(loc_node.text)

    return set(urls)
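
# For reference, a minimal sitemap that extract_pages_from_sitemap() can parse
# looks like the snippet below (namespace per the sitemaps.org protocol; the
# URL is illustrative):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url>
#       <loc>https://example.com/page-to-archive</loc>
#     </url>
#   </urlset>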

def main():
    # Command line parsing
    parser = argparse.ArgumentParser(
        prog="Wayback Machine Archiver",
        description="A script to back up web pages with the Internet Archive",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )
    parser.add_argument(
        "urls",
        nargs="*",
        default=[],
        help="the URLs of the pages to archive",
    )
    parser.add_argument(
        "--file",
        help="path to a file containing urls to save (one url per line)",
        required=False,
    )
    parser.add_argument(
        "--sitemaps",
        nargs="+",
        default=[],
        help="one or more URIs to sitemaps listing pages to archive; local paths must be prefixed with '{f}'".format(f=LOCAL_PREFIX),
        required=False,
    )
    parser.add_argument(
        "--log",
        help="set the logging level, defaults to WARNING",
        dest="log_level",
        default=logging.WARNING,
        choices=[
            "DEBUG",
            "INFO",
            "WARNING",
            "ERROR",
            "CRITICAL",
        ],
    )
    parser.add_argument(
        "--log-to-file",
        help="redirect logs to a file",
        dest="log_file",
        default=None,
    )
    parser.add_argument(
        "--archive-sitemap-also",
        help="also submit the URL of the sitemap to be archived",
        dest="archive_sitemap",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--jobs",
        "-j",
        help="run this many concurrent URL submissions, defaults to 1",
        default=1,
        type=int,
    )
    parser.add_argument(
        "--rate-limit-wait",
        help="number of seconds to wait between page requests to avoid flooding the archive site, defaults to 5; also used as the backoff factor for retries",
        dest="rate_limit_in_sec",
        default=5,
        type=int,
    )

    args = parser.parse_args()
    # Set the logging level based on the arguments
    #
    # If `filename` is None, the constructor will set up a stream, otherwise it
    # will use the file specified.
    logging.basicConfig(level=args.log_level, filename=args.log_file)

    logging.debug("Arguments: %s", args)

    archive_urls = []

    # Add the regular pages
    if args.urls:
        logging.info("Adding page URLs to archive")
        logging.debug("Page URLs to archive: %s", args.urls)
        archive_urls += map(format_archive_url, args.urls)

    # Set up retry and backoff
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=args.rate_limit_in_sec,
        status_forcelist=[500, 502, 503, 504],
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.mount("http://", HTTPAdapter(max_retries=retries))
    # Download and process the sitemaps
    remote_sitemaps = set()
    logging.info("Parsing sitemaps")
    for sitemap_url in args.sitemaps:
        # Save the remote ones, in case the user wants us to back them up
        if sitemap_is_local(sitemap_url):
            logging.debug("The sitemap '%s' is local.", sitemap_url)
            sitemap_xml = load_local_sitemap(sitemap_url)
        else:
            logging.debug("The sitemap '%s' is remote.", sitemap_url)
            if args.archive_sitemap:
                remote_sitemaps.add(sitemap_url)
            sitemap_xml = download_remote_sitemap(sitemap_url, session=session)

        for url in extract_pages_from_sitemap(sitemap_xml):
            archive_urls.append(format_archive_url(url))
    # Archive the sitemap as well, if requested
    if args.archive_sitemap:
        logging.info("Archiving sitemaps")
        if remote_sitemaps:
            archive_urls += map(format_archive_url, remote_sitemaps)
        else:
            logging.debug("No remote sitemaps to back up.")

    # And URLs from file
    if args.file:
        logging.info("Reading urls from file: %s", args.file)
        with open(args.file) as file:
            urls_from_file = (u.strip() for u in file.readlines() if u.strip())
            archive_urls += map(format_archive_url, urls_from_file)

    # Deduplicate URLs
    archive_urls = set(archive_urls)

    # Archive the URLs
    logging.debug("Archive URLs: %s", archive_urls)
    pool = mp.Pool(processes=args.jobs)
    partial_call = partial(
        call_archiver, rate_limit_wait=args.rate_limit_in_sec, session=session
    )
    pool.map(partial_call, archive_urls)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()
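
# Example invocations (illustrative; the flags are those defined by the
# argparse parser in main(), and the URLs/paths are placeholders):
#
#   python archiver.py https://example.com/page1 https://example.com/page2
#   python archiver.py --sitemaps https://example.com/sitemap.xml --jobs 4
#   python archiver.py --sitemaps file:///var/www/sitemap.xml --archive-sitemap-also
#   python archiver.py --file urls.txt --rate-limit-wait 10 --log INFO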