#!/usr/bin/env python
# Check a website (perhaps localhost) against a local mirror.
# Find broken links and orphaned files.
# You must specify both the directory, and a web URL to a server
# (e.g. localhost) that is serving that directory.
import sys, os
import posixpath
import re
import urllib2, urlparse, urllib
# from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup
class Spider:
    '''Crawl a website that is served from a local directory.

       Starting from a URL, follow every link that stays on the same
       host, map each URL back to a file under the local directory,
       and remember which URLs worked, which failed, which point
       outside the site, and which local files were referenced.
       check_orphans() then reports local files nothing linked to.
    '''

    def __init__(self, rootdir, starturl):
        '''Set up a crawl.
           rootdir:  local directory holding the site (a file inside
                     that directory is accepted too; its directory is used).
           starturl: URL at which a web server is serving rootdir.
        '''
        self.debug = False

        self.starturl = starturl
        self.rootdir = os.path.normpath(rootdir)
        if not os.path.isdir(self.rootdir):
            # It's not a directory, so take the dirname. Fall back to
            # the current directory so that a bare filename doesn't
            # turn into the filesystem root once '/' is appended.
            self.rootdir = os.path.dirname(self.rootdir) or os.curdir

        # XXX This next bit isn't platform-agnostic:
        if not self.rootdir.endswith('/'):
            self.rootdir += '/'

        # Now we need to get the true root url. The starturl may have
        # something like /index.html appended to it; we need something
        # we can prepend to paths.

        # Extract any path information from the root url:
        parsed = urlparse.urlparse(starturl)
        self.scheme = parsed.scheme
        self.host = parsed.netloc
        self.rooturlpath = posixpath.normpath(parsed.path)

        # normpath() has already dropped any trailing slash, so the
        # basename is the last real component of the URL path: either
        # a directory name on the server or something like index.*
        basepart = posixpath.basename(self.rooturlpath)

        # Compare it to the last part of self.rootdir, which is
        # guaranteed to be a directory. self.rootdir ends in / so we
        # need dirname() first or the basename would be ''.
        lastdir = posixpath.basename(posixpath.dirname(self.rootdir))
        if basepart != lastdir:
            self.rooturlpath = posixpath.dirname(self.rooturlpath)

        if not self.rooturlpath.endswith('/'):
            self.rooturlpath += '/'

        # Now we're confident self.rooturlpath is the base directory.
        # Add the scheme and host back on.
        self.rooturl = urlparse.urlunsplit((self.scheme, self.host,
                                            self.rooturlpath, '', ''))
        if not self.rooturl.endswith('/'):
            self.rooturl += '/'

        print("rootdir: %s" % self.rootdir)
        print("rooturl: %s" % self.rooturl)
        print("rooturlpath: %s" % self.rooturlpath)
        print("scheme: %s" % self.scheme)
        print("host: %s" % self.host)

        self.urls_to_check = [ self.rooturl ]
        self.urls_succeeded = []
        self.urls_failed = []
        self.outside_urls = []
        self.files_succeeded = []

        # Filled in by check_orphans(); defined here so print_summary()
        # works even if check_orphans() was never run.
        self.orphans = []

        # Eventually, the list of excludes should be a commandline argument.
        # For now, let's just make sure all the .git objects aren't orphaned,
        # nor web stats or archived files.
        self.excludes = [ ".git", "stats", "0-pre2011", "0-calendars" ]

        # Files that aren't explicitly referenced by the website,
        # but might be needed for other purposes.
        self.nonorphans = [ "favicon.ico", "robots.txt", ".htaccess" ]

    def spide(self):
        '''Check all urls in urls_to_check, which has new urls
           being added to it during the spidering process.
        '''
        # The start URL may name a specific page (e.g. .../index.html)
        # rather than the root directory, so check it as well.
        self.check_url(self.starturl)

        while self.urls_to_check:
            self.check_url(self.urls_to_check.pop())

        print("Done spiding")

    def check_orphans(self):
        '''Assuming we already have self.files_succeeded,
           find all files in self.rootdir that weren't in succeeded.
           The result is stored in self.orphans.
        '''
        # Normalize once into a set: lookups are O(1), and paths such
        # as "./x" from os.walk compare equal to the normalized "x".
        referenced = set(os.path.normpath(f)
                         for f in self.files_succeeded if f)

        self.orphans = []
        for root, dirs, files in os.walk(self.rootdir, topdown=True):
            # Prune excluded directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in self.excludes]
            for filename in files:
                if filename in self.nonorphans:
                    continue
                f = os.path.normpath(os.path.join(root, filename))
                if f not in referenced:
                    self.orphans.append(f)

    def print_summary(self):
        '''Print every list the crawl collected, then a line of totals.'''
        print("URLs succeeded:")
        print('\n'.join(self.urls_succeeded))
        print("Outside URLs:")
        print('\n'.join(self.outside_urls))
        print("URLs failed:")
        print('\n'.join(self.urls_failed))
        print("Orphans:")
        print('\n'.join(self.orphans))
        print("%d good links, %d external urls not checked, "
              "%d bad links, %d orphaned files."
              % (len(self.urls_succeeded), len(self.outside_urls),
                 len(self.urls_failed), len(self.orphans)))

    def get_local_for_url(self, urlpath):
        '''Get a local file path for a path parsed from an absolute URL.
           Returns None if urlpath is not underneath self.rooturlpath.
        '''
        # The root itself may arrive without its trailing slash
        # (posixpath.normpath strips it), so accept that spelling.
        if urlpath + '/' == self.rooturlpath:
            return os.path.normpath(self.rootdir)

        # The root must be a prefix of the path: a substring match
        # would wrongly accept e.g. /other/site/x for a root of /site/.
        if not urlpath.startswith(self.rooturlpath):
            return None

        # Both roots end in '/', so plain concatenation is right here;
        # os.path.join would discard rootdir if the remainder began
        # with a slash.
        return os.path.normpath(self.rootdir
                                + urlpath[len(self.rooturlpath):])

    def make_absolute(self, url, relative_to):
        '''Make a URL absolute. If it's a relative path,
           then make it relative to relative_to
           which must be an absolute path on the webhost.
           Returns None for a URL on a different host.
        '''
        parsed = urlparse.urlparse(url)
        if parsed.scheme:    # already has an http://host specified
            # XXX If we ever extend this to check validity of
            # external URLs, this next condition is the one to change.
            if parsed.netloc != self.host:
                if self.debug:
                    print("Ignoring external link %s" % url)
                return None
            return url

        # So there's no scheme. Add one.
        # The results of urlparse() aren't modifiable, but
        # if we turn them into a list we can modify them
        # then turn them back into a URL.
        lurl = list(parsed)
        lurl[0] = self.scheme
        lurl[1] = self.host

        # A path starting with / is already absolute on the host;
        # anything else is relative to relative_to and gets normalized.
        if not parsed.path.startswith('/'):
            lurl[2] = posixpath.normpath(posixpath.join(relative_to,
                                                        parsed.path))
        return urlparse.urlunparse(lurl)

    def check_url(self, url):
        '''Check a URL. This should be an absolute URL on the server.
           Records the result in urls_succeeded, urls_failed or
           outside_urls, records the matching local file in
           files_succeeded, and queues any new same-host links found
           in an HTML page onto urls_to_check.
        '''
        # If we got this far, we'll be comparing links.
        # So we'll need to know the parsed parts of this url.
        urlparsed = urlparse.urlparse(url)

        if not urlparsed.scheme or not urlparsed.path.startswith('/'):
            print("EEK! Non-relative URL passed to check_url, bailing")
            return

        # URL encode special characters like spaces. Undo any escaping
        # that is already there first, so "a%20b" stays "a%20b" instead
        # of becoming "a%2520b", and so the local lookup uses "a b".
        rawpath = urllib.unquote(urlparsed.path)
        urlpath = urllib.quote(rawpath)

        # This check must come after the special char substitution.
        if urlpath in self.urls_succeeded or urlpath in self.urls_failed:
            return

        if self.debug:
            print("=============================== Checking %s" % url)

        # Whether the URL names a file or a directory can only be
        # decided by looking at the local filesystem.
        localpath = self.get_local_for_url(rawpath)
        if self.debug:
            print("=== local for %s is %s" % (urlpath, localpath))
        if not localpath:
            if self.debug:
                print("%s is outside original directory; skipping"
                      % urlparsed.path)
            if url not in self.outside_urls:
                self.outside_urls.append(url)
            return

        if not os.path.exists(localpath):
            if self.debug:
                print("Local path '%s' doesn't exist! %s" % (localpath, url))
            self.urls_failed.append(urlpath)
            return

        # If we substituted any special characters, rebuild the URL:
        if urlpath != urlparsed.path:
            lurl = list(urlparsed)
            lurl[2] = urlpath
            url = urlparse.urlunparse(lurl)
            if self.debug:
                print("Substituted characters, recombined to %s" % url)

        if os.path.isdir(localpath):
            # The web server will substitute index.something,
            # so we'd better do that too or else the index file
            # will show up as an orphan. The first match wins.
            localdir = localpath
            localpath = None
            for ext in ( "php", "cgi", "html" ):
                indexfile = os.path.join(localdir, "index." + ext)
                if os.path.exists(indexfile):
                    localpath = indexfile
                    break
            if not localpath:
                print("Can't find an index file inside %s" % localdir)
                return
            urldir = urlpath
        else:
            localdir = os.path.dirname(localpath)
            urldir = posixpath.dirname(urlpath)

        if self.debug:
            print("localpath %s localdir %s" % (localpath, localdir))
            print("urldir: %s" % urldir)

        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None

        if not handle:
            print("Can't open %s" % url)
            return

        # request.add_header("User-Agent", AGENT)

        try:
            response = handle.open(request)
        except urllib2.HTTPError as error:
            if error.code == 404:
                print("ERROR: %s -> %s" % (error, error.url))
            else:
                print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return
        except urllib2.URLError as error:
            print("ERROR: %s" % error)
            self.urls_failed.append(urlpath)
            return

        # Only HTML is read and parsed; always release the connection.
        try:
            ctype = response.info().get('content-type') or ''
            is_html = ctype.startswith('text/html')
            content = response.read() if is_html else None
        finally:
            response.close()

        self.urls_succeeded.append(urlpath)
        self.files_succeeded.append(localpath)

        if not is_html:
            if self.debug:
                print("%s isn't HTML; skipping" % url)
            return

        soup = BeautifulSoup(content.decode("utf-8", "replace"))

        for tag in soup.findAll('a', href=True):
            rawhref = tag.get("href")
            # Skip empty links and links within the same page.
            if not rawhref or rawhref[0] == '#':
                continue

            href = self.make_absolute(rawhref, urldir)
            if not href:
                # It's probably an external URL. Skip it.
                if rawhref not in self.outside_urls:
                    self.outside_urls.append(rawhref)
                continue

            # This check won't get everything, because href
            # hasn't been special char substituted yet;
            # check_url() filters again once it has been.
            if href not in self.urls_to_check and \
               href not in self.urls_succeeded and \
               href not in self.urls_failed:
                self.urls_to_check.append(href)

        for tag in soup.findAll('img', src=True):
            rawsrc = tag.get('src')
            src = self.make_absolute(rawsrc, urldir)
            if not src:
                if rawsrc not in self.outside_urls:
                    self.outside_urls.append(rawsrc)
                continue
            # Images aren't fetched; the local file is just marked as
            # referenced so it won't be reported as an orphan.
            # self.urls_succeeded.append(src)
            imgpath = urllib.unquote(urlparse.urlparse(src).path)
            imglocal = self.get_local_for_url(imgpath)
            if imglocal:
                self.files_succeeded.append(imglocal)
# Script entry point: crawl the site, look for orphaned files, report.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: %s local_dir url" % os.path.basename(sys.argv[0]))
        # Without both arguments there is nothing to crawl.
        sys.exit(1)

    spider = Spider(sys.argv[1], sys.argv[2])
    try:
        spider.spide()
        spider.check_orphans()
        spider.print_summary()
    except KeyboardInterrupt:
        # Ctrl-C during a long crawl should exit without a traceback.
        print("Interrupt")