-
Notifications
You must be signed in to change notification settings - Fork 2
/
faviconlookup.py
64 lines (57 loc) · 2.05 KB
/
faviconlookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#
from google.appengine.api import urlfetch
from BeautifulSoup import BeautifulSoup
import logging
import urlparse
import helpers
import urllib
def find_favicon(html):
    """Return the favicon href declared in *html*, or None if there is none.

    ``<link rel="shortcut icon">`` tags are preferred over
    ``<link rel="icon">`` tags.  Tags that carry no usable ``href`` are
    skipped instead of raising, so a malformed first tag no longer hides a
    valid one further down the document.
    """
    soup = BeautifulSoup(html)
    # Order matters: "shortcut icon" wins over plain "icon".
    for rel in ("shortcut icon", "icon"):
        for link in soup.findAll('link', rel=rel):
            # attrs is walked as (name, value) pairs.
            for name, value in link.attrs:
                if name == 'href' and value:
                    return value
    return None
def format_url(url):
    """Return *url* with a scheme guaranteed and percent-escapes decoded.

    URLs that already start with ``http://`` or ``https://`` (in any letter
    case) keep their scheme; anything else is assumed to be a bare
    ``host/path`` and gets ``http://`` prepended.
    """
    # Function-scope import: unquote lives in urllib on Python 2 and in
    # urllib.parse on Python 3.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    # Previously only 'http://' was recognised, which turned
    # 'https://host' into 'http://https://host'.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return unquote(url)
def fetch_url(url):
    """Fetch *url* through urlfetch, logging the request and status code.

    The URL is normalised with format_url() first.  Returns the urlfetch
    result when is_valid_response() accepts its status code, otherwise None.
    """
    target = format_url(url)
    logging.info("Fetching: %s" % (target))
    response = urlfetch.fetch(target)
    status = response.status_code
    logging.info("Got %d response code" % (status))
    if not is_valid_response(status):
        return None
    return response
def is_valid_response(code):
    """Return True when *code* is a 2xx or 3xx HTTP status code."""
    # Explicit range check instead of ``code / 100 == 2``: the division form
    # only works with integer division and rejects e.g. 204 under true
    # division (Python 3 or ``from __future__ import division``).
    return 200 <= code < 400
@helpers.autocached
def getfavicon(url):
    """Work out the favicon URL for the page at *url*.

    The page is fetched and searched for a favicon ``<link>``.  If that does
    not produce a fetchable icon, ``/favicon.ico`` on the site root is tried,
    using the host we were redirected to when that is known.

    Returns the favicon URL as a string, or None when no icon could be
    fetched.  Root-relative hrefs and the default icon are returned as
    ``host/path`` without a scheme; other hrefs are returned as absolute
    URLs.
    """
    logging.info("Working out favicon for %s" % (url))
    result = fetch_url(url)

    if result:
        favicon_url = find_favicon(result.content)
        if favicon_url:
            page_url = format_url(result.final_url or url)
            if favicon_url.startswith('/') and not favicon_url.startswith('//'):
                # Root-relative href: keep the historical "host/path" form.
                favicon_url = urlparse.urlparse(page_url).netloc + favicon_url
            elif not urlparse.urlparse(favicon_url).scheme:
                # Document-relative ("favicon.ico") or protocol-relative
                # ("//cdn/x.ico") href: resolve against the page URL instead
                # of treating the href itself as a host name.
                favicon_url = urlparse.urljoin(page_url, favicon_url)
            if fetch_url(favicon_url):
                return favicon_url

    # At this point try site/favicon.ico
    # We may have been redirected, if so check the root of the site we got
    # redirected to
    if url.endswith('/favicon.ico'):
        return None

    # If we got a 404 then result is None and doesn't have a final_url, but
    # /favicon might still exist
    if result and result.final_url:
        root = urlparse.urlparse(format_url(result.final_url)).netloc
    else:
        root = urlparse.urlparse(format_url(url)).netloc

    default_url = root + "/favicon.ico"
    # This host's default icon is deliberately never looked up.
    if default_url == "feedproxy.google.com/favicon.ico":
        return None
    if fetch_url(default_url):
        return default_url
    return None