Skip to content

Commit

Permalink
Fixed handling of relative URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
mnlipp committed Feb 22, 2019
1 parent 4cfda02 commit b1f2f03
Showing 1 changed file with 18 additions and 8 deletions.
26 changes: 18 additions & 8 deletions crawler.py
Expand Up @@ -3,7 +3,7 @@

import config
import logging
from urllib.parse import urljoin, urlunparse
from urllib.parse import urljoin, urlunparse, urlsplit, urlunsplit

import re
from urllib.parse import urlparse
Expand Down Expand Up @@ -256,7 +256,6 @@ def __crawl(self, current_url):
links = self.linkregex.findall(msg)
for link in links:
link = link.decode("utf-8", errors="ignore")
link = self.clean_link(link)
logging.debug("Found : {0}".format(link))

if link.startswith('/'):
Expand All @@ -266,7 +265,7 @@ def __crawl(self, current_url):
elif link.startswith(("mailto", "tel")):
continue
elif not link.startswith(('http', "https")):
link = url.scheme + '://' + url[1] + '/' + link
link = self.clean_link(urljoin(current_url, link))

# Remove the anchor part if needed
if "#" in link:
Expand Down Expand Up @@ -323,11 +322,22 @@ def __crawl(self, current_url):


def clean_link(self, link):
    """Return *link* with the dot segments of its path resolved.

    Only the path component is rewritten; the other components produced
    by ``urlsplit`` are handed back to ``urlunsplit`` unchanged.

    :param link: URL string to clean up.
    :return: the URL with its path passed through ``resolve_url_path``.
    """
    # Resolve the path segment by segment instead of with substring
    # replacement: replacing "./" by "/" also matches the tail of "../"
    # and therefore corrupts parent-directory references.
    split = urlsplit(link)
    return urlunsplit(split._replace(path=self.resolve_url_path(split.path)))

def resolve_url_path(self, path):
    """Collapse ``.`` and ``..`` segments of a slash-separated URL path.

    ``.`` segments are dropped and each ``..`` segment removes the segment
    kept just before it. The first kept segment is never removed, so an
    absolute path cannot climb above its leading ``/`` (``/../a`` gives
    ``/a``). The same rule means a relative path keeps its first segment
    (``a/../b`` gives ``a/b``). Empty segments (``//``) are preserved.

    :param path: path component of a URL.
    :return: the path with dot segments resolved.
    """
    # Adapted from https://stackoverflow.com/questions/4317242/python-how-to-resolve-urls-containing/40536115#40536115
    *leading, tail = path.split('/')
    # Every piece except the last keeps its trailing slash, so joining the
    # surviving pieces reproduces the original separators.
    pieces = [piece + '/' for piece in leading]
    pieces.append(tail)

    kept = []
    for piece in pieces:
        if piece == './' or piece == '.':
            continue
        if piece == '../' or piece == '..':
            # Step back one level, but never past the first kept piece.
            if len(kept) > 1:
                kept.pop()
            continue
        kept.append(piece)
    return ''.join(kept)

@staticmethod
def is_image(path):
Expand Down

0 comments on commit b1f2f03

Please sign in to comment.