-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
84 lines (62 loc) · 2.37 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
import string
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
class Downloader:
    """Fetches raw page bodies over HTTP."""

    @staticmethod
    def get_page(url, timeout=10):
        """Return the response body (bytes) for *url*, or None on any failure.

        Args:
            url: the page to fetch.
            timeout: seconds to wait before giving up, so a hung server
                cannot stall the whole crawl (was the TODO here).

        Network errors (DNS failure, refused connection, timeout) are
        treated the same as non-200 responses: the caller gets None.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            return None
        return response.content if response.status_code == requests.codes.ok else None
class Parser:
    """Extracts links and visible text from one HTML document."""

    def __init__(self, html, source_url):
        """Parse *html*; *source_url* is the page's own URL, used to
        resolve relative links."""
        # Name the parser explicitly: bare BeautifulSoup(html) warns and
        # picks whatever parser happens to be installed, so results vary
        # between environments.
        self.soup = BeautifulSoup(html, 'html.parser')
        self.url = source_url

    def get_links(self):
        """Return every <a href> target, resolved to an absolute URL."""
        return [urljoin(self.url, anchor.attrs['href'])
                for anchor in self.soup.find_all('a')
                if 'href' in anchor.attrs]

    def get_content(self):
        """Return the visible text of the page body.

        NOTE(review): raises AttributeError when the document has no
        <body> element — confirm whether crawled pages can lack one.
        """
        return self.soup.body.get_text()
class Frontier:
    """Tracks which URLs have been visited and which are still queued.

    Public list attributes (`visted_pages`, `upcoming_pages`) are kept
    under their original names — including the historical `visted` typo —
    so existing callers keep working.
    """

    def __init__(self, seed_urls):
        # TODO: validate urls
        self.visted_pages = []
        # Copy the seeds: the original aliased the caller's list, so
        # add_page() silently mutated the caller's argument.
        self.upcoming_pages = list(seed_urls)
        # Every URL ever queued (upcoming or visited). Gives O(1)
        # membership tests instead of scanning two growing lists per URL.
        self._seen = set(self.upcoming_pages)

    def add_page(self, url):
        """Queue *url* unless it was already queued or visited."""
        if url not in self._seen:
            self._seen.add(url)
            self.upcoming_pages.append(url)

    def add_pages(self, urls):
        """Queue each URL in *urls*, skipping known ones."""
        for url in urls:
            self.add_page(url)

    def has_next_page(self):
        """True while any URL remains queued."""
        return len(self.upcoming_pages) > 0

    def get_next_page(self):
        """Pop and return the most recently queued URL (LIFO order),
        marking it visited; None when the queue is empty."""
        if not self.upcoming_pages:
            return None
        url = self.upcoming_pages.pop()
        self.visted_pages.append(url)
        return url
class Crawler:
    """Crawls the web from a set of seed URLs and builds link graphs.

    Attributes:
        webgraph_out: url -> tuple of URLs that page links to.
        webgraph_in: url -> list of URLs that link to that page.
        contents: list of (url, page text) pairs, in crawl order.
    """

    def __init__(self, seed_urls):
        self.webgraph_out = {}
        self.webgraph_in = defaultdict(list)
        self.contents = []
        self._crawl_pages(seed_urls)
        self._generate_ingraph()

    def _crawl_pages(self, seed_urls):
        """Visit every reachable page, recording its text and out-links.

        NOTE(review): nothing restricts the crawl to the seeds' domain,
        so this follows links anywhere — confirm that is intended.
        """
        front = Frontier(seed_urls)
        while front.has_next_page():
            next_url = front.get_next_page()
            html = Downloader.get_page(next_url)
            if html is None:
                # Download failed (non-200 or network error): skip it
                # instead of handing None to the HTML parser, which
                # previously crashed the crawl.
                continue
            parser = Parser(html, next_url)
            links = parser.get_links()
            front.add_pages(links)
            self.contents.append((next_url, parser.get_content()))
            self.webgraph_out[next_url] = tuple(links)

    def _generate_ingraph(self):
        """Invert webgraph_out: map each URL to the pages linking to it."""
        for url, links in self.webgraph_out.items():
            for link in links:
                self.webgraph_in[link].append(url)